diff --git a/.asf.yaml b/.asf.yaml index 83a80db5ff6353..821947fa1a0451 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -55,6 +55,7 @@ github: - P1 Regression (Doris Regression) - External Regression (Doris External Regression) - cloud_p1 (Doris Cloud Regression) + - cloud_p0 (Doris Cloud Regression) - FE UT (Doris FE UT) - BE UT (Doris BE UT) - Build Broker diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 15b12fc843eaff..2757578827c2bb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -16,3 +16,4 @@ # be/src/io/* @platoneko @gavinchou @dataroaring fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @dataroaring @CalvinKirs @morningman +**/pom.xml @CalvinKirs @morningman diff --git a/.github/workflows/comment-to-trigger-teamcity.yml b/.github/workflows/comment-to-trigger-teamcity.yml index bd3e29dbb0c8e8..5adadaba205d6e 100644 --- a/.github/workflows/comment-to-trigger-teamcity.yml +++ b/.github/workflows/comment-to-trigger-teamcity.yml @@ -150,6 +150,7 @@ jobs: set -x if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.0'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.1'" ]]; then trigger_or_skip_build \ "${{ steps.changes.outputs.changed_fe_ut }}" \ @@ -158,7 +159,7 @@ jobs: "feut" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch is not in (master, branch-2.0, branch-2.1), skip run feut" + echo "PR target branch is not in (master, branch-2.0, branch-2.1, branch-3.0), skip run feut" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -175,6 +176,7 @@ jobs: set -x if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.0'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.1'" ]]; then 
trigger_or_skip_build \ "${{ steps.changes.outputs.changed_be_ut }}" \ @@ -183,7 +185,7 @@ jobs: "beut" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch is not in (master, branch-2.0, branch-2.1), skip run beut" + echo "PR target branch is not in (master, branch-2.0, branch-2.1, branch-3.0), skip run beut" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -197,7 +199,8 @@ jobs: run: | source ./regression-test/pipeline/common/teamcity-utils.sh set -x - if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" ]]; then + if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" ]]; then trigger_or_skip_build \ "${{ steps.changes.outputs.changed_cloud_ut }}" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -205,7 +208,7 @@ jobs: "cloudut" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch is not master, skip run cloudut" + echo "PR target branch is not in (master, branch-3.0), skip run cloudut" fi - name: "Trigger or Skip compile" @@ -279,8 +282,9 @@ jobs: echo "COMMENT_TRIGGER_TYPE is buildall, trigger compile is enough, compile will trigger cloud_p0" && exit fi set -x - if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" ]]; then - echo "PR target branch in (master), need run cloud_p0" + if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" ]]; then + echo "PR target branch is in (master, branch-3.0), need run cloud_p0" trigger_or_skip_build \ "${{ steps.changes.outputs.changed_cloud_p0 }}" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -288,7 +292,7 @@ jobs: "cloud_p0" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch not in (master), skip run cloud_p0" + echo "PR target branch is not in (master, branch-3.0), skip run cloud_p0" trigger_or_skip_build \ "false" \ "${{ 
steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -305,8 +309,9 @@ jobs: echo "COMMENT_TRIGGER_TYPE is buildall, trigger compile is enough, compile will trigger cloud_p1" && exit fi set -x - if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" ]]; then - echo "PR target branch in (master), need run cloud_p1" + if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" ]]; then + echo "PR target branch is in (master, branch-3.0), need run cloud_p1" trigger_or_skip_build \ "${{ steps.changes.outputs.changed_cloud_p1 }}" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -314,7 +319,7 @@ jobs: "cloud_p1" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch not in (master), skip run cloud_p1" + echo "PR target branch is not in (master, branch-3.0), skip run cloud_p1" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -341,8 +346,9 @@ jobs: source ./regression-test/pipeline/common/teamcity-utils.sh set -x if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.0'" ]]; then - echo "PR target branch in (master, branch-2.0), need run performance" + echo "PR target branch is in (master, branch-2.0, branch-3.0), need run performance" trigger_or_skip_build \ "${{ steps.changes.outputs.changed_performance }}" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -350,7 +356,7 @@ jobs: "performance" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch not in (master, branch-2.0), skip run performance" + echo "PR target branch is not in (master, branch-2.0, branch-3.0), skip run performance" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ diff --git a/.github/workflows/labeler/scope-label-conf.yml b/.github/workflows/labeler/scope-label-conf.yml 
index 5a88d046a630ba..2afd31bc1ef7bd 100644 --- a/.github/workflows/labeler/scope-label-conf.yml +++ b/.github/workflows/labeler/scope-label-conf.yml @@ -23,3 +23,6 @@ meta-change: - fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java - gensrc/thrift/* - gensrc/proto/* + +doing: + - '**' diff --git a/.github/workflows/lfs-warning.yml b/.github/workflows/lfs-warning.yml new file mode 100644 index 00000000000000..4db79ceded9eea --- /dev/null +++ b/.github/workflows/lfs-warning.yml @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +--- +name: 'Check Large File' + +on: [push, pull_request_target] + +jobs: + large-file-checker: + name: "Check large file" + runs-on: ubuntu-latest + steps: + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v3 + with: + persist-credentials: false + submodules: recursive + + - name: "Checkout lfs-warning commit" + run: | + rm -rf ./.github/actions/lfs-warning + git clone https://github.com/ppremk/lfs-warning .github/actions/lfs-warning + pushd .github/actions/lfs-warning &>/dev/null + git checkout 4b98a8a5e6c429c23c34eee02d71553bca216425 + popd &>/dev/null + + - name: "Check Large File" + uses: ./.github/actions/lfs-warning + with: + token: ${{ secrets.GITHUB_TOKEN }} + filesizelimit: 1MB + diff --git a/README.md b/README.md index 0630ce6389b25f..c999651ddee68d 100644 --- a/README.md +++ b/README.md @@ -24,17 +24,12 @@ under the License. [![License](https://img.shields.io/badge/license-Apache%202-4EB1BA.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![GitHub release](https://img.shields.io/github/release/apache/doris.svg)](https://github.com/apache/doris/releases) [![OSSRank](https://shields.io/endpoint?url=https://ossrank.com/shield/516)](https://ossrank.com/p/516) -[![Jenkins Vec](https://img.shields.io/jenkins/tests?compact_message&jobUrl=https://ci-builds.apache.org/job/Doris/job/doris_daily_enable_vectorized&label=VectorizedEngine)](https://ci-builds.apache.org/job/Doris/job/doris_daily_enable_vectorized) -[![Total Line](https://img.shields.io/badge/Total_Line-GitHub-blue)]((https://github.com/apache/doris)) -[![Join the chat at https://gitter.im/apache-doris/Lobby](https://badges.gitter.im/apache-doris/Lobby.svg)](https://gitter.im/apache-doris/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Commit activity](https://img.shields.io/github/commit-activity/m/apache/doris)](https://github.com/apache/doris/commits/master/) [![EN 
doc](https://img.shields.io/badge/Docs-English-blue.svg)](https://doris.apache.org/docs/get-starting/quick-start) [![CN doc](https://img.shields.io/badge/文档-中文版-blue.svg)](https://doris.apache.org/zh-CN/docs/get-starting/quick-start/) - -
- [![Official Website]()](https://doris.apache.org/) [![Quick Download]()](https://doris.apache.org/download) @@ -64,10 +59,10 @@ Apache Doris is an easy-to-use, high-performance and real-time analytical databa All this makes Apache Doris an ideal tool for scenarios including report analysis, ad-hoc query, unified data warehouse, and data lake query acceleration. On Apache Doris, users can build various applications, such as user behavior analysis, AB test platform, log retrieval analysis, user portrait analysis, and order analysis. -🎉 Version 2.1.0 released now. Check out the 🔗[Release Notes](https://doris.apache.org/docs/releasenotes/release-2.1.0) here. The 2.1 verison delivers exceptional performance with 100% higher out-of-the-box queries proven by TPC-DS 1TB tests, enhanced data lake analytics that are 4-6 times speedier than Trino and Spark, solid support for semi-structured data analysis with new Variant types and suite of analytical functions, asynchronous materialized views for query acceleration, optimized real-time writing at scale, and better workload management with stability and runtime SQL resource tracking. +🎉 Version 2.1.4 released now. Check out the 🔗[Release Notes](https://doris.apache.org/docs/releasenotes/release-2.1.4) here. The 2.1 version delivers exceptional performance with 100% higher out-of-the-box queries proven by TPC-DS 1TB tests, enhanced data lake analytics that are 4-6 times speedier than Trino and Spark, solid support for semi-structured data analysis with new Variant types and suite of analytical functions, asynchronous materialized views for query acceleration, optimized real-time writing at scale, and better workload management with stability and runtime SQL resource tracking. -🎉 Version 2.0.6 is now released ! This fully evolved and stable release is ready for all users to upgrade. Check out the 🔗[Release Notes](https://doris.apache.org/docs/releasenotes/release-2.0.6) here. +🎉 Version 2.0.12 is now released ! 
This fully evolved and stable release is ready for all users to upgrade. Check out the 🔗[Release Notes](https://doris.apache.org/docs/2.0/releasenotes/release-2.0.12) here. 👀 Have a look at the 🔗[Official Website](https://doris.apache.org/) for a comprehensive list of Apache Doris's core features, blogs and user cases. diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 51cf86fe4083cf..f554ba6053a5e6 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -157,11 +157,20 @@ set(BOOST_VERSION "1.81.0") if (NOT APPLE) find_package(Boost ${BOOST_VERSION} REQUIRED COMPONENTS system date_time) + find_package(Boost ${BOOST_VERSION} REQUIRED COMPONENTS system container) else() find_package(Boost ${BOOST_VERSION} COMPONENTS system date_time) find_package(Boost ${BOOST_VERSION} COMPONENTS system container) endif() +# Set if use libazure or not +option(BUILD_AZURE "ON for building azure support for BE or OFF for not" OFF) +message(STATUS "build azure: ${BUILD_AZURE}") +if(BUILD_AZURE STREQUAL "ON") + add_definitions(-DUSE_AZURE) +endif() + + set(GPERFTOOLS_HOME "${THIRDPARTY_DIR}/gperftools") include (cmake/thirdparty.cmake) @@ -290,12 +299,11 @@ if (COMPILER_CLANG) -Wno-implicit-float-conversion -Wno-implicit-int-conversion -Wno-sign-conversion + -Wno-missing-field-initializers + -Wno-unused-const-variable -Wno-shorten-64-to-32) if (USE_LIBCPP) add_compile_options($<$:-stdlib=libc++>) - if (NOT OS_MACOSX) - add_compile_options($<$:-lstdc++>) - endif() add_definitions(-DUSE_LIBCPP) endif() endif () @@ -358,10 +366,6 @@ if (USE_UNWIND) endif() endif() -if (ENABLE_STACKTRACE) - add_definitions(-DENABLE_STACKTRACE) -endif() - if (USE_DWARF) add_compile_options(-gdwarf-5) endif() @@ -369,29 +373,29 @@ endif() # For CMAKE_BUILD_TYPE=Debug if (OS_MACOSX AND ARCH_ARM) # Using -O0 may meet ARM64 branch out of range errors when linking with tcmalloc. 
- set(CXX_FLAGS_DEBUG "${CXX_GCC_FLAGS} -Og") + set(CXX_FLAGS_DEBUG "-Og") else() - set(CXX_FLAGS_DEBUG "${CXX_GCC_FLAGS} -O0") + set(CXX_FLAGS_DEBUG "-O0") endif() # For CMAKE_BUILD_TYPE=Release # -O3: Enable all compiler optimizations # -DNDEBUG: Turn off dchecks/asserts/debug only code. -set(CXX_FLAGS_RELEASE "${CXX_GCC_FLAGS} -O3 -DNDEBUG") -set(CXX_FLAGS_ASAN "${CXX_GCC_FLAGS} -O0 -fsanitize=address -fsanitize=undefined -fno-strict-aliasing -fno-sanitize=alignment,signed-integer-overflow,float-cast-overflow -DUNDEFINED_BEHAVIOR_SANITIZER -DADDRESS_SANITIZER") -set(CXX_FLAGS_LSAN "${CXX_GCC_FLAGS} -O0 -fsanitize=leak -DLEAK_SANITIZER") +set(CXX_FLAGS_RELEASE "-O3 -DNDEBUG") +set(CXX_FLAGS_ASAN "-O0 -fsanitize=address -fsanitize=undefined -fno-strict-aliasing -fno-sanitize=alignment,signed-integer-overflow,float-cast-overflow -DUNDEFINED_BEHAVIOR_SANITIZER -DADDRESS_SANITIZER") +set(CXX_FLAGS_LSAN "-O0 -fsanitize=leak -DLEAK_SANITIZER") ## Use for BE-UT -set(CXX_FLAGS_ASAN_UT "${CXX_GCC_FLAGS} -O0 -fsanitize=address -DADDRESS_SANITIZER") +set(CXX_FLAGS_ASAN_UT "-O0 -fsanitize=address -DADDRESS_SANITIZER") # Set the flags to the undefined behavior sanitizer, also known as "ubsan" # Turn on sanitizer and debug symbols to get stack traces: -set(CXX_FLAGS_UBSAN "${CXX_GCC_FLAGS} -O0 -fno-wrapv -mcmodel=medium -fsanitize=undefined -DUNDEFINED_BEHAVIOR_SANITIZER") +set(CXX_FLAGS_UBSAN "-O0 -fno-wrapv -mcmodel=medium -fsanitize=undefined -DUNDEFINED_BEHAVIOR_SANITIZER") # Set the flags to the thread sanitizer, also known as "tsan" # Turn on sanitizer and debug symbols to get stack traces: # Use -Wno-builtin-declaration-mismatch to mute warnings like "new declaration ‘__tsan_atomic16 __tsan_atomic16_fetch_nand(..." # If use -O0 to compile, BE will stack overflow when start. 
https://github.com/apache/doris/issues/8868 -set(CXX_FLAGS_TSAN "${CXX_GCC_FLAGS} -O1 -fsanitize=thread -DTHREAD_SANITIZER -Wno-missing-declarations") +set(CXX_FLAGS_TSAN "-O1 -fsanitize=thread -DTHREAD_SANITIZER -Wno-missing-declarations") # Set compile flags based on the build type. if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") @@ -513,6 +517,7 @@ find_package(absl) # add it here first. set(COMMON_THIRDPARTY Boost::date_time + Boost::container ${COMMON_THIRDPARTY} ) @@ -555,7 +560,6 @@ endif() if (OS_MACOSX) set(COMMON_THIRDPARTY ${COMMON_THIRDPARTY} - Boost::container bfd iberty intl @@ -599,9 +603,11 @@ if (NOT OS_MACOSX) ${DORIS_DEPENDENCIES} -static-libstdc++ -static-libgcc - -lstdc++fs -lresolv ) + if (NOT (USE_LIBCPP AND COMPILER_CLANG)) + set(DORIS_LINK_LIBS ${DORIS_LINK_LIBS} -lstdc++fs) + endif() else() set(DORIS_LINK_LIBS ${DORIS_LINK_LIBS} diff --git a/be/cmake/thirdparty.cmake b/be/cmake/thirdparty.cmake index 502a22bbdf4532..19f2a00012aafb 100644 --- a/be/cmake/thirdparty.cmake +++ b/be/cmake/thirdparty.cmake @@ -141,10 +141,12 @@ if (NOT OS_MACOSX) add_thirdparty(aws-s2n LIBNAME "lib/libs2n.a") endif() -add_thirdparty(azure-core) -add_thirdparty(azure-identity) -add_thirdparty(azure-storage-blobs) -add_thirdparty(azure-storage-common) +if(BUILD_AZURE STREQUAL "ON") + add_thirdparty(azure-core) + add_thirdparty(azure-identity) + add_thirdparty(azure-storage-blobs) + add_thirdparty(azure-storage-common) +endif() add_thirdparty(minizip LIB64) add_thirdparty(simdjson LIB64) diff --git a/be/src/agent/be_exec_version_manager.h b/be/src/agent/be_exec_version_manager.h index ec6ddf497ec084..a55e26f7ba4493 100644 --- a/be/src/agent/be_exec_version_manager.h +++ b/be/src/agent/be_exec_version_manager.h @@ -80,8 +80,9 @@ class BeExecVersionManager { * b. clear old version of version 3->4 * c. change FunctionIsIPAddressInRange from AlwaysNotNullable to DependOnArguments * d. change some agg function nullable property: PR #37215 + * e. 
change variant serde to fix PR #38413 */ -constexpr inline int BeExecVersionManager::max_be_exec_version = 5; +constexpr inline int BeExecVersionManager::max_be_exec_version = 6; constexpr inline int BeExecVersionManager::min_be_exec_version = 0; /// functional @@ -89,5 +90,6 @@ constexpr inline int BITMAP_SERDE = 3; constexpr inline int USE_NEW_SERDE = 4; // release on DORIS version 2.1 constexpr inline int OLD_WAL_SERDE = 3; // use to solve compatibility issues, see pr #32299 constexpr inline int AGG_FUNCTION_NULLABLE = 5; // change some agg nullable property: PR #37215 +constexpr inline int VARIANT_SERDE = 6; // change variant serde to fix PR #38413 } // namespace doris diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 0e851fba17a035..7bbd602f571ede 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -103,6 +104,10 @@ std::unordered_map> s_task_signatur std::atomic_ulong s_report_version(time(nullptr) * 10000); +void increase_report_version() { + s_report_version.fetch_add(1, std::memory_order_relaxed); +} + // FIXME(plat1ko): Paired register and remove task info bool register_task_info(const TTaskType::type task_type, int64_t signature) { if (task_type == TTaskType::type::PUSH_STORAGE_POLICY || @@ -213,7 +218,7 @@ void alter_tablet(StorageEngine& engine, const TAgentTaskRequest& agent_task_req } if (status.ok()) { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); } // Return result to fe @@ -289,7 +294,7 @@ void alter_cloud_tablet(CloudStorageEngine& engine, const TAgentTaskRequest& age } if (status.ok()) { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); } // Return result to fe @@ -1384,9 +1389,12 @@ void update_s3_resource(const TStorageResource& param, io::RemoteFileSystemSPtr auto client = 
static_cast(existed_fs.get())->client_holder(); auto new_s3_conf = S3Conf::get_s3_conf(param.s3_storage_param); S3ClientConf conf { + .endpoint {}, + .region {}, .ak = std::move(new_s3_conf.client_conf.ak), .sk = std::move(new_s3_conf.client_conf.sk), .token = std::move(new_s3_conf.client_conf.token), + .bucket {}, .provider = new_s3_conf.client_conf.provider, }; st = client->reset(conf); @@ -1530,7 +1538,7 @@ void create_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) .tag("tablet_id", create_tablet_req.tablet_id) .error(status); } else { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); // get path hash of the created tablet TabletSharedPtr tablet; { @@ -1625,7 +1633,7 @@ void push_callback(StorageEngine& engine, const TAgentTaskRequest& req) { .tag("signature", req.signature) .tag("tablet_id", push_req.tablet_id) .tag("push_type", push_req.push_type); - ++s_report_version; + increase_report_version(); finish_task_request.__set_finish_tablet_infos(tablet_infos); } else { LOG_WARNING("failed to execute push task") @@ -1671,7 +1679,7 @@ void cloud_push_callback(CloudStorageEngine& engine, const TAgentTaskRequest& re .tag("signature", req.signature) .tag("tablet_id", push_req.tablet_id) .tag("push_type", push_req.push_type); - ++s_report_version; + increase_report_version(); auto& tablet_info = finish_task_request.finish_tablet_infos.emplace_back(); // Just need tablet_id tablet_info.tablet_id = push_req.tablet_id; @@ -1789,7 +1797,7 @@ void PublishVersionWorkerPool::publish_version_callback(const TAgentTaskRequest& if (tablet->exceed_version_limit(config::max_tablet_version_num * 2 / 3) && published_count % 20 == 0) { auto st = _engine.submit_compaction_task( - tablet, CompactionType::CUMULATIVE_COMPACTION, true); + tablet, CompactionType::CUMULATIVE_COMPACTION, true, false); if (!st.ok()) [[unlikely]] { LOG(WARNING) << "trigger compaction failed, tablet_id=" << tablet_id << ", published=" << 
published_count << " : " << st; @@ -1968,6 +1976,10 @@ void clone_callback(StorageEngine& engine, const TMasterInfo& master_info, LOG_INFO("successfully clone tablet") .tag("signature", req.signature) .tag("tablet_id", clone_req.tablet_id); + if (engine_task.is_new_tablet()) { + increase_report_version(); + finish_task_request.__set_report_version(s_report_version); + } finish_task_request.__set_finish_tablet_infos(tablet_infos); } diff --git a/be/src/agent/workload_group_listener.cpp b/be/src/agent/workload_group_listener.cpp index 61af4543196ecc..f0f57869f2545a 100644 --- a/be/src/agent/workload_group_listener.cpp +++ b/be/src/agent/workload_group_listener.cpp @@ -61,6 +61,9 @@ void WorkloadGroupListener::handle_topic_info(const std::vector& topi // 4 create and update task scheduler wg->upsert_task_scheduler(&workload_group_info, _exec_env); + // 5 upsert io throttle + wg->upsert_scan_io_throttle(&workload_group_info); + LOG(INFO) << "[topic_publish_wg]update workload group finish, wg info=" << wg->debug_string() << ", enable_cpu_hard_limit=" << (_exec_env->workload_group_mgr()->enable_cpu_hard_limit() ? 
"true" : "false") diff --git a/be/src/cloud/cloud_backend_service.cpp b/be/src/cloud/cloud_backend_service.cpp index f576b60045d54d..d91e9e416b81a1 100644 --- a/be/src/cloud/cloud_backend_service.cpp +++ b/be/src/cloud/cloud_backend_service.cpp @@ -29,6 +29,8 @@ #include "common/status.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" +#include "runtime/stream_load/stream_load_context.h" +#include "runtime/stream_load/stream_load_recorder.h" #include "util/brpc_client_cache.h" // BrpcClientCache #include "util/thrift_server.h" @@ -186,4 +188,10 @@ void CloudBackendService::check_warm_up_cache_async(TCheckWarmUpCacheAsyncRespon response.status = t_status; } +void CloudBackendService::get_stream_load_record(TStreamLoadRecordResult& result, + int64_t last_stream_record_time) { + BaseBackendService::get_stream_load_record(result, last_stream_record_time, + _engine.get_stream_load_recorder()); +} + } // namespace doris diff --git a/be/src/cloud/cloud_backend_service.h b/be/src/cloud/cloud_backend_service.h index 88f0099fe73f09..358cb4d1f0b2ec 100644 --- a/be/src/cloud/cloud_backend_service.h +++ b/be/src/cloud/cloud_backend_service.h @@ -53,6 +53,9 @@ class CloudBackendService final : public BaseBackendService { void check_warm_up_cache_async(TCheckWarmUpCacheAsyncResponse& response, const TCheckWarmUpCacheAsyncRequest& request) override; + void get_stream_load_record(TStreamLoadRecordResult& result, + int64_t last_stream_record_time) override; + private: CloudStorageEngine& _engine; }; diff --git a/be/src/cloud/cloud_cumulative_compaction_policy.cpp b/be/src/cloud/cloud_cumulative_compaction_policy.cpp index fc56f971cad522..b8c4ee20cb2077 100644 --- a/be/src/cloud/cloud_cumulative_compaction_policy.cpp +++ b/be/src/cloud/cloud_cumulative_compaction_policy.cpp @@ -268,6 +268,10 @@ int32_t CloudTimeSeriesCumulativeCompactionPolicy::pick_input_rowsets( continue; } return transient_size; + } else if ( + *compaction_score >= 
+ config::compaction_max_rowset_count) { // If the number of rowsets is too large: FDB_ERROR_CODE_TXN_TOO_LARGE + return transient_size; } } diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index b1b455d2007e1f..12cfbb7c0035fd 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -38,6 +38,7 @@ #include "cloud/cloud_warm_up_manager.h" #include "cloud/config.h" #include "io/cache/block_file_cache_downloader.h" +#include "io/cache/block_file_cache_factory.h" #include "io/cache/file_cache_common.h" #include "io/fs/file_system.h" #include "io/fs/hdfs_file_system.h" @@ -48,6 +49,7 @@ #include "olap/memtable_flush_executor.h" #include "olap/storage_policy.h" #include "runtime/memory/cache_manager.h" +#include "util/parse_util.h" namespace doris { @@ -180,14 +182,21 @@ Status CloudStorageEngine::open() { // TODO(plat1ko): DeleteBitmapTxnManager _memtable_flush_executor = std::make_unique(); - // TODO(plat1ko): Use file cache disks number? - _memtable_flush_executor->init(1); + // Use file cache disks number + _memtable_flush_executor->init(io::FileCacheFactory::instance()->get_cache_instance_size()); _calc_delete_bitmap_executor = std::make_unique(); _calc_delete_bitmap_executor->init(); - _txn_delete_bitmap_cache = - std::make_unique(config::delete_bitmap_agg_cache_capacity); + // The default cache is set to 100MB, use memory limit to dynamic adjustment + bool is_percent = false; + int64_t delete_bitmap_agg_cache_cache_limit = + ParseUtil::parse_mem_spec(config::delete_bitmap_dynamic_agg_cache_limit, + MemInfo::mem_limit(), MemInfo::physical_mem(), &is_percent); + _txn_delete_bitmap_cache = std::make_unique( + delete_bitmap_agg_cache_cache_limit > config::delete_bitmap_agg_cache_capacity + ? 
delete_bitmap_agg_cache_cache_limit + : config::delete_bitmap_agg_cache_capacity); RETURN_IF_ERROR(_txn_delete_bitmap_cache->init()); _file_cache_block_downloader = std::make_unique(*this); @@ -196,6 +205,10 @@ Status CloudStorageEngine::open() { _tablet_hotspot = std::make_unique(); + RETURN_NOT_OK_STATUS_WITH_WARN( + init_stream_load_recorder(ExecEnv::GetInstance()->store_paths()[0].path), + "init StreamLoadRecorder failed"); + return ThreadPoolBuilder("SyncLoadForTabletsThreadPool") .set_max_threads(config::sync_load_for_tablets_thread) .set_min_threads(config::sync_load_for_tablets_thread) diff --git a/be/src/cloud/cloud_stream_load_executor.cpp b/be/src/cloud/cloud_stream_load_executor.cpp index a87f37a5188600..1b8167c96ebd48 100644 --- a/be/src/cloud/cloud_stream_load_executor.cpp +++ b/be/src/cloud/cloud_stream_load_executor.cpp @@ -60,7 +60,10 @@ Status CloudStreamLoadExecutor::operate_txn_2pc(StreamLoadContext* ctx) { Status st = Status::InternalError("impossible branch reached, " + op_info); if (ctx->txn_operation.compare("commit") == 0) { - if (topt == TxnOpParamType::WITH_TXN_ID) { + if (!config::enable_stream_load_commit_txn_on_be) { + VLOG_DEBUG << "2pc commit stream load txn with FE support: " << op_info; + st = StreamLoadExecutor::operate_txn_2pc(ctx); + } else if (topt == TxnOpParamType::WITH_TXN_ID) { VLOG_DEBUG << "2pc commit stream load txn directly: " << op_info; st = _exec_env->storage_engine().to_cloud().meta_mgr().commit_txn(*ctx, true); } else if (topt == TxnOpParamType::WITH_LABEL) { @@ -93,12 +96,9 @@ Status CloudStreamLoadExecutor::operate_txn_2pc(StreamLoadContext* ctx) { } Status CloudStreamLoadExecutor::commit_txn(StreamLoadContext* ctx) { - if (ctx->load_type == TLoadType::ROUTINE_LOAD) { - return StreamLoadExecutor::commit_txn(ctx); - } - // forward to fe to excute commit transaction for MoW table - if (ctx->is_mow_table()) { + if (ctx->is_mow_table() || !config::enable_stream_load_commit_txn_on_be || + ctx->load_type == 
TLoadType::ROUTINE_LOAD) { Status st; int retry_times = 0; while (retry_times < config::mow_stream_load_commit_retry_times) { diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index d2596d8a7d2d0c..17ec1fe22b0d85 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -26,7 +26,9 @@ #include #include +#include #include +#include #include "cloud/cloud_meta_mgr.h" #include "cloud/cloud_storage_engine.h" @@ -42,8 +44,10 @@ #include "olap/rowset/rowset_writer.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/storage_policy.h" +#include "olap/tablet_schema.h" #include "olap/txn_manager.h" #include "util/debug_points.h" +#include "vec/common/schema_util.h" namespace doris { using namespace ErrorCode; @@ -131,6 +135,19 @@ Status CloudTablet::sync_rowsets(int64_t query_version, bool warmup_delta_data) return st; } +TabletSchemaSPtr CloudTablet::merged_tablet_schema() const { + std::shared_lock rdlock(_meta_lock); + TabletSchemaSPtr target_schema; + std::vector schemas; + for (const auto& [_, rowset] : _rs_version_map) { + schemas.push_back(rowset->tablet_schema()); + } + // get the max version schema and merge all schema + static_cast( + vectorized::schema_util::get_least_common_schema(schemas, nullptr, target_schema)); + return target_schema; +} + // Sync tablet meta and all rowset meta if not running. // This could happen when BE didn't finish schema change job and another BE committed this schema change job. // It should be a quite rare situation. 
@@ -227,6 +244,7 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ { .expiration_time = expiration_time, }, + .download_done {}, }); } #endif @@ -463,6 +481,7 @@ int64_t CloudTablet::get_cloud_base_compaction_score() const { if (_tablet_meta->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY) { bool has_delete = false; int64_t point = cumulative_layer_point(); + std::shared_lock rlock(_meta_lock); for (const auto& rs_meta : _tablet_meta->all_rs_metas()) { if (rs_meta->start_version() >= point) { continue; diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index ca05759cdbf83e..10ff1835e6c830 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -147,18 +147,6 @@ class CloudTablet final : public BaseTablet { std::vector pick_candidate_rowsets_to_base_compaction(); - void traverse_rowsets(std::function visitor, - bool include_stale = false) { - std::shared_lock rlock(_meta_lock); - for (auto& [v, rs] : _rs_version_map) { - visitor(rs); - } - if (!include_stale) return; - for (auto& [v, rs] : _stale_rs_version_map) { - visitor(rs); - } - } - inline Version max_version() const { std::shared_lock rdlock(_meta_lock); return _tablet_meta->max_version(); @@ -206,6 +194,9 @@ class CloudTablet final : public BaseTablet { int64_t last_cumu_compaction_success_time_ms = 0; int64_t last_cumu_no_suitable_version_ms = 0; + // Return merged extended schema + TabletSchemaSPtr merged_tablet_schema() const override; + private: // FIXME(plat1ko): No need to record base size if rowsets are ordered by version void update_base_size(const Rowset& rs); diff --git a/be/src/cloud/cloud_tablet_hotspot.cpp b/be/src/cloud/cloud_tablet_hotspot.cpp index ae8b3a54d2b6cf..dd197268646fbc 100644 --- a/be/src/cloud/cloud_tablet_hotspot.cpp +++ b/be/src/cloud/cloud_tablet_hotspot.cpp @@ -89,20 +89,20 @@ void TabletHotspot::get_top_n_hot_partition(std::vector* hot_t hot_partition.qpd = std::max(hot_partition.qpd, counter->qpd()); 
hot_partition.qpw = std::max(hot_partition.qpw, counter->qpw()); hot_partition.last_access_time = - std::max(hot_partition.last_access_time, - std::chrono::duration_cast( - counter->last_access_time.time_since_epoch()) - .count()); + std::max(hot_partition.last_access_time, + std::chrono::duration_cast( + counter->last_access_time.time_since_epoch()) + .count()); } else if (counter->qpw() != 0) { auto& hot_partition = week_hot_partitions[std::make_pair( counter->table_id, counter->index_id)][counter->partition_id]; hot_partition.qpd = 0; hot_partition.qpw = std::max(hot_partition.qpw, counter->qpw()); hot_partition.last_access_time = - std::max(hot_partition.last_access_time, - std::chrono::duration_cast( - counter->last_access_time.time_since_epoch()) - .count()); + std::max(hot_partition.last_access_time, + std::chrono::duration_cast( + counter->last_access_time.time_since_epoch()) + .count()); } } }); diff --git a/be/src/cloud/cloud_tablets_channel.cpp b/be/src/cloud/cloud_tablets_channel.cpp index e063ab68116bb2..85b8e3ea33a865 100644 --- a/be/src/cloud/cloud_tablets_channel.cpp +++ b/be/src/cloud/cloud_tablets_channel.cpp @@ -59,15 +59,20 @@ Status CloudTabletsChannel::add_batch(const PTabletWriterAddBlockRequest& reques _build_tablet_to_rowidxs(request, &tablet_to_rowidxs); std::unordered_set partition_ids; - for (auto& [tablet_id, _] : tablet_to_rowidxs) { - auto tablet_writer_it = _tablet_writers.find(tablet_id); - if (tablet_writer_it == _tablet_writers.end()) { - return Status::InternalError("unknown tablet to append data, tablet={}", tablet_id); + { + // add_batch may concurrency with inc_open but not under _lock. + // so need to protect it with _tablet_writers_lock. 
+ std::lock_guard l(_tablet_writers_lock); + for (auto& [tablet_id, _] : tablet_to_rowidxs) { + auto tablet_writer_it = _tablet_writers.find(tablet_id); + if (tablet_writer_it == _tablet_writers.end()) { + return Status::InternalError("unknown tablet to append data, tablet={}", tablet_id); + } + partition_ids.insert(tablet_writer_it->second->partition_id()); + } + if (!partition_ids.empty()) { + RETURN_IF_ERROR(_init_writers_by_partition_ids(partition_ids)); } - partition_ids.insert(tablet_writer_it->second->partition_id()); - } - if (!partition_ids.empty()) { - RETURN_IF_ERROR(_init_writers_by_partition_ids(partition_ids)); } return _write_block_data(request, cur_seq, tablet_to_rowidxs, response); @@ -124,7 +129,7 @@ Status CloudTabletsChannel::close(LoadChannel* parent, const PTabletWriterAddBlo _state = kFinished; // All senders are closed - // 1. close all delta writers + // 1. close all delta writers. under _lock. std::vector writers_to_commit; writers_to_commit.reserve(_tablet_writers.size()); bool success = true; diff --git a/be/src/cloud/config.cpp b/be/src/cloud/config.cpp index 80522759b84b44..82c466120e94fb 100644 --- a/be/src/cloud/config.cpp +++ b/be/src/cloud/config.cpp @@ -35,7 +35,7 @@ DEFINE_Int64(tablet_cache_shards, "16"); DEFINE_mInt32(tablet_sync_interval_s, "1800"); DEFINE_mInt64(min_compaction_failure_interval_ms, "5000"); -DEFINE_mInt64(base_compaction_freeze_interval_s, "86400"); +DEFINE_mInt64(base_compaction_freeze_interval_s, "7200"); DEFINE_mInt64(cu_compaction_freeze_interval_s, "1200"); DEFINE_mInt64(cumu_compaction_interval_s, "1800"); @@ -48,6 +48,7 @@ DEFINE_mDouble(cumu_compaction_thread_num_factor, "0.5"); DEFINE_mInt32(check_auto_compaction_interval_seconds, "5"); DEFINE_mInt32(max_base_compaction_task_num_per_disk, "2"); DEFINE_mBool(prioritize_query_perf_in_compaction, "false"); +DEFINE_mInt32(compaction_max_rowset_count, "10000"); DEFINE_mInt32(refresh_s3_info_interval_s, "60"); DEFINE_mInt32(vacuum_stale_rowsets_interval_s, 
"300"); diff --git a/be/src/cloud/config.h b/be/src/cloud/config.h index bf041ba0fa6fc5..02e7014801e566 100644 --- a/be/src/cloud/config.h +++ b/be/src/cloud/config.h @@ -79,6 +79,7 @@ DECLARE_mDouble(cumu_compaction_thread_num_factor); DECLARE_mInt32(check_auto_compaction_interval_seconds); DECLARE_mInt32(max_base_compaction_task_num_per_disk); DECLARE_mBool(prioritize_query_perf_in_compaction); +DECLARE_mInt32(compaction_max_rowset_count); // CloudStorageEngine config DECLARE_mInt32(refresh_s3_info_interval_s); diff --git a/be/src/clucene b/be/src/clucene index 5db9db68e448b8..fdbf2204031128 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 5db9db68e448b8ccfd360d02666bbac44e6f8d1a +Subproject commit fdbf2204031128b2bd8505fc73c06403b7c1a815 diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 28c19df70bcb80..a68b4c60c01ad6 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -135,9 +135,9 @@ DEFINE_mBool(enable_query_memory_overcommit, "true"); DEFINE_mBool(disable_memory_gc, "false"); -DEFINE_mBool(enable_stacktrace_in_allocator_check_failed, "false"); +DEFINE_mBool(enable_stacktrace, "true"); -DEFINE_mInt64(large_memory_check_bytes, "2147483648"); +DEFINE_mInt64(stacktrace_in_alloc_large_memory_bytes, "2147483648"); DEFINE_mBool(enable_memory_orphan_check, "false"); @@ -248,6 +248,7 @@ DEFINE_Validator(doris_scanner_thread_pool_thread_num, [](const int config) -> b } return true; }); +DEFINE_Int32(doris_scanner_min_thread_pool_thread_num, "8"); DEFINE_Int32(remote_split_source_batch_size, "10240"); DEFINE_Int32(doris_max_remote_scanner_thread_pool_thread_num, "-1"); // number of olap scanner thread pool queue size @@ -482,6 +483,8 @@ DEFINE_mInt32(migration_remaining_size_threshold_mb, "10"); // If the task runs longer than this time, the task will be terminated, in seconds. 
// timeout = std::max(migration_task_timeout_secs, tablet size / 1MB/s) DEFINE_mInt32(migration_task_timeout_secs, "300"); +// timeout for try_lock migration lock +DEFINE_Int64(migration_lock_timeout_ms, "1000"); // Port to start debug webserver on DEFINE_Int32(webserver_port, "8040"); @@ -539,6 +542,8 @@ DEFINE_mInt32(stream_load_record_batch_size, "50"); DEFINE_Int32(stream_load_record_expire_time_secs, "28800"); // time interval to clean expired stream load records DEFINE_mInt64(clean_stream_load_record_interval_secs, "1800"); +// enable stream load commit txn on BE directly, bypassing FE. Only for cloud. +DEFINE_mBool(enable_stream_load_commit_txn_on_be, "false"); // The buffer size to store stream table function schema info DEFINE_Int64(stream_tvf_buffer_size, "1048576"); // 1MB @@ -594,8 +599,7 @@ DEFINE_mInt32(memory_gc_sleep_time_ms, "1000"); // Sleep time in milliseconds between memtbale flush mgr refresh iterations DEFINE_mInt64(memtable_mem_tracker_refresh_interval_ms, "5"); -// Sleep time in milliseconds between refresh iterations of workload group memory statistics -DEFINE_mInt64(wg_mem_refresh_interval_ms, "50"); +DEFINE_mInt64(wg_weighted_memory_ratio_refresh_interval_ms, "50"); // percent of (active memtables size / all memtables size) when reach hard limit DEFINE_mInt32(memtable_hard_limit_active_percent, "50"); @@ -628,6 +632,8 @@ DEFINE_Int32(load_process_safe_mem_permit_percent, "5"); // result buffer cancelled time (unit: second) DEFINE_mInt32(result_buffer_cancelled_interval_time, "300"); +DEFINE_mInt32(arrow_flight_result_sink_buffer_size_rows, "32768"); + // the increased frequency of priority for remaining tasks in BlockingPriorityQueue DEFINE_mInt32(priority_queue_remaining_tasks_increased_frequency, "512"); @@ -930,7 +936,8 @@ DEFINE_mInt32(cold_data_compaction_interval_sec, "1800"); DEFINE_String(tmp_file_dir, "tmp"); -DEFINE_Int32(s3_transfer_executor_pool_size, "2"); +DEFINE_Int32(min_s3_file_system_thread_num, "16"); 
+DEFINE_Int32(max_s3_file_system_thread_num, "64"); DEFINE_Bool(enable_time_lut, "true"); DEFINE_mBool(enable_simdjson_reader, "true"); @@ -1001,6 +1008,7 @@ DEFINE_Bool(enable_index_apply_preds_except_leafnode_of_andnode, "true"); DEFINE_mBool(variant_enable_flatten_nested, "false"); DEFINE_mDouble(variant_ratio_of_defaults_as_sparse_column, "1"); DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "1000"); +DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false"); // block file cache DEFINE_Bool(enable_file_cache, "false"); @@ -1062,8 +1070,6 @@ DEFINE_mInt64(max_tablet_io_errors, "-1"); DEFINE_Int32(tablet_path_check_interval_seconds, "-1"); DEFINE_mInt32(tablet_path_check_batch_size, "1000"); -// Page size of row column, default 4KB -DEFINE_mInt64(row_column_page_size, "4096"); // it must be larger than or equal to 5MB DEFINE_mInt64(s3_write_buffer_size, "5242880"); // Log interval when doing s3 upload task @@ -1234,6 +1240,13 @@ DEFINE_mInt32(s3_read_base_wait_time_ms, "100"); DEFINE_mInt32(s3_read_max_wait_time_ms, "800"); DEFINE_mBool(enable_s3_rate_limiter, "false"); +DEFINE_mInt64(s3_get_bucket_tokens, "1000000000000000000"); +DEFINE_mInt64(s3_get_token_per_second, "1000000000000000000"); +DEFINE_mInt64(s3_get_token_limit, "0"); + +DEFINE_mInt64(s3_put_bucket_tokens, "1000000000000000000"); +DEFINE_mInt64(s3_put_token_per_second, "1000000000000000000"); +DEFINE_mInt64(s3_put_token_limit, "0"); DEFINE_String(trino_connector_plugin_dir, "${DORIS_HOME}/connectors"); @@ -1331,6 +1344,8 @@ DEFINE_mBool(ignore_not_found_file_in_external_table, "true"); DEFINE_mBool(enable_hdfs_mem_limiter, "true"); +DEFINE_mInt16(topn_agg_limit_multiplier, "2"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index dd44f56fd300a9..3c43ed66593e51 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -183,13 +183,14 @@ DECLARE_mBool(enable_query_memory_overcommit); // default gc strategy 
is conservative, if you want to exclude the interference of gc, let it be true DECLARE_mBool(disable_memory_gc); -// Allocator check failed log stacktrace if not catch exception -DECLARE_mBool(enable_stacktrace_in_allocator_check_failed); +// if false, turn off all stacktrace +DECLARE_mBool(enable_stacktrace); -// malloc or new large memory larger than large_memory_check_bytes, default 2G, -// will print a warning containing the stacktrace, but not prevent memory alloc. -// If is -1, disable large memory check. -DECLARE_mInt64(large_memory_check_bytes); +// when alloc memory larger than stacktrace_in_alloc_large_memory_bytes, default 2G, +// if alloc successful, will print a warning with stacktrace, but not prevent memory alloc. +// if alloc failed using Doris Allocator, will print stacktrace in error log. +// if is -1, disable print stacktrace when alloc large memory. +DECLARE_mInt64(stacktrace_in_alloc_large_memory_bytes); // default is true. if any memory tracking in Orphan mem tracker will report error. DECLARE_mBool(enable_memory_orphan_check); @@ -296,6 +297,7 @@ DECLARE_mInt64(doris_blocking_priority_queue_wait_timeout_ms); // number of scanner thread pool size for olap table // and the min thread num of remote scanner thread pool DECLARE_mInt32(doris_scanner_thread_pool_thread_num); +DECLARE_mInt32(doris_scanner_min_thread_pool_thread_num); // number of batch size to fetch the remote split source DECLARE_mInt32(remote_split_source_batch_size); // max number of remote scanner thread pool size @@ -531,6 +533,8 @@ DECLARE_mInt32(migration_remaining_size_threshold_mb); // If the task runs longer than this time, the task will be terminated, in seconds. 
// timeout = std::max(migration_task_timeout_secs, tablet size / 1MB/s) DECLARE_mInt32(migration_task_timeout_secs); +// timeout for try_lock migration lock +DECLARE_Int64(migration_lock_timeout_ms); // Port to start debug webserver on DECLARE_Int32(webserver_port); @@ -592,6 +596,8 @@ DECLARE_mInt32(stream_load_record_batch_size); DECLARE_Int32(stream_load_record_expire_time_secs); // time interval to clean expired stream load records DECLARE_mInt64(clean_stream_load_record_interval_secs); +// enable stream load commit txn on BE directly, bypassing FE. Only for cloud. +DECLARE_mBool(enable_stream_load_commit_txn_on_be); // The buffer size to store stream table function schema info DECLARE_Int64(stream_tvf_buffer_size); @@ -650,8 +656,8 @@ DECLARE_mInt32(memory_gc_sleep_time_ms); // Sleep time in milliseconds between memtbale flush mgr memory refresh iterations DECLARE_mInt64(memtable_mem_tracker_refresh_interval_ms); -// Sleep time in milliseconds between refresh iterations of workload group memory statistics -DECLARE_mInt64(wg_mem_refresh_interval_ms); +// Sleep time in milliseconds between refresh iterations of workload group weighted memory ratio +DECLARE_mInt64(wg_weighted_memory_ratio_refresh_interval_ms); // percent of (active memtables size / all memtables size) when reach hard limit DECLARE_mInt32(memtable_hard_limit_active_percent); @@ -684,6 +690,9 @@ DECLARE_Int32(load_process_safe_mem_permit_percent); // result buffer cancelled time (unit: second) DECLARE_mInt32(result_buffer_cancelled_interval_time); +// arrow flight result sink buffer rows size, default 4096 * 8 +DECLARE_mInt32(arrow_flight_result_sink_buffer_size_rows); + // the increased frequency of priority for remaining tasks in BlockingPriorityQueue DECLARE_mInt32(priority_queue_remaining_tasks_increased_frequency); @@ -984,7 +993,8 @@ DECLARE_mInt32(confirm_unused_remote_files_interval_sec); DECLARE_Int32(cold_data_compaction_thread_num); DECLARE_mInt32(cold_data_compaction_interval_sec); 
-DECLARE_Int32(s3_transfer_executor_pool_size); +DECLARE_Int32(min_s3_file_system_thread_num); +DECLARE_Int32(max_s3_file_system_thread_num); DECLARE_Bool(enable_time_lut); DECLARE_mBool(enable_simdjson_reader); @@ -1111,8 +1121,6 @@ DECLARE_mInt64(max_tablet_io_errors); DECLARE_Int32(tablet_path_check_interval_seconds); DECLARE_mInt32(tablet_path_check_batch_size); -// Page size of row column, default 4KB -DECLARE_mInt64(row_column_page_size); // it must be larger than or equal to 5MB DECLARE_mInt64(s3_write_buffer_size); // Log interval when doing s3 upload task @@ -1191,6 +1199,8 @@ DECLARE_mDouble(variant_ratio_of_defaults_as_sparse_column); // Threshold to estimate a column is sparsed // Notice: TEST ONLY DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column); +// Treat invalid json format str as string, instead of throwing exception if false +DECLARE_mBool(variant_throw_exeception_on_invalid_json); DECLARE_mBool(enable_merge_on_write_correctness_check); // USED FOR DEBUGING @@ -1315,6 +1325,13 @@ DECLARE_Int32(spill_io_thread_pool_queue_size); DECLARE_mBool(check_segment_when_build_rowset_meta); DECLARE_mBool(enable_s3_rate_limiter); +DECLARE_mInt64(s3_get_bucket_tokens); +DECLARE_mInt64(s3_get_token_per_second); +DECLARE_mInt64(s3_get_token_limit); + +DECLARE_mInt64(s3_put_bucket_tokens); +DECLARE_mInt64(s3_put_token_per_second); +DECLARE_mInt64(s3_put_token_limit); // max s3 client retry times DECLARE_mInt32(max_s3_client_retry); // When meet s3 429 error, the "get" request will @@ -1421,6 +1438,10 @@ DECLARE_mBool(ignore_not_found_file_in_external_table); DECLARE_mBool(enable_hdfs_mem_limiter); +// Define how many percent data in hashtable bigger than limit +// we should do agg limit opt +DECLARE_mInt16(topn_agg_limit_multiplier); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index c97904f5677b44..d8245f4045ce81 100644 --- a/be/src/common/daemon.cpp +++ 
b/be/src/common/daemon.cpp @@ -228,6 +228,7 @@ void Daemon::memory_maintenance_thread() { DorisMetrics::instance()->system_metrics()->update_allocator_metrics(); } #endif + MemInfo::refresh_memory_bvar(); // Update and print memory stat when the memory changes by 256M. if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { @@ -387,16 +388,34 @@ void Daemon::je_purge_dirty_pages_thread() const { if (_stop_background_threads_latch.count() == 0) { break; } + if (config::disable_memory_gc) { + continue; + } doris::MemInfo::je_purge_all_arena_dirty_pages(); doris::MemInfo::je_purge_dirty_pages_notify.store(false, std::memory_order_relaxed); } while (true); } -void Daemon::wg_mem_used_refresh_thread() { - // Refresh memory usage and limit of workload groups +void Daemon::cache_prune_stale_thread() { + int32_t interval = config::cache_periodic_prune_stale_sweep_sec; + while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))) { + if (interval <= 0) { + LOG(WARNING) << "config of cache clean interval is illegal: [" << interval + << "], force set to 3600 "; + interval = 3600; + } + if (config::disable_memory_gc) { + continue; + } + CacheManager::instance()->for_each_cache_prune_stale(); + } +} + +void Daemon::wg_weighted_memory_ratio_refresh_thread() { + // Refresh weighted memory ratio of workload groups while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(config::wg_mem_refresh_interval_ms))) { - doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_memory_info(); + std::chrono::milliseconds(config::wg_weighted_memory_ratio_refresh_interval_ms))) { + doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); } } @@ -435,13 +454,19 @@ void Daemon::start() { st = Thread::create( "Daemon", "je_purge_dirty_pages_thread", [this]() { this->je_purge_dirty_pages_thread(); }, &_threads.emplace_back()); + CHECK(st.ok()) << st; + st = Thread::create( + "Daemon", 
"cache_prune_stale_thread", [this]() { this->cache_prune_stale_thread(); }, + &_threads.emplace_back()); + CHECK(st.ok()) << st; st = Thread::create( "Daemon", "query_runtime_statistics_thread", [this]() { this->report_runtime_query_statistics_thread(); }, &_threads.emplace_back()); CHECK(st.ok()) << st; st = Thread::create( - "Daemon", "wg_mem_refresh_thread", [this]() { this->wg_mem_used_refresh_thread(); }, + "Daemon", "wg_weighted_memory_ratio_refresh_thread", + [this]() { this->wg_weighted_memory_ratio_refresh_thread(); }, &_threads.emplace_back()); if (config::enable_be_proc_monitor) { diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h index 9dfb079b904ad4..64c9f0c8993ae3 100644 --- a/be/src/common/daemon.h +++ b/be/src/common/daemon.h @@ -43,8 +43,9 @@ class Daemon { void memtable_memory_refresh_thread(); void calculate_metrics_thread(); void je_purge_dirty_pages_thread() const; + void cache_prune_stale_thread(); void report_runtime_query_statistics_thread(); - void wg_mem_used_refresh_thread(); + void wg_weighted_memory_ratio_refresh_thread(); void be_proc_monitor_thread(); CountDownLatch _stop_background_threads_latch; diff --git a/be/src/common/exception.cpp b/be/src/common/exception.cpp index c6139c0f995fa3..48e1229d44e83b 100644 --- a/be/src/common/exception.cpp +++ b/be/src/common/exception.cpp @@ -32,23 +32,4 @@ Exception::Exception(int code, const std::string_view& msg) { LOG(FATAL) << "[ExitOnException] error code: " << code << ", message: " << msg; } } - -Exception::Exception(const Exception& nested, int code, const std::string_view& msg) { - _code = code; - _err_msg = std::make_unique(); - _err_msg->_msg = msg; - if (ErrorCode::error_states[abs(code)].stacktrace) { - _err_msg->_stack = get_stack_trace(); - } - _nested_excption = std::make_unique(); - _nested_excption->_code = nested._code; - _nested_excption->_err_msg = std::make_unique(); - _nested_excption->_err_msg->_msg = nested._err_msg->_msg; - _nested_excption->_err_msg->_stack = 
nested._err_msg->_stack; - - if (config::exit_on_exception) { - LOG(FATAL) << "[ExitOnException] error code: " << code << ", message: " << msg; - } -} - } // namespace doris \ No newline at end of file diff --git a/be/src/common/exception.h b/be/src/common/exception.h index ce44e6587499b4..37d6afe937e0a2 100644 --- a/be/src/common/exception.h +++ b/be/src/common/exception.h @@ -19,8 +19,8 @@ #include #include -#include +#include #include #include #include @@ -39,9 +39,6 @@ class Exception : public std::exception { Exception() : _code(ErrorCode::OK) {} Exception(int code, const std::string_view& msg); Exception(const Status& status) : Exception(status.code(), status.msg()) {} - // add nested exception as first param, or the template may could not find - // the correct method for ...args - Exception(const Exception& nested, int code, const std::string_view& msg); // Format message with fmt::format, like the logging functions. template @@ -63,7 +60,6 @@ class Exception : public std::exception { std::string _stack; }; std::unique_ptr _err_msg; - std::unique_ptr _nested_excption; mutable std::string _cache_string; }; @@ -71,16 +67,12 @@ inline const std::string& Exception::to_string() const { if (!_cache_string.empty()) { return _cache_string; } - std::stringstream ostr; - ostr << "[E" << _code << "] "; - ostr << (_err_msg ? _err_msg->_msg : ""); + fmt::memory_buffer buf; + fmt::format_to(buf, "[E{}] {}", _code, _err_msg ? 
_err_msg->_msg : ""); if (_err_msg && !_err_msg->_stack.empty()) { - ostr << '\n' << _err_msg->_stack; + fmt::format_to(buf, "\n{}", _err_msg->_stack); } - if (_nested_excption != nullptr) { - ostr << '\n' << "Caused by:" << _nested_excption->to_string(); - } - _cache_string = ostr.str(); + _cache_string = fmt::to_string(buf); return _cache_string; } @@ -139,3 +131,26 @@ inline const std::string& Exception::to_string() const { } \ } \ } while (0); + +#define HANDLE_EXCEPTION_IF_CATCH_EXCEPTION(stmt, exception_handler) \ + do { \ + try { \ + doris::enable_thread_catch_bad_alloc++; \ + Defer defer {[&]() { doris::enable_thread_catch_bad_alloc--; }}; \ + { \ + Status _status_ = (stmt); \ + if (UNLIKELY(!_status_.ok())) { \ + exception_handler(doris::Exception()); \ + return _status_; \ + } \ + } \ + } catch (const doris::Exception& e) { \ + exception_handler(e); \ + if (e.code() == doris::ErrorCode::MEM_ALLOC_FAILED) { \ + return Status::MemoryLimitExceeded(fmt::format( \ + "PreCatch error code:{}, {}, __FILE__:{}, __LINE__:{}, __FUNCTION__:{}", \ + e.code(), e.to_string(), __FILE__, __LINE__, __PRETTY_FUNCTION__)); \ + } \ + return Status::Error(e.code(), e.to_string()); \ + } \ + } while (0); diff --git a/be/src/common/status.h b/be/src/common/status.h index c7b815e6b4e03f..11c7c42ac99496 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -17,11 +17,9 @@ #include #include "common/compiler_util.h" // IWYU pragma: keep -#ifdef ENABLE_STACKTRACE -#include "util/stack_util.h" -#endif - +#include "common/config.h" #include "common/expected.h" +#include "util/stack_util.h" namespace doris { @@ -363,9 +361,9 @@ class [[nodiscard]] Status { Status(int code, std::string msg, std::string stack = "") : _code(code) { _err_msg = std::make_unique(); _err_msg->_msg = std::move(msg); -#ifdef ENABLE_STACKTRACE - _err_msg->_stack = std::move(stack); -#endif + if (config::enable_stacktrace) { + _err_msg->_stack = std::move(stack); + } } // copy c'tor makes copy of 
error detail so Status can be returned by value @@ -416,13 +414,12 @@ class [[nodiscard]] Status { } else { status._err_msg->_msg = fmt::format(msg, std::forward(args)...); } -#ifdef ENABLE_STACKTRACE - if (stacktrace && ErrorCode::error_states[abs(code)].stacktrace) { + if (stacktrace && ErrorCode::error_states[abs(code)].stacktrace && + config::enable_stacktrace) { // Delete the first one frame pointers, which are inside the status.h status._err_msg->_stack = get_stack_trace(1); LOG(WARNING) << "meet error status: " << status; // may print too many stacks. } -#endif return status; } @@ -436,12 +433,11 @@ class [[nodiscard]] Status { } else { status._err_msg->_msg = fmt::format(msg, std::forward(args)...); } -#ifdef ENABLE_STACKTRACE - if (stacktrace && ErrorCode::error_states[abs(code)].stacktrace) { + if (stacktrace && ErrorCode::error_states[abs(code)].stacktrace && + config::enable_stacktrace) { status._err_msg->_stack = get_stack_trace(1); LOG(WARNING) << "meet error status: " << status; // may print too many stacks. 
} -#endif return status; } @@ -545,9 +541,7 @@ class [[nodiscard]] Status { int _code; struct ErrMsg { std::string _msg; -#ifdef ENABLE_STACKTRACE std::string _stack; -#endif }; std::unique_ptr _err_msg; @@ -604,11 +598,9 @@ class AtomicStatus { inline std::ostream& operator<<(std::ostream& ostr, const Status& status) { ostr << '[' << status.code_as_string() << ']'; ostr << status.msg(); -#ifdef ENABLE_STACKTRACE - if (status._err_msg && !status._err_msg->_stack.empty()) { + if (status._err_msg && !status._err_msg->_stack.empty() && config::enable_stacktrace) { ostr << '\n' << status._err_msg->_stack; } -#endif return ostr; } diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index 778444346d3d4c..c2199cd6be9360 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -44,6 +44,7 @@ #include "vec/io/io_helper.h" #include "vec/runtime/ipv4_value.h" #include "vec/runtime/ipv6_value.h" +#include "vec/runtime/time_value.h" #include "vec/runtime/vdatetime_value.h" namespace doris { @@ -70,6 +71,8 @@ std::string cast_to_string(T value, int scale) { std::stringstream ss; ss << buf; return ss.str(); + } else if constexpr (primitive_type == TYPE_TIMEV2) { + return TimeValue::to_string(value, scale); } else if constexpr (primitive_type == TYPE_IPV4) { return IPv4Value::to_string(value); } else if constexpr (primitive_type == TYPE_IPV6) { diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index de9857bad2ce83..2ddb3db295b487 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -48,10 +48,14 @@ #include "exec/schema_scanner/schema_user_scanner.h" #include "exec/schema_scanner/schema_variables_scanner.h" #include "exec/schema_scanner/schema_views_scanner.h" +#include "exec/schema_scanner/schema_workload_group_privileges.h" #include "exec/schema_scanner/schema_workload_groups_scanner.h" #include "exec/schema_scanner/schema_workload_sched_policy_scanner.h" #include "olap/hll.h" +#include 
"pipeline/dependency.h" #include "runtime/define_primitive_type.h" +#include "runtime/fragment_mgr.h" +#include "runtime/types.h" #include "util/string_util.h" #include "util/types.h" #include "vec/columns/column.h" @@ -65,6 +69,7 @@ #include "vec/core/column_with_type_and_name.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" namespace doris { class ObjectPool; @@ -85,7 +90,60 @@ Status SchemaScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaScanner::get_next_block(RuntimeState* state, vectorized::Block* block, bool* eos) { + if (_data_block == nullptr) { + return Status::InternalError("No data left!"); + } + DCHECK(_async_thread_running == false); + RETURN_IF_ERROR(_scanner_status.status()); + for (size_t i = 0; i < block->columns(); i++) { + std::move(*block->get_by_position(i).column) + .mutate() + ->insert_range_from(*_data_block->get_by_position(i).column, 0, + _data_block->rows()); + } + _data_block->clear_column_data(); + *eos = _eos; + if (!*eos) { + RETURN_IF_ERROR(get_next_block_async(state)); + } + return Status::OK(); +} + +Status SchemaScanner::get_next_block_async(RuntimeState* state) { + _dependency->block(); + auto task_ctx = state->get_task_execution_context(); + RETURN_IF_ERROR(ExecEnv::GetInstance()->fragment_mgr()->get_thread_pool()->submit_func( + [this, task_ctx, state]() { + DCHECK(_async_thread_running == false); + auto task_lock = task_ctx.lock(); + if (task_lock == nullptr) { + _scanner_status.update(Status::InternalError("Task context not exists!")); + return; + } + SCOPED_ATTACH_TASK(state); + _dependency->block(); + _async_thread_running = true; + _finish_dependency->block(); + if (!_opened) { + _data_block = vectorized::Block::create_unique(); + _init_block(_data_block.get()); + _scanner_status.update(start(state)); + _opened = true; + } + bool eos = false; + 
_scanner_status.update(get_next_block_internal(_data_block.get(), &eos)); + _eos = eos; + _async_thread_running = false; + _dependency->set_ready(); + if (eos) { + _finish_dependency->set_ready(); + } + })); + return Status::OK(); +} + +Status SchemaScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("used before initialized."); } @@ -170,12 +228,24 @@ std::unique_ptr SchemaScanner::create(TSchemaTableType::type type return SchemaWorkloadSchedulePolicyScanner::create_unique(); case TSchemaTableType::SCH_TABLE_OPTIONS: return SchemaTableOptionsScanner::create_unique(); + case TSchemaTableType::SCH_WORKLOAD_GROUP_PRIVILEGES: + return SchemaWorkloadGroupPrivilegesScanner::create_unique(); default: return SchemaDummyScanner::create_unique(); break; } } +void SchemaScanner::_init_block(vectorized::Block* src_block) { + const std::vector& columns_desc(get_column_desc()); + for (int i = 0; i < columns_desc.size(); ++i) { + TypeDescriptor descriptor(columns_desc[i].type); + auto data_type = vectorized::DataTypeFactory::instance().create_data_type(descriptor, true); + src_block->insert(vectorized::ColumnWithTypeAndName(data_type->create_column(), data_type, + columns_desc[i].name)); + } +} + Status SchemaScanner::fill_dest_column_for_range(vectorized::Block* block, size_t pos, const std::vector& datas) { const ColumnDesc& col_desc = _columns[pos]; diff --git a/be/src/exec/schema_scanner.h b/be/src/exec/schema_scanner.h index a23706ac6a440a..da61d58b943fc4 100644 --- a/be/src/exec/schema_scanner.h +++ b/be/src/exec/schema_scanner.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -43,6 +44,10 @@ namespace vectorized { class Block; } +namespace pipeline { +class Dependency; +} + struct SchemaScannerCommonParam { SchemaScannerCommonParam() : db(nullptr), @@ -64,6 +69,7 @@ struct SchemaScannerCommonParam { int32_t port; // frontend thrift port int64_t thread_id; const std::string* 
catalog = nullptr; + std::set fe_addr_list; }; // scanner parameter from frontend @@ -94,15 +100,23 @@ class SchemaScanner { // init object need information, schema etc. virtual Status init(SchemaScannerParam* param, ObjectPool* pool); + Status get_next_block(RuntimeState* state, vectorized::Block* block, bool* eos); // Start to work virtual Status start(RuntimeState* state); - virtual Status get_next_block(vectorized::Block* block, bool* eos); + virtual Status get_next_block_internal(vectorized::Block* block, bool* eos); const std::vector& get_column_desc() const { return _columns; } // factory function static std::unique_ptr create(TSchemaTableType::type type); TSchemaTableType::type type() const { return _schema_table_type; } + void set_dependency(std::shared_ptr dep, + std::shared_ptr fin_dep) { + _dependency = dep; + _finish_dependency = fin_dep; + } + Status get_next_block_async(RuntimeState* state); protected: + void _init_block(vectorized::Block* src_block); Status fill_dest_column_for_range(vectorized::Block* block, size_t pos, const std::vector& datas); @@ -125,6 +139,15 @@ class SchemaScanner { RuntimeProfile::Counter* _get_table_timer = nullptr; RuntimeProfile::Counter* _get_describe_timer = nullptr; RuntimeProfile::Counter* _fill_block_timer = nullptr; + + std::shared_ptr _dependency = nullptr; + std::shared_ptr _finish_dependency = nullptr; + + std::unique_ptr _data_block; + AtomicStatus _scanner_status; + std::atomic _eos = false; + std::atomic _opened = false; + std::atomic _async_thread_running = false; }; } // namespace doris diff --git a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp index 2115a38a6ebce3..46522a36242fc1 100644 --- a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp @@ -137,7 +137,7 @@ Status SchemaActiveQueriesScanner::_get_active_queries_block_from_fe() { return Status::OK(); 
} -Status SchemaActiveQueriesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaActiveQueriesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_active_queries_scanner.h b/be/src/exec/schema_scanner/schema_active_queries_scanner.h index 1df5b1f9d7402d..7e9ae4b8034083 100644 --- a/be/src/exec/schema_scanner/schema_active_queries_scanner.h +++ b/be/src/exec/schema_scanner/schema_active_queries_scanner.h @@ -36,7 +36,7 @@ class SchemaActiveQueriesScanner : public SchemaScanner { ~SchemaActiveQueriesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp b/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp index f1155796ed434d..b35e84a9f9c9f4 100644 --- a/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp +++ b/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp @@ -51,7 +51,8 @@ Status SchemaBackendActiveTasksScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaBackendActiveTasksScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaBackendActiveTasksScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_backend_active_tasks.h b/be/src/exec/schema_scanner/schema_backend_active_tasks.h index d8a2a1ffa3f96a..43819818b57f69 100644 --- a/be/src/exec/schema_scanner/schema_backend_active_tasks.h +++ b/be/src/exec/schema_scanner/schema_backend_active_tasks.h @@ -36,7 +36,7 @@ class SchemaBackendActiveTasksScanner : public SchemaScanner { 
~SchemaBackendActiveTasksScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_charsets_scanner.cpp b/be/src/exec/schema_scanner/schema_charsets_scanner.cpp index 534f045341b7e3..d06cd8fa745634 100644 --- a/be/src/exec/schema_scanner/schema_charsets_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_charsets_scanner.cpp @@ -48,7 +48,7 @@ SchemaCharsetsScanner::SchemaCharsetsScanner() SchemaCharsetsScanner::~SchemaCharsetsScanner() {} -Status SchemaCharsetsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaCharsetsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_charsets_scanner.h b/be/src/exec/schema_scanner/schema_charsets_scanner.h index 1f01070875ccf6..d5089c62826b0b 100644 --- a/be/src/exec/schema_scanner/schema_charsets_scanner.h +++ b/be/src/exec/schema_scanner/schema_charsets_scanner.h @@ -36,7 +36,7 @@ class SchemaCharsetsScanner : public SchemaScanner { SchemaCharsetsScanner(); ~SchemaCharsetsScanner() override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct CharsetStruct { diff --git a/be/src/exec/schema_scanner/schema_collations_scanner.cpp b/be/src/exec/schema_scanner/schema_collations_scanner.cpp index 9d50b5216303d8..8592eb7575c387 100644 --- a/be/src/exec/schema_scanner/schema_collations_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_collations_scanner.cpp @@ -50,7 +50,7 @@ SchemaCollationsScanner::SchemaCollationsScanner() SchemaCollationsScanner::~SchemaCollationsScanner() {} -Status 
SchemaCollationsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaCollationsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_collations_scanner.h b/be/src/exec/schema_scanner/schema_collations_scanner.h index f0f60538cacce0..2fe200da78d04d 100644 --- a/be/src/exec/schema_scanner/schema_collations_scanner.h +++ b/be/src/exec/schema_scanner/schema_collations_scanner.h @@ -36,7 +36,7 @@ class SchemaCollationsScanner : public SchemaScanner { SchemaCollationsScanner(); ~SchemaCollationsScanner() override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct CollationStruct { diff --git a/be/src/exec/schema_scanner/schema_columns_scanner.cpp b/be/src/exec/schema_scanner/schema_columns_scanner.cpp index deda8af7d8de58..f4e15d2aef0af2 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_columns_scanner.cpp @@ -347,7 +347,7 @@ Status SchemaColumnsScanner::_get_new_table() { return Status::OK(); } -Status SchemaColumnsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaColumnsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("use this class before inited."); } diff --git a/be/src/exec/schema_scanner/schema_columns_scanner.h b/be/src/exec/schema_scanner/schema_columns_scanner.h index 2499db7ed82a2b..99150c36d109a2 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.h +++ b/be/src/exec/schema_scanner/schema_columns_scanner.h @@ -38,7 +38,7 @@ class SchemaColumnsScanner : public SchemaScanner { SchemaColumnsScanner(); ~SchemaColumnsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* 
block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_dummy_scanner.cpp b/be/src/exec/schema_scanner/schema_dummy_scanner.cpp index 1d5956f390ea26..9e3a703d9fb5d6 100644 --- a/be/src/exec/schema_scanner/schema_dummy_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_dummy_scanner.cpp @@ -40,7 +40,7 @@ Status SchemaDummyScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaDummyScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaDummyScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { *eos = true; return Status::OK(); } diff --git a/be/src/exec/schema_scanner/schema_dummy_scanner.h b/be/src/exec/schema_scanner/schema_dummy_scanner.h index a67f6fa25c1648..0c5e4aabe357e4 100644 --- a/be/src/exec/schema_scanner/schema_dummy_scanner.h +++ b/be/src/exec/schema_scanner/schema_dummy_scanner.h @@ -33,7 +33,7 @@ class SchemaDummyScanner : public SchemaScanner { SchemaDummyScanner(); ~SchemaDummyScanner() override; Status start(RuntimeState* state = nullptr) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; }; } // namespace doris diff --git a/be/src/exec/schema_scanner/schema_files_scanner.cpp b/be/src/exec/schema_scanner/schema_files_scanner.cpp index 55b7a338c319e8..20aa07fa69116c 100644 --- a/be/src/exec/schema_scanner/schema_files_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_files_scanner.cpp @@ -113,7 +113,7 @@ Status SchemaFilesScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaFilesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaFilesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git 
a/be/src/exec/schema_scanner/schema_files_scanner.h b/be/src/exec/schema_scanner/schema_files_scanner.h index 6805a04be4aacc..bb3b2d68493147 100644 --- a/be/src/exec/schema_scanner/schema_files_scanner.h +++ b/be/src/exec/schema_scanner/schema_files_scanner.h @@ -38,7 +38,7 @@ class SchemaFilesScanner : public SchemaScanner { ~SchemaFilesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; int _db_index; int _table_index; diff --git a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp index 928567a2e4a99e..1267c32c8d8dfb 100644 --- a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp @@ -225,7 +225,7 @@ Status SchemaMetadataNameIdsScanner::_fill_block_impl(vectorized::Block* block) return Status::OK(); } -Status SchemaMetadataNameIdsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaMetadataNameIdsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h index 9981d441d856aa..c3beea7769754d 100644 --- a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h +++ b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h @@ -39,7 +39,7 @@ class SchemaMetadataNameIdsScanner : public SchemaScanner { ~SchemaMetadataNameIdsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git 
a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp index f1ad1f594f883f..ea7394e15e12d2 100644 --- a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp @@ -101,7 +101,7 @@ Status SchemaPartitionsScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaPartitionsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaPartitionsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_partitions_scanner.h b/be/src/exec/schema_scanner/schema_partitions_scanner.h index 47e1d1fcf87d15..87e55db984a3de 100644 --- a/be/src/exec/schema_scanner/schema_partitions_scanner.h +++ b/be/src/exec/schema_scanner/schema_partitions_scanner.h @@ -38,7 +38,7 @@ class SchemaPartitionsScanner : public SchemaScanner { ~SchemaPartitionsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; int _db_index; int _table_index; diff --git a/be/src/exec/schema_scanner/schema_processlist_scanner.cpp b/be/src/exec/schema_scanner/schema_processlist_scanner.cpp index 0f270a6a8c1777..185ef2ab44237f 100644 --- a/be/src/exec/schema_scanner/schema_processlist_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_processlist_scanner.cpp @@ -56,14 +56,19 @@ Status SchemaProcessListScanner::start(RuntimeState* state) { TShowProcessListRequest request; request.__set_show_full_sql(true); - RETURN_IF_ERROR(SchemaHelper::show_process_list(*(_param->common_param->ip), - _param->common_param->port, request, - &_process_list_result)); + for (const auto& fe_addr : _param->common_param->fe_addr_list) { + TShowProcessListResult tmp_ret; + RETURN_IF_ERROR( 
+ SchemaHelper::show_process_list(fe_addr.hostname, fe_addr.port, request, &tmp_ret)); + _process_list_result.process_list.insert(_process_list_result.process_list.end(), + tmp_ret.process_list.begin(), + tmp_ret.process_list.end()); + } return Status::OK(); } -Status SchemaProcessListScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaProcessListScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_processlist_scanner.h b/be/src/exec/schema_scanner/schema_processlist_scanner.h index 8aae87e1ef6d0f..c0b0a47f6154ee 100644 --- a/be/src/exec/schema_scanner/schema_processlist_scanner.h +++ b/be/src/exec/schema_scanner/schema_processlist_scanner.h @@ -40,7 +40,7 @@ class SchemaProcessListScanner : public SchemaScanner { ~SchemaProcessListScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_processlist_columns; diff --git a/be/src/exec/schema_scanner/schema_profiling_scanner.cpp b/be/src/exec/schema_scanner/schema_profiling_scanner.cpp index 2f71eb96f2613a..0a2a64330bb018 100644 --- a/be/src/exec/schema_scanner/schema_profiling_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_profiling_scanner.cpp @@ -88,7 +88,7 @@ Status SchemaProfilingScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaProfilingScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaProfilingScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_profiling_scanner.h b/be/src/exec/schema_scanner/schema_profiling_scanner.h index 5399cb14eb43f5..6b969a478aca69 100644 --- 
a/be/src/exec/schema_scanner/schema_profiling_scanner.h +++ b/be/src/exec/schema_scanner/schema_profiling_scanner.h @@ -38,7 +38,7 @@ class SchemaProfilingScanner : public SchemaScanner { ~SchemaProfilingScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; }; diff --git a/be/src/exec/schema_scanner/schema_routine_scanner.cpp b/be/src/exec/schema_scanner/schema_routine_scanner.cpp index 3d55addee6c093..8c263c99d2d6c8 100644 --- a/be/src/exec/schema_scanner/schema_routine_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_routine_scanner.cpp @@ -141,7 +141,7 @@ Status SchemaRoutinesScanner::get_block_from_fe() { return Status::OK(); } -Status SchemaRoutinesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaRoutinesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_routine_scanner.h b/be/src/exec/schema_scanner/schema_routine_scanner.h index 543f9e8e8f684a..c60d72340e1104 100644 --- a/be/src/exec/schema_scanner/schema_routine_scanner.h +++ b/be/src/exec/schema_scanner/schema_routine_scanner.h @@ -36,7 +36,7 @@ class SchemaRoutinesScanner : public SchemaScanner { ~SchemaRoutinesScanner() override = default; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp index 6ece8e22331e38..16d5f2daba61e7 100644 --- a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp +++ 
b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp @@ -97,7 +97,7 @@ Status SchemaRowsetsScanner::_get_all_rowsets() { return Status::OK(); } -Status SchemaRowsetsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaRowsetsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_rowsets_scanner.h b/be/src/exec/schema_scanner/schema_rowsets_scanner.h index b975cc4231bc20..cad34fc04945e4 100644 --- a/be/src/exec/schema_scanner/schema_rowsets_scanner.h +++ b/be/src/exec/schema_scanner/schema_rowsets_scanner.h @@ -40,7 +40,7 @@ class SchemaRowsetsScanner : public SchemaScanner { ~SchemaRowsetsScanner() override = default; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_all_rowsets(); diff --git a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp index 9789b6c72d6f30..f529821e5a54e2 100644 --- a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp @@ -82,7 +82,7 @@ Status SchemaSchemaPrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaSchemaPrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaSchemaPrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h index af2ad49634bd49..9522fba908bb2a 100644 --- a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h +++ 
b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h @@ -38,7 +38,7 @@ class SchemaSchemaPrivilegesScanner : public SchemaScanner { ~SchemaSchemaPrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_schemata_scanner.cpp b/be/src/exec/schema_scanner/schema_schemata_scanner.cpp index 1854e4f2b54af1..618e831c90e219 100644 --- a/be/src/exec/schema_scanner/schema_schemata_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_schemata_scanner.cpp @@ -81,7 +81,7 @@ Status SchemaSchemataScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaSchemataScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaSchemataScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before Initialized."); } diff --git a/be/src/exec/schema_scanner/schema_schemata_scanner.h b/be/src/exec/schema_scanner/schema_schemata_scanner.h index 46fad31af1fd5e..39a5ddda495bdd 100644 --- a/be/src/exec/schema_scanner/schema_schemata_scanner.h +++ b/be/src/exec/schema_scanner/schema_schemata_scanner.h @@ -38,7 +38,7 @@ class SchemaSchemataScanner : public SchemaScanner { ~SchemaSchemataScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _fill_block_impl(vectorized::Block* block); diff --git a/be/src/exec/schema_scanner/schema_table_options_scanner.cpp b/be/src/exec/schema_scanner/schema_table_options_scanner.cpp index 604da59b6377b6..e0481599a2f004 100644 --- a/be/src/exec/schema_scanner/schema_table_options_scanner.cpp +++ 
b/be/src/exec/schema_scanner/schema_table_options_scanner.cpp @@ -103,7 +103,7 @@ Status SchemaTableOptionsScanner::get_block_from_fe() { return Status::OK(); } -Status SchemaTableOptionsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaTableOptionsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_table_options_scanner.h b/be/src/exec/schema_scanner/schema_table_options_scanner.h index d40f1b73c633c5..95c8bdb89076d7 100644 --- a/be/src/exec/schema_scanner/schema_table_options_scanner.h +++ b/be/src/exec/schema_scanner/schema_table_options_scanner.h @@ -36,7 +36,7 @@ class SchemaTableOptionsScanner : public SchemaScanner { ~SchemaTableOptionsScanner() override = default; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp index fe8aa725b73b80..cdeac2b70dcadd 100644 --- a/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp @@ -84,7 +84,7 @@ Status SchemaTablePrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaTablePrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaTablePrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_table_privileges_scanner.h b/be/src/exec/schema_scanner/schema_table_privileges_scanner.h index aa79c88304b7c5..4cfcc16d3583ce 100644 --- 
a/be/src/exec/schema_scanner/schema_table_privileges_scanner.h +++ b/be/src/exec/schema_scanner/schema_table_privileges_scanner.h @@ -38,7 +38,7 @@ class SchemaTablePrivilegesScanner : public SchemaScanner { ~SchemaTablePrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_tables_scanner.cpp b/be/src/exec/schema_scanner/schema_tables_scanner.cpp index 093acf9cecbcb1..23710b81971c15 100644 --- a/be/src/exec/schema_scanner/schema_tables_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_tables_scanner.cpp @@ -342,7 +342,7 @@ Status SchemaTablesScanner::_fill_block_impl(vectorized::Block* block) { return Status::OK(); } -Status SchemaTablesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaTablesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_tables_scanner.h b/be/src/exec/schema_scanner/schema_tables_scanner.h index 11a96bf65d5271..7f8eb11f397e06 100644 --- a/be/src/exec/schema_scanner/schema_tables_scanner.h +++ b/be/src/exec/schema_scanner/schema_tables_scanner.h @@ -39,7 +39,7 @@ class SchemaTablesScanner : public SchemaScanner { ~SchemaTablesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp index 6a12d846fbd560..3eeabc0e4a0917 100644 --- a/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp +++ 
b/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp @@ -81,7 +81,7 @@ Status SchemaUserPrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaUserPrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaUserPrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_user_privileges_scanner.h b/be/src/exec/schema_scanner/schema_user_privileges_scanner.h index eb8f3c63f1433b..ffc3840db676c4 100644 --- a/be/src/exec/schema_scanner/schema_user_privileges_scanner.h +++ b/be/src/exec/schema_scanner/schema_user_privileges_scanner.h @@ -38,7 +38,7 @@ class SchemaUserPrivilegesScanner : public SchemaScanner { ~SchemaUserPrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_user_scanner.cpp b/be/src/exec/schema_scanner/schema_user_scanner.cpp index 9b153414380350..e56f18f05aea93 100644 --- a/be/src/exec/schema_scanner/schema_user_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_user_scanner.cpp @@ -76,7 +76,7 @@ Status SchemaUserScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaUserScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaUserScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_user_scanner.h b/be/src/exec/schema_scanner/schema_user_scanner.h index c55f216804d5dd..bdc618eb5a0332 100644 --- a/be/src/exec/schema_scanner/schema_user_scanner.h +++ b/be/src/exec/schema_scanner/schema_user_scanner.h @@ -40,7 +40,7 @@ class 
SchemaUserScanner : public SchemaScanner { ~SchemaUserScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_user_columns; diff --git a/be/src/exec/schema_scanner/schema_variables_scanner.cpp b/be/src/exec/schema_scanner/schema_variables_scanner.cpp index 546a0a471cfb01..ad4d5d072cb03f 100644 --- a/be/src/exec/schema_scanner/schema_variables_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_variables_scanner.cpp @@ -40,7 +40,8 @@ std::vector SchemaVariablesScanner::_s_vars_columns = // name, type, size {"VARIABLE_NAME", TYPE_VARCHAR, sizeof(StringRef), false}, {"VARIABLE_VALUE", TYPE_VARCHAR, sizeof(StringRef), false}, -}; + {"DEFAULT_VALUE", TYPE_VARCHAR, sizeof(StringRef), false}, + {"CHANGED", TYPE_VARCHAR, sizeof(StringRef), false}}; SchemaVariablesScanner::SchemaVariablesScanner(TVarType::type type) : SchemaScanner(_s_vars_columns, TSchemaTableType::SCH_VARIABLES), _type(type) {} @@ -70,7 +71,7 @@ Status SchemaVariablesScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaVariablesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaVariablesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } @@ -94,7 +95,7 @@ Status SchemaVariablesScanner::_fill_block_impl(vectorized::Block* block) { std::vector strs(row_num); int idx = 0; for (auto& it : _var_result.variables) { - strs[idx] = StringRef(it.first.c_str(), it.first.size()); + strs[idx] = StringRef(it[0].c_str(), it[0].size()); datas[idx] = strs.data() + idx; ++idx; } @@ -105,12 +106,34 @@ Status SchemaVariablesScanner::_fill_block_impl(vectorized::Block* block) { std::vector strs(row_num); int idx = 0; for (auto& it : _var_result.variables) { - strs[idx] = StringRef(it.second.c_str(), 
it.second.size()); + strs[idx] = StringRef(it[1].c_str(), it[1].size()); datas[idx] = strs.data() + idx; ++idx; } RETURN_IF_ERROR(fill_dest_column_for_range(block, 1, datas)); } + // default value + { + std::vector strs(row_num); + int idx = 0; + for (auto& it : _var_result.variables) { + strs[idx] = StringRef(it[2].c_str(), it[2].size()); + datas[idx] = strs.data() + idx; + ++idx; + } + RETURN_IF_ERROR(fill_dest_column_for_range(block, 2, datas)); + } + // changed + { + std::vector strs(row_num); + int idx = 0; + for (auto& it : _var_result.variables) { + strs[idx] = StringRef(it[3].c_str(), it[3].size()); + datas[idx] = strs.data() + idx; + ++idx; + } + RETURN_IF_ERROR(fill_dest_column_for_range(block, 3, datas)); + } return Status::OK(); } diff --git a/be/src/exec/schema_scanner/schema_variables_scanner.h b/be/src/exec/schema_scanner/schema_variables_scanner.h index 2d207ff8b2e6c2..31bbacf713be0f 100644 --- a/be/src/exec/schema_scanner/schema_variables_scanner.h +++ b/be/src/exec/schema_scanner/schema_variables_scanner.h @@ -40,7 +40,7 @@ class SchemaVariablesScanner : public SchemaScanner { ~SchemaVariablesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct VariableStruct { diff --git a/be/src/exec/schema_scanner/schema_views_scanner.cpp b/be/src/exec/schema_scanner/schema_views_scanner.cpp index 6c3b5f2e21bc3a..f47766ef3567ad 100644 --- a/be/src/exec/schema_scanner/schema_views_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_views_scanner.cpp @@ -113,7 +113,7 @@ Status SchemaViewsScanner::_get_new_table() { return Status::OK(); } -Status SchemaViewsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaViewsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff 
--git a/be/src/exec/schema_scanner/schema_views_scanner.h b/be/src/exec/schema_scanner/schema_views_scanner.h index bc473057905a12..b86ad922e5e76a 100644 --- a/be/src/exec/schema_scanner/schema_views_scanner.h +++ b/be/src/exec/schema_scanner/schema_views_scanner.h @@ -38,7 +38,7 @@ class SchemaViewsScanner : public SchemaScanner { ~SchemaViewsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp b/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp new file mode 100644 index 00000000000000..a1d4568d9053cd --- /dev/null +++ b/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "exec/schema_scanner/schema_workload_group_privileges.h" + +#include "runtime/client_cache.h" +#include "runtime/exec_env.h" +#include "runtime/runtime_state.h" +#include "util/thrift_rpc_helper.h" +#include "vec/common/string_ref.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_factory.hpp" + +namespace doris { +std::vector SchemaWorkloadGroupPrivilegesScanner::_s_tbls_columns = { + {"GRANTEE", TYPE_VARCHAR, sizeof(StringRef), true}, + {"WORKLOAD_GROUP_NAME", TYPE_VARCHAR, sizeof(StringRef), true}, + {"PRIVILEGE_TYPE", TYPE_VARCHAR, sizeof(StringRef), true}, + {"IS_GRANTABLE", TYPE_VARCHAR, sizeof(StringRef), true}, +}; + +SchemaWorkloadGroupPrivilegesScanner::SchemaWorkloadGroupPrivilegesScanner() + : SchemaScanner(_s_tbls_columns, TSchemaTableType::SCH_WORKLOAD_GROUPS) {} + +SchemaWorkloadGroupPrivilegesScanner::~SchemaWorkloadGroupPrivilegesScanner() {} + +Status SchemaWorkloadGroupPrivilegesScanner::start(RuntimeState* state) { + _block_rows_limit = state->batch_size(); + _rpc_timeout = state->execution_timeout() * 1000; + return Status::OK(); +} + +Status SchemaWorkloadGroupPrivilegesScanner::_get_workload_group_privs_block_from_fe() { + TNetworkAddress master_addr = ExecEnv::GetInstance()->master_info()->network_address; + + TSchemaTableRequestParams schema_table_request_params; + for (int i = 0; i < _s_tbls_columns.size(); i++) { + schema_table_request_params.__isset.columns_name = true; + schema_table_request_params.columns_name.emplace_back(_s_tbls_columns[i].name); + } + schema_table_request_params.__set_current_user_ident(*_param->common_param->current_user_ident); + + TFetchSchemaTableDataRequest request; + request.__set_schema_table_name(TSchemaTableName::WORKLOAD_GROUP_PRIVILEGES); + request.__set_schema_table_params(schema_table_request_params); + + TFetchSchemaTableDataResult result; + + RETURN_IF_ERROR(ThriftRpcHelper::rpc( + master_addr.hostname, master_addr.port, + [&request, &result](FrontendServiceConnection& 
client) { + client->fetchSchemaTableData(result, request); + }, + _rpc_timeout)); + + Status status(Status::create(result.status)); + if (!status.ok()) { + LOG(WARNING) << "fetch workload group privileges from FE failed, errmsg=" << status; + return status; + } + std::vector result_data = result.data_batch; + + _workload_groups_privs_block = vectorized::Block::create_unique(); + for (int i = 0; i < _s_tbls_columns.size(); ++i) { + TypeDescriptor descriptor(_s_tbls_columns[i].type); + auto data_type = vectorized::DataTypeFactory::instance().create_data_type(descriptor, true); + _workload_groups_privs_block->insert(vectorized::ColumnWithTypeAndName( + data_type->create_column(), data_type, _s_tbls_columns[i].name)); + } + + if (result_data.size() > 0) { + int col_size = result_data[0].column_value.size(); + if (col_size != _s_tbls_columns.size()) { + return Status::InternalError( + "workload group privileges schema is not match for FE and BE"); + } + } + + _workload_groups_privs_block->reserve(result_data.size()); + + for (int i = 0; i < result_data.size(); i++) { + TRow row = result_data[i]; + + for (int j = 0; j < _s_tbls_columns.size(); j++) { + RETURN_IF_ERROR(insert_block_column(row.column_value[j], j, + _workload_groups_privs_block.get(), + _s_tbls_columns[j].type)); + } + } + return Status::OK(); +} + +Status SchemaWorkloadGroupPrivilegesScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { + if (!_is_init) { + return Status::InternalError("Used before initialized."); + } + + if (nullptr == block || nullptr == eos) { + return Status::InternalError("input pointer is nullptr."); + } + + if (_workload_groups_privs_block == nullptr) { + RETURN_IF_ERROR(_get_workload_group_privs_block_from_fe()); + _total_rows = _workload_groups_privs_block->rows(); + } + + if (_row_idx == _total_rows) { + *eos = true; + return Status::OK(); + } + + int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); + vectorized::MutableBlock mblock = 
vectorized::MutableBlock::build_mutable_block(block); + RETURN_IF_ERROR( + mblock.add_rows(_workload_groups_privs_block.get(), _row_idx, current_batch_rows)); + _row_idx += current_batch_rows; + + *eos = _row_idx == _total_rows; + return Status::OK(); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/exec/schema_scanner/schema_workload_group_privileges.h b/be/src/exec/schema_scanner/schema_workload_group_privileges.h new file mode 100644 index 00000000000000..0a7bf1258eed1f --- /dev/null +++ b/be/src/exec/schema_scanner/schema_workload_group_privileges.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "common/status.h" +#include "exec/schema_scanner.h" + +namespace doris { +class RuntimeState; +namespace vectorized { +class Block; +} // namespace vectorized + +class SchemaWorkloadGroupPrivilegesScanner : public SchemaScanner { + ENABLE_FACTORY_CREATOR(SchemaWorkloadGroupPrivilegesScanner); + +public: + SchemaWorkloadGroupPrivilegesScanner(); + ~SchemaWorkloadGroupPrivilegesScanner() override; + + Status start(RuntimeState* state) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; + + static std::vector _s_tbls_columns; + +private: + Status _get_workload_group_privs_block_from_fe(); + + int _block_rows_limit = 4096; + int _row_idx = 0; + int _total_rows = 0; + std::unique_ptr _workload_groups_privs_block = nullptr; + int _rpc_timeout = 3000; +}; +}; // namespace doris \ No newline at end of file diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp index def52df531df7b..dd81a3ecb267da 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp @@ -42,6 +42,8 @@ std::vector SchemaWorkloadGroupsScanner::_s_tbls_colu {"SPILL_THRESHOLD_LOW_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, {"SPILL_THRESHOLD_HIGH_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, {"TAG", TYPE_VARCHAR, sizeof(StringRef), true}, + {"READ_BYTES_PER_SECOND", TYPE_BIGINT, sizeof(int64_t), true}, + {"REMOTE_READ_BYTES_PER_SECOND", TYPE_BIGINT, sizeof(int64_t), true}, }; SchemaWorkloadGroupsScanner::SchemaWorkloadGroupsScanner() @@ -114,7 +116,7 @@ Status SchemaWorkloadGroupsScanner::_get_workload_groups_block_from_fe() { return Status::OK(); } -Status SchemaWorkloadGroupsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaWorkloadGroupsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) 
{ return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.h b/be/src/exec/schema_scanner/schema_workload_groups_scanner.h index bf7a103526dc80..3121c4dbac149e 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.h +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.h @@ -36,7 +36,7 @@ class SchemaWorkloadGroupsScanner : public SchemaScanner { ~SchemaWorkloadGroupsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp index 035d3bfe217aec..2d91f151f5f2bb 100644 --- a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp @@ -106,7 +106,8 @@ Status SchemaWorkloadSchedulePolicyScanner::_get_workload_schedule_policy_block_ return Status::OK(); } -Status SchemaWorkloadSchedulePolicyScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaWorkloadSchedulePolicyScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h index 5284975fe66b31..da8d9f15c4989e 100644 --- a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h +++ b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h @@ -36,7 +36,7 @@ class SchemaWorkloadSchedulePolicyScanner : public SchemaScanner { ~SchemaWorkloadSchedulePolicyScanner() override; Status start(RuntimeState* state) override; - Status 
get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index a831395a5eabf1..e88f692a23db8a 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -100,8 +100,12 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { virtual ~BloomFilterFuncBase() = default; void init_params(const RuntimeFilterParams* params) { - _bloom_filter_length = params->bloom_filter_size; + _bloom_filter_length = + params->runtime_bloom_filter_min_size > 0 + ? std::max(params->bloom_filter_size, params->runtime_bloom_filter_min_size) + : params->bloom_filter_size; _build_bf_exactly = params->build_bf_exactly; + _runtime_bloom_filter_min_size = params->runtime_bloom_filter_min_size; _null_aware = params->null_aware; _bloom_filter_size_calculated_by_ndv = params->bloom_filter_size_calculated_by_ndv; } @@ -124,9 +128,16 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { // if FE do use ndv stat to predict the bf size, BE only use the row count. FE have more // exactly row count stat. which one is min is more correctly. if (_bloom_filter_size_calculated_by_ndv) { - _bloom_filter_length = std::min(be_calculate_size, _bloom_filter_length); + _bloom_filter_length = + _runtime_bloom_filter_min_size > 0 + ? std::max(_runtime_bloom_filter_min_size, + std::min(be_calculate_size, _bloom_filter_length)) + : std::min(be_calculate_size, _bloom_filter_length); } else { - _bloom_filter_length = be_calculate_size; + _bloom_filter_length = + _runtime_bloom_filter_min_size > 0 + ? 
std::max(_runtime_bloom_filter_min_size, be_calculate_size) + : be_calculate_size; } } return init_with_fixed_length(_bloom_filter_length); @@ -221,8 +232,9 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { // bloom filter size int32_t _bloom_filter_alloced; std::shared_ptr _bloom_filter; - bool _inited {}; + bool _inited = false; int64_t _bloom_filter_length; + int64_t _runtime_bloom_filter_min_size; bool _build_bf_exactly = false; bool _bloom_filter_size_calculated_by_ndv = false; }; diff --git a/be/src/exprs/create_predicate_function.h b/be/src/exprs/create_predicate_function.h index 11889ff2ec349b..4808caa00f37d0 100644 --- a/be/src/exprs/create_predicate_function.h +++ b/be/src/exprs/create_predicate_function.h @@ -34,7 +34,9 @@ class MinmaxFunctionTraits { using BasePtr = MinMaxFuncBase*; template static BasePtr get_function() { - return new MinMaxNumFunc::CppType>(); + using CppType = typename PrimitiveTypeTraits::CppType; + return new MinMaxNumFunc< + std::conditional_t, std::string, CppType>>(); } }; diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index 205ee5a5d20b92..5e3fb136929595 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -353,4 +353,9 @@ void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value } } +// root path "$." 
+bool JsonFunctions::is_root_path(const std::vector& json_path) { + return json_path.size() == 2 && json_path[0].key == "$" && json_path[1].key.empty(); +} + } // namespace doris diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h index 72aa522ff374fa..11970eb8c46c56 100644 --- a/be/src/exprs/json_functions.h +++ b/be/src/exprs/json_functions.h @@ -116,6 +116,8 @@ class JsonFunctions { static std::string print_json_value(const rapidjson::Value& value); + static bool is_root_path(const std::vector& json_path); + private: static rapidjson::Value* match_value(const std::vector& parsed_paths, rapidjson::Value* document, diff --git a/be/src/exprs/minmax_predicate.h b/be/src/exprs/minmax_predicate.h index b4291e2edb7e6b..377b33696c82b9 100644 --- a/be/src/exprs/minmax_predicate.h +++ b/be/src/exprs/minmax_predicate.h @@ -26,6 +26,7 @@ #include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" #include "vec/common/assert_cast.h" +#include "vec/common/string_ref.h" namespace doris { // only used in Runtime Filter @@ -75,19 +76,22 @@ class MinMaxNumFunc : public MinMaxFuncBase { for (size_t i = start; i < size; i++) { if (nullmap == nullptr || !nullmap[i]) { if constexpr (NeedMin) { - _min = std::min(_min, column_string.get_data_at(i)); + if (column_string.get_data_at(i) < StringRef(_min)) { + _min = column_string.get_data_at(i).to_string(); + } } if constexpr (NeedMax) { - _max = std::max(_max, column_string.get_data_at(i)); + if (column_string.get_data_at(i) > StringRef(_max)) { + _max = column_string.get_data_at(i).to_string(); + } } } } - store_string_ref(); } void update_batch(const vectorized::ColumnPtr& column, size_t start) { const auto size = column->size(); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { if (column->is_column_string64()) { _update_batch_string(assert_cast(*column), nullptr, start, size); @@ -111,7 +115,7 @@ class MinMaxNumFunc : public MinMaxFuncBase { void update_batch(const 
vectorized::ColumnPtr& column, const vectorized::NullMap& nullmap, size_t start) { const auto size = column->size(); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { if (column->is_column_string64()) { _update_batch_string(assert_cast(*column), nullmap.data(), start, size); @@ -135,26 +139,15 @@ class MinMaxNumFunc : public MinMaxFuncBase { } Status merge(MinMaxFuncBase* minmax_func) override { - if constexpr (std::is_same_v) { - auto* other_minmax = static_cast*>(minmax_func); - if constexpr (NeedMin) { - _min = std::min(_min, other_minmax->_min); - } - if constexpr (NeedMax) { - _max = std::max(_max, other_minmax->_max); - } - store_string_ref(); - } else { - auto* other_minmax = static_cast*>(minmax_func); - if constexpr (NeedMin) { - if (other_minmax->_min < _min) { - _min = other_minmax->_min; - } + auto* other_minmax = static_cast*>(minmax_func); + if constexpr (NeedMin) { + if (other_minmax->_min < _min) { + _min = other_minmax->_min; } - if constexpr (NeedMax) { - if (other_minmax->_max > _max) { - _max = other_minmax->_max; - } + } + if constexpr (NeedMax) { + if (other_minmax->_max > _max) { + _max = other_minmax->_max; } } @@ -172,28 +165,9 @@ class MinMaxNumFunc : public MinMaxFuncBase { return Status::OK(); } - void store_string_ref() { - if constexpr (std::is_same_v) { - if constexpr (NeedMin) { - if (_min.data != _stored_min.data()) { - _stored_min = _min.to_string(); - _min = StringRef(_stored_min); - } - } - if constexpr (NeedMax) { - if (_max.data != _stored_max.data()) { - _stored_max = _max.to_string(); - _max = StringRef(_stored_max); - } - } - } - } - protected: T _max = type_limit::min(); T _min = type_limit::max(); - std::string _stored_min; - std::string _stored_max; }; template diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index e69ff714d32978..d7fc2ff7490761 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -35,7 +35,6 @@ #include 
"agent/be_exec_version_manager.h" #include "common/logging.h" -#include "common/object_pool.h" #include "common/status.h" #include "exprs/bitmapfilter_predicate.h" #include "exprs/bloom_filter_func.h" @@ -281,15 +280,13 @@ Status create_vbin_predicate(const TypeDescriptor& type, TExprOpcode::type opcod // This class is a wrapper of runtime predicate function class RuntimePredicateWrapper { public: - RuntimePredicateWrapper(ObjectPool* pool, const RuntimeFilterParams* params) - : RuntimePredicateWrapper(pool, params->column_return_type, params->filter_type, + RuntimePredicateWrapper(const RuntimeFilterParams* params) + : RuntimePredicateWrapper(params->column_return_type, params->filter_type, params->filter_id) {}; // for a 'tmp' runtime predicate wrapper // only could called assign method or as a param for merge - RuntimePredicateWrapper(ObjectPool* pool, PrimitiveType column_type, RuntimeFilterType type, - uint32_t filter_id) - : _pool(pool), - _column_return_type(column_type), + RuntimePredicateWrapper(PrimitiveType column_type, RuntimeFilterType type, uint32_t filter_id) + : _column_return_type(column_type), _filter_type(type), _context(new RuntimeFilterContext()), _filter_id(filter_id) {} @@ -566,51 +563,45 @@ class RuntimePredicateWrapper { switch (type) { case TYPE_BOOLEAN: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { bool bool_val = column.boolval(); set->insert(&bool_val); }); break; } case TYPE_TINYINT: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { - int8_t int_val = static_cast(column.intval()); + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { + auto int_val = static_cast(column.intval()); set->insert(&int_val); }); break; } case TYPE_SMALLINT: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { - int16_t 
int_val = static_cast(column.intval()); + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { + auto int_val = static_cast(column.intval()); set->insert(&int_val); }); break; } case TYPE_INT: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { int32_t int_val = column.intval(); set->insert(&int_val); }); break; } case TYPE_BIGINT: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { int64_t long_val = column.longval(); set->insert(&long_val); }); break; } case TYPE_LARGEINT: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { auto string_val = column.stringval(); StringParser::ParseResult result; - int128_t int128_val = StringParser::string_to_int( + auto int128_val = StringParser::string_to_int( string_val.c_str(), string_val.length(), &result); DCHECK(result == StringParser::PARSE_SUCCESS); set->insert(&int128_val); @@ -618,32 +609,28 @@ class RuntimePredicateWrapper { break; } case TYPE_FLOAT: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { - float float_val = static_cast(column.doubleval()); + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { + auto float_val = static_cast(column.doubleval()); set->insert(&float_val); }); break; } case TYPE_DOUBLE: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { double double_val = column.doubleval(); set->insert(&double_val); }); break; } case TYPE_DATEV2: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + 
batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { auto date_v2_val = column.intval(); set->insert(&date_v2_val); }); break; } case TYPE_DATETIMEV2: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { auto date_v2_val = column.longval(); set->insert(&date_v2_val); }); @@ -651,9 +638,8 @@ class RuntimePredicateWrapper { } case TYPE_DATETIME: case TYPE_DATE: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { - auto& string_val_ref = column.stringval(); + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { + const auto& string_val_ref = column.stringval(); VecDateTimeValue datetime_val; datetime_val.from_date_str(string_val_ref.c_str(), string_val_ref.length()); set->insert(&datetime_val); @@ -661,36 +647,32 @@ class RuntimePredicateWrapper { break; } case TYPE_DECIMALV2: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { - auto& string_val_ref = column.stringval(); + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { + const auto& string_val_ref = column.stringval(); DecimalV2Value decimal_val(string_val_ref); set->insert(&decimal_val); }); break; } case TYPE_DECIMAL32: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { int32_t decimal_32_val = column.intval(); set->insert(&decimal_32_val); }); break; } case TYPE_DECIMAL64: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { int64_t decimal_64_val = column.longval(); set->insert(&decimal_64_val); }); break; } case TYPE_DECIMAL128I: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - 
ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { auto string_val = column.stringval(); StringParser::ParseResult result; - int128_t int128_val = StringParser::string_to_int( + auto int128_val = StringParser::string_to_int( string_val.c_str(), string_val.length(), &result); DCHECK(result == StringParser::PARSE_SUCCESS); set->insert(&int128_val); @@ -698,8 +680,7 @@ class RuntimePredicateWrapper { break; } case TYPE_DECIMAL256: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { auto string_val = column.stringval(); StringParser::ParseResult result; auto int_val = StringParser::string_to_int( @@ -712,12 +693,9 @@ class RuntimePredicateWrapper { case TYPE_VARCHAR: case TYPE_CHAR: case TYPE_STRING: { - batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column, - ObjectPool* pool) { - auto& string_val_ref = column.stringval(); - auto val_ptr = pool->add(new std::string(string_val_ref)); - StringRef string_val(val_ptr->c_str(), val_ptr->length()); - set->insert(&string_val); + batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { + const auto& string_val_ref = column.stringval(); + set->insert(&string_val_ref); }); break; } @@ -761,13 +739,13 @@ class RuntimePredicateWrapper { return _context->minmax_func->assign(&min_val, &max_val); } case TYPE_TINYINT: { - int8_t min_val = static_cast(minmax_filter->min_val().intval()); - int8_t max_val = static_cast(minmax_filter->max_val().intval()); + auto min_val = static_cast(minmax_filter->min_val().intval()); + auto max_val = static_cast(minmax_filter->max_val().intval()); return _context->minmax_func->assign(&min_val, &max_val); } case TYPE_SMALLINT: { - int16_t min_val = static_cast(minmax_filter->min_val().intval()); - int16_t max_val = static_cast(minmax_filter->max_val().intval()); + auto min_val = 
static_cast(minmax_filter->min_val().intval()); + auto max_val = static_cast(minmax_filter->max_val().intval()); return _context->minmax_func->assign(&min_val, &max_val); } case TYPE_INT: { @@ -784,22 +762,22 @@ class RuntimePredicateWrapper { auto min_string_val = minmax_filter->min_val().stringval(); auto max_string_val = minmax_filter->max_val().stringval(); StringParser::ParseResult result; - int128_t min_val = StringParser::string_to_int( - min_string_val.c_str(), min_string_val.length(), &result); + auto min_val = StringParser::string_to_int(min_string_val.c_str(), + min_string_val.length(), &result); DCHECK(result == StringParser::PARSE_SUCCESS); - int128_t max_val = StringParser::string_to_int( - max_string_val.c_str(), max_string_val.length(), &result); + auto max_val = StringParser::string_to_int(max_string_val.c_str(), + max_string_val.length(), &result); DCHECK(result == StringParser::PARSE_SUCCESS); return _context->minmax_func->assign(&min_val, &max_val); } case TYPE_FLOAT: { - float min_val = static_cast(minmax_filter->min_val().doubleval()); - float max_val = static_cast(minmax_filter->max_val().doubleval()); + auto min_val = static_cast(minmax_filter->min_val().doubleval()); + auto max_val = static_cast(minmax_filter->max_val().doubleval()); return _context->minmax_func->assign(&min_val, &max_val); } case TYPE_DOUBLE: { - double min_val = static_cast(minmax_filter->min_val().doubleval()); - double max_val = static_cast(minmax_filter->max_val().doubleval()); + auto min_val = static_cast(minmax_filter->min_val().doubleval()); + auto max_val = static_cast(minmax_filter->max_val().doubleval()); return _context->minmax_func->assign(&min_val, &max_val); } case TYPE_DATEV2: { @@ -814,8 +792,8 @@ class RuntimePredicateWrapper { } case TYPE_DATETIME: case TYPE_DATE: { - auto& min_val_ref = minmax_filter->min_val().stringval(); - auto& max_val_ref = minmax_filter->max_val().stringval(); + const auto& min_val_ref = minmax_filter->min_val().stringval(); + 
const auto& max_val_ref = minmax_filter->max_val().stringval(); VecDateTimeValue min_val; VecDateTimeValue max_val; min_val.from_date_str(min_val_ref.c_str(), min_val_ref.length()); @@ -823,8 +801,8 @@ class RuntimePredicateWrapper { return _context->minmax_func->assign(&min_val, &max_val); } case TYPE_DECIMALV2: { - auto& min_val_ref = minmax_filter->min_val().stringval(); - auto& max_val_ref = minmax_filter->max_val().stringval(); + const auto& min_val_ref = minmax_filter->min_val().stringval(); + const auto& max_val_ref = minmax_filter->max_val().stringval(); DecimalV2Value min_val(min_val_ref); DecimalV2Value max_val(max_val_ref); return _context->minmax_func->assign(&min_val, &max_val); @@ -843,11 +821,11 @@ class RuntimePredicateWrapper { auto min_string_val = minmax_filter->min_val().stringval(); auto max_string_val = minmax_filter->max_val().stringval(); StringParser::ParseResult result; - int128_t min_val = StringParser::string_to_int( - min_string_val.c_str(), min_string_val.length(), &result); + auto min_val = StringParser::string_to_int(min_string_val.c_str(), + min_string_val.length(), &result); DCHECK(result == StringParser::PARSE_SUCCESS); - int128_t max_val = StringParser::string_to_int( - max_string_val.c_str(), max_string_val.length(), &result); + auto max_val = StringParser::string_to_int(max_string_val.c_str(), + max_string_val.length(), &result); DCHECK(result == StringParser::PARSE_SUCCESS); return _context->minmax_func->assign(&min_val, &max_val); } @@ -866,13 +844,9 @@ class RuntimePredicateWrapper { case TYPE_VARCHAR: case TYPE_CHAR: case TYPE_STRING: { - auto& min_val_ref = minmax_filter->min_val().stringval(); - auto& max_val_ref = minmax_filter->max_val().stringval(); - auto min_val_ptr = _pool->add(new std::string(min_val_ref)); - auto max_val_ptr = _pool->add(new std::string(max_val_ref)); - StringRef min_val(min_val_ptr->c_str(), min_val_ptr->length()); - StringRef max_val(max_val_ptr->c_str(), max_val_ptr->length()); - return 
_context->minmax_func->assign(&min_val, &max_val); + auto min_val_ref = minmax_filter->min_val().stringval(); + auto max_val_ref = minmax_filter->max_val().stringval(); + return _context->minmax_func->assign(&min_val_ref, &max_val_ref); } default: break; @@ -915,10 +889,10 @@ class RuntimePredicateWrapper { void batch_assign(const PInFilter* filter, void (*assign_func)(std::shared_ptr& _hybrid_set, - PColumnValue&, ObjectPool*)) { + PColumnValue&)) { for (int i = 0; i < filter->values_size(); ++i) { PColumnValue column = filter->values(i); - assign_func(_context->hybrid_set, column, _pool); + assign_func(_context->hybrid_set, column); } } @@ -945,27 +919,25 @@ class RuntimePredicateWrapper { } private: - ObjectPool* _pool; - // When a runtime filter received from remote and it is a bloom filter, _column_return_type will be invalid. PrimitiveType _column_return_type; // column type RuntimeFilterType _filter_type; int32_t _max_in_num = -1; - SharedRuntimeFilterContext _context; + RuntimeFilterContextSPtr _context; uint32_t _filter_id; }; -Status IRuntimeFilter::create(RuntimeFilterParamsContext* state, ObjectPool* pool, - const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, - const RuntimeFilterRole role, int node_id, IRuntimeFilter** res, +Status IRuntimeFilter::create(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, + const TQueryOptions* query_options, const RuntimeFilterRole role, + int node_id, std::shared_ptr* res, bool build_bf_exactly, bool need_local_merge) { - *res = pool->add(new IRuntimeFilter(state, pool, desc, need_local_merge)); + *res = std::make_shared(state, desc, need_local_merge); (*res)->set_role(role); return (*res)->init_with_desc(desc, query_options, node_id, build_bf_exactly); } -SharedRuntimeFilterContext& IRuntimeFilter::get_shared_context_ref() { +RuntimeFilterContextSPtr& IRuntimeFilter::get_shared_context_ref() { return _wrapper->_context; } @@ -983,12 +955,12 @@ Status IRuntimeFilter::publish(bool 
publish_local) { RETURN_IF_ERROR(_state->runtime_filter_mgr->get_merge_addr(&addr)); return filter->push_to_remote(&addr); }; - auto send_to_local = [&](RuntimePredicateWrapper* wrapper) { - std::vector filters; + auto send_to_local = [&](std::shared_ptr wrapper) { + std::vector> filters; RETURN_IF_ERROR(_state->runtime_filter_mgr->get_consume_filters(_filter_id, filters)); DCHECK(!filters.empty()); // push down - for (auto* filter : filters) { + for (auto filter : filters) { filter->_wrapper = wrapper; filter->update_runtime_filter_type_to_profile(); filter->signal(); @@ -1000,13 +972,13 @@ Status IRuntimeFilter::publish(bool publish_local) { RETURN_IF_ERROR(_state->runtime_filter_mgr->get_local_merge_producer_filters( _filter_id, &local_merge_filters)); std::lock_guard l(*local_merge_filters->lock); - RETURN_IF_ERROR(local_merge_filters->filters[0]->merge_from(_wrapper)); + RETURN_IF_ERROR(local_merge_filters->filters[0]->merge_from(_wrapper.get())); local_merge_filters->merge_time--; if (local_merge_filters->merge_time == 0) { if (_has_local_target) { RETURN_IF_ERROR(send_to_local(local_merge_filters->filters[0]->_wrapper)); } else { - RETURN_IF_ERROR(send_to_remote(local_merge_filters->filters[0])); + RETURN_IF_ERROR(send_to_remote(local_merge_filters->filters[0].get())); } } return Status::OK(); @@ -1033,14 +1005,18 @@ Status IRuntimeFilter::publish(bool publish_local) { class SyncSizeClosure : public AutoReleaseClosure> { std::shared_ptr _dependency; - IRuntimeFilter* _filter; + // Should use weak ptr here, because when query context deconstructs, should also delete runtime filter + // context, it not the memory is not released. And rpc is in another thread, it will hold rf context + // after query context because the rpc is not returned. 
+ std::weak_ptr _rf_context; + std::string _rf_debug_info; using Base = AutoReleaseClosure>; ENABLE_FACTORY_CREATOR(SyncSizeClosure); void _process_if_rpc_failed() override { ((pipeline::CountedFinishDependency*)_dependency.get())->sub(); - LOG(WARNING) << "sync filter size meet rpc error, filter=" << _filter->debug_string(); + LOG(WARNING) << "sync filter size meet rpc error, filter=" << _rf_debug_info; Base::_process_if_rpc_failed(); } @@ -1048,10 +1024,15 @@ class SyncSizeClosure : public AutoReleaseClosuresub(); if (status.is()) { // rf merger backend may finished before rf's send_filter_size, we just ignore filter in this case. - _filter->set_ignored(); + auto ctx = _rf_context.lock(); + if (ctx) { + ctx->ignored = true; + } else { + LOG(WARNING) << "sync filter size returned but context is released, filter=" + << _rf_debug_info; + } } else { - LOG(WARNING) << "sync filter size meet error status, filter=" - << _filter->debug_string(); + LOG(WARNING) << "sync filter size meet error status, filter=" << _rf_debug_info; Base::_process_if_meet_error_status(status); } } @@ -1059,8 +1040,12 @@ class SyncSizeClosure : public AutoReleaseClosure req, std::shared_ptr> callback, - std::shared_ptr dependency, IRuntimeFilter* filter) - : Base(req, callback), _dependency(std::move(dependency)), _filter(filter) {} + std::shared_ptr dependency, + RuntimeFilterContextSPtr rf_context, std::string_view rf_debug_info) + : Base(req, callback), + _dependency(std::move(dependency)), + _rf_context(rf_context), + _rf_debug_info(rf_debug_info) {} }; Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filter_size) { @@ -1077,7 +1062,7 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt return Status::OK(); } else { if (_has_local_target) { - for (auto* filter : local_merge_filters->filters) { + for (auto filter : local_merge_filters->filters) { filter->set_synced_size(local_merge_filters->local_merged_size); } return Status::OK(); 
@@ -1103,7 +1088,10 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt auto request = std::make_shared(); auto callback = DummyBrpcCallback::create_shared(); - auto closure = SyncSizeClosure::create_unique(request, callback, _dependency, this); + // IRuntimeFilter maybe deconstructed before the rpc finished, so that could not use + // a raw pointer in closure. Has to use the context's shared ptr. + auto closure = SyncSizeClosure::create_unique(request, callback, _dependency, + _wrapper->_context, this->debug_string()); auto* pquery_id = request->mutable_query_id(); pquery_id->set_hi(_state->query_id.hi()); pquery_id->set_lo(_state->query_id.lo()); @@ -1193,33 +1181,6 @@ Status IRuntimeFilter::get_push_expr_ctxs(std::listexecution_timeout * 1000; - auto runtime_filter_wait_time_ms = _state->runtime_filter_wait_time_ms; - // bitmap filter is precise filter and only filter once, so it must be applied. - int64_t wait_times_ms = _wrapper->get_real_type() == RuntimeFilterType::BITMAP_FILTER - ? execution_timeout - : runtime_filter_wait_time_ms; - auto expected = _rf_state_atomic.load(std::memory_order_acquire); - if (expected == RuntimeFilterState::NOT_READY) { - if (!_rf_state_atomic.compare_exchange_strong( - expected, - MonotonicMillis() - registration_time_ >= wait_times_ms - ? 
RuntimeFilterState::TIME_OUT - : RuntimeFilterState::NOT_READY, - std::memory_order_acq_rel)) { - DCHECK(expected == RuntimeFilterState::READY || - expected == RuntimeFilterState::TIME_OUT); - return (expected == RuntimeFilterState::READY); - } - return false; - } else if (expected == RuntimeFilterState::TIME_OUT) { - return false; - } - return true; -} - void IRuntimeFilter::update_state() { DCHECK(is_consumer()); auto execution_timeout = _state->execution_timeout * 1000; @@ -1325,6 +1286,9 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue params.filter_type = _runtime_filter_type; params.column_return_type = build_ctx->root()->type().type; params.max_in_num = options->runtime_filter_max_in_num; + params.runtime_bloom_filter_min_size = options->__isset.runtime_bloom_filter_min_size + ? options->runtime_bloom_filter_min_size + : 0; // We build runtime filter by exact distinct count iff three conditions are met: // 1. Only 1 join key // 2. Do not have remote target (e.g. 
do not need to merge), or broadcast join @@ -1372,7 +1336,7 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue _probe_expr = iter->second; } - _wrapper = _pool->add(new RuntimePredicateWrapper(_pool, ¶ms)); + _wrapper = std::make_shared(¶ms); return _wrapper->init(¶ms); } @@ -1388,22 +1352,22 @@ Status IRuntimeFilter::serialize(PPublishFilterRequestV2* request, void** data, return serialize_impl(request, data, len); } -Status IRuntimeFilter::create_wrapper(const MergeRuntimeFilterParams* param, ObjectPool* pool, +Status IRuntimeFilter::create_wrapper(const MergeRuntimeFilterParams* param, std::unique_ptr* wrapper) { - return _create_wrapper(param, pool, wrapper); + return _create_wrapper(param, wrapper); } -Status IRuntimeFilter::create_wrapper(const UpdateRuntimeFilterParams* param, ObjectPool* pool, +Status IRuntimeFilter::create_wrapper(const UpdateRuntimeFilterParams* param, std::unique_ptr* wrapper) { - return _create_wrapper(param, pool, wrapper); + return _create_wrapper(param, wrapper); } Status IRuntimeFilter::create_wrapper(const UpdateRuntimeFilterParamsV2* param, - RuntimePredicateWrapper** wrapper) { + std::shared_ptr* wrapper) { auto filter_type = param->request->filter_type(); PrimitiveType column_type = param->column_type; - *wrapper = param->pool->add(new RuntimePredicateWrapper( - param->pool, column_type, get_type(filter_type), param->request->filter_id())); + *wrapper = std::make_shared(column_type, get_type(filter_type), + param->request->filter_id()); if (param->request->has_ignored() && param->request->ignored()) { (*wrapper)->set_ignored(); @@ -1441,7 +1405,7 @@ Status IRuntimeFilter::init_bloom_filter(const size_t build_bf_cardinality) { } template -Status IRuntimeFilter::_create_wrapper(const T* param, ObjectPool* pool, +Status IRuntimeFilter::_create_wrapper(const T* param, std::unique_ptr* wrapper) { int filter_type = param->request->filter_type(); PrimitiveType column_type = PrimitiveType::INVALID_TYPE; 
@@ -1451,7 +1415,7 @@ Status IRuntimeFilter::_create_wrapper(const T* param, ObjectPool* pool, if (param->request->has_column_type()) { column_type = to_primitive_type(param->request->column_type()); } - *wrapper = std::make_unique(pool, column_type, get_type(filter_type), + *wrapper = std::make_unique(column_type, get_type(filter_type), param->request->filter_id()); if (param->request->has_ignored() && param->request->ignored()) { @@ -1552,7 +1516,7 @@ void IRuntimeFilter::to_protobuf(PInFilter* filter) { auto column_type = _wrapper->column_type(); filter->set_column_type(to_proto(column_type)); - auto it = _wrapper->get_in_filter_iterator(); + auto* it = _wrapper->get_in_filter_iterator(); DCHECK(it != nullptr); switch (column_type) { @@ -1663,8 +1627,8 @@ void IRuntimeFilter::to_protobuf(PInFilter* filter) { case TYPE_CHAR: case TYPE_VARCHAR: case TYPE_STRING: { - batch_copy(filter, it, [](PColumnValue* column, const StringRef* value) { - column->set_stringval(std::string(value->data, value->size)); + batch_copy(filter, it, [](PColumnValue* column, const std::string* value) { + column->set_stringval(*value); }); return; } @@ -1684,8 +1648,8 @@ void IRuntimeFilter::to_protobuf(PMinMaxFilter* filter) { switch (_wrapper->column_type()) { case TYPE_BOOLEAN: { - filter->mutable_min_val()->set_boolval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_boolval(*reinterpret_cast(max_data)); + filter->mutable_min_val()->set_boolval(*reinterpret_cast(min_data)); + filter->mutable_max_val()->set_boolval(*reinterpret_cast(max_data)); return; } case TYPE_TINYINT: { @@ -1778,12 +1742,10 @@ void IRuntimeFilter::to_protobuf(PMinMaxFilter* filter) { case TYPE_CHAR: case TYPE_VARCHAR: case TYPE_STRING: { - const StringRef* min_string_value = reinterpret_cast(min_data); - filter->mutable_min_val()->set_stringval( - std::string(min_string_value->data, min_string_value->size)); - const StringRef* max_string_value = reinterpret_cast(max_data); - 
filter->mutable_max_val()->set_stringval( - std::string(max_string_value->data, max_string_value->size)); + const auto* min_string_value = reinterpret_cast(min_data); + filter->mutable_min_val()->set_stringval(*min_string_value); + const auto* max_string_value = reinterpret_cast(max_data); + filter->mutable_max_val()->set_stringval(*max_string_value); break; } default: { @@ -1810,7 +1772,7 @@ Status IRuntimeFilter::update_filter(const UpdateRuntimeFilterParams* param) { set_ignored(); } else { std::unique_ptr wrapper; - RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(param, _pool, &wrapper)); + RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(param, &wrapper)); RETURN_IF_ERROR(_wrapper->merge(wrapper.get())); update_runtime_filter_type_to_profile(); } @@ -1819,8 +1781,8 @@ Status IRuntimeFilter::update_filter(const UpdateRuntimeFilterParams* param) { return Status::OK(); } -void IRuntimeFilter::update_filter(RuntimePredicateWrapper* wrapper, int64_t merge_time, - int64_t start_apply) { +void IRuntimeFilter::update_filter(std::shared_ptr wrapper, + int64_t merge_time, int64_t start_apply) { _profile->add_info_string("UpdateTime", std::to_string(MonotonicMillis() - start_apply) + " ms"); _profile->add_info_string("MergeTime", std::to_string(merge_time) + " ms"); diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 390a61bfe1a628..f199e173e84cd4 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -48,7 +48,6 @@ class IOBufAsZeroCopyInputStream; } namespace doris { -class ObjectPool; class RuntimePredicateWrapper; class PPublishFilterRequest; class PPublishFilterRequestV2; @@ -65,7 +64,7 @@ class TQueryOptions; namespace vectorized { class VExpr; class VExprContext; -struct SharedRuntimeFilterContext; +struct RuntimeFilterContextSPtr; } // namespace vectorized namespace pipeline { @@ -128,6 +127,7 @@ struct RuntimeFilterParams { // used in bloom filter int64_t bloom_filter_size; int32_t max_in_num; + int64_t 
runtime_bloom_filter_min_size; int32_t filter_id; bool bitmap_filter_not_in; bool build_bf_exactly; @@ -157,17 +157,15 @@ struct RuntimeFilterFuncBase { struct UpdateRuntimeFilterParams { UpdateRuntimeFilterParams(const PPublishFilterRequest* req, - butil::IOBufAsZeroCopyInputStream* data_stream, ObjectPool* obj_pool) - : request(req), data(data_stream), pool(obj_pool) {} + butil::IOBufAsZeroCopyInputStream* data_stream) + : request(req), data(data_stream) {} const PPublishFilterRequest* request = nullptr; butil::IOBufAsZeroCopyInputStream* data = nullptr; - ObjectPool* pool = nullptr; }; struct UpdateRuntimeFilterParamsV2 { const PPublishFilterRequestV2* request; butil::IOBufAsZeroCopyInputStream* data; - ObjectPool* pool = nullptr; PrimitiveType column_type = INVALID_TYPE; }; @@ -192,10 +190,9 @@ enum RuntimeFilterState { /// that can be pushed down to node based on the results of the right table. class IRuntimeFilter { public: - IRuntimeFilter(RuntimeFilterParamsContext* state, ObjectPool* pool, - const TRuntimeFilterDesc* desc, bool need_local_merge = false) + IRuntimeFilter(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, + bool need_local_merge = false) : _state(state), - _pool(pool), _filter_id(desc->filter_id), _is_broadcast_join(true), _has_remote_target(false), @@ -215,12 +212,12 @@ class IRuntimeFilter { ~IRuntimeFilter() = default; - static Status create(RuntimeFilterParamsContext* state, ObjectPool* pool, - const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, - const RuntimeFilterRole role, int node_id, IRuntimeFilter** res, + static Status create(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, + const TQueryOptions* query_options, const RuntimeFilterRole role, + int node_id, std::shared_ptr* res, bool build_bf_exactly = false, bool need_local_merge = false); - SharedRuntimeFilterContext& get_shared_context_ref(); + RuntimeFilterContextSPtr& get_shared_context_ref(); // insert data to build filter 
void insert_batch(vectorized::ColumnPtr column, size_t start); @@ -257,12 +254,6 @@ class IRuntimeFilter { void set_role(const RuntimeFilterRole role) { _role = role; } int expr_order() const { return _expr_order; } - // only used for consumer - // if filter is not ready for filter data scan_node - // will wait util it ready or timeout - // This function will wait at most config::runtime_filter_shuffle_wait_time_ms - // if return true , filter is ready to use - bool await(); void update_state(); // this function will be called if a runtime filter sent by rpc // it will notify all wait threads @@ -281,17 +272,17 @@ class IRuntimeFilter { Status merge_from(const RuntimePredicateWrapper* wrapper); - static Status create_wrapper(const MergeRuntimeFilterParams* param, ObjectPool* pool, + static Status create_wrapper(const MergeRuntimeFilterParams* param, std::unique_ptr* wrapper); - static Status create_wrapper(const UpdateRuntimeFilterParams* param, ObjectPool* pool, + static Status create_wrapper(const UpdateRuntimeFilterParams* param, std::unique_ptr* wrapper); static Status create_wrapper(const UpdateRuntimeFilterParamsV2* param, - RuntimePredicateWrapper** wrapper); + std::shared_ptr* wrapper); Status change_to_bloom_filter(); Status init_bloom_filter(const size_t build_bf_cardinality); Status update_filter(const UpdateRuntimeFilterParams* param); - void update_filter(RuntimePredicateWrapper* filter_wrapper, int64_t merge_time, + void update_filter(std::shared_ptr filter_wrapper, int64_t merge_time, int64_t start_apply); void set_ignored(); @@ -381,7 +372,7 @@ class IRuntimeFilter { Status serialize_impl(T* request, void** data, int* len); template - static Status _create_wrapper(const T* param, ObjectPool* pool, + static Status _create_wrapper(const T* param, std::unique_ptr* wrapper); void _set_push_down(bool push_down) { _is_push_down = push_down; } @@ -395,10 +386,8 @@ class IRuntimeFilter { } RuntimeFilterParamsContext* _state = nullptr; - ObjectPool* _pool = 
nullptr; // _wrapper is a runtime filter function wrapper - // _wrapper should alloc from _pool - RuntimePredicateWrapper* _wrapper = nullptr; + std::shared_ptr _wrapper; // runtime filter id int _filter_id; // Specific types BoardCast or Shuffle diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index ebda4b56fcc30e..c0a249cd6b063d 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -34,10 +34,10 @@ class VRuntimeFilterSlots { public: VRuntimeFilterSlots( const std::vector>& build_expr_ctxs, - const std::vector& runtime_filters) + const std::vector>& runtime_filters) : _build_expr_context(build_expr_ctxs), _runtime_filters(runtime_filters) { - for (auto* runtime_filter : _runtime_filters) { - _runtime_filters_map[runtime_filter->expr_order()].push_back(runtime_filter); + for (auto runtime_filter : _runtime_filters) { + _runtime_filters_map[runtime_filter->expr_order()].push_back(runtime_filter.get()); } } @@ -46,14 +46,14 @@ class VRuntimeFilterSlots { if (_runtime_filters.empty()) { return Status::OK(); } - for (auto* runtime_filter : _runtime_filters) { + for (auto runtime_filter : _runtime_filters) { if (runtime_filter->need_sync_filter_size()) { runtime_filter->set_dependency(dependency); } } // send_filter_size may call dependency->sub(), so we call set_dependency firstly for all rf to avoid dependency set_ready repeatedly - for (auto* runtime_filter : _runtime_filters) { + for (auto runtime_filter : _runtime_filters) { if (runtime_filter->need_sync_filter_size()) { RETURN_IF_ERROR(runtime_filter->send_filter_size(state, hash_table_size)); } @@ -70,7 +70,7 @@ class VRuntimeFilterSlots { Status ignore_filters(RuntimeState* state) { // process ignore duplicate IN_FILTER std::unordered_set has_in_filter; - for (auto* filter : _runtime_filters) { + for (auto filter : _runtime_filters) { if (filter->get_ignored()) { continue; } @@ -85,7 +85,7 @@ class VRuntimeFilterSlots { } // process 
ignore filter when it has IN_FILTER on same expr, and init bloom filter size - for (auto* filter : _runtime_filters) { + for (auto filter : _runtime_filters) { if (filter->get_ignored()) { continue; } @@ -100,12 +100,13 @@ class VRuntimeFilterSlots { Status init_filters(RuntimeState* state, uint64_t local_hash_table_size) { // process IN_OR_BLOOM_FILTER's real type - for (auto* filter : _runtime_filters) { + for (auto filter : _runtime_filters) { if (filter->get_ignored()) { continue; } if (filter->type() == RuntimeFilterType::IN_OR_BLOOM_FILTER && - get_real_size(filter, local_hash_table_size) > state->runtime_filter_max_in_num()) { + get_real_size(filter.get(), local_hash_table_size) > + state->runtime_filter_max_in_num()) { RETURN_IF_ERROR(filter->change_to_bloom_filter()); } @@ -114,8 +115,8 @@ class VRuntimeFilterSlots { return Status::InternalError("sync filter size meet error, filter: {}", filter->debug_string()); } - RETURN_IF_ERROR( - filter->init_bloom_filter(get_real_size(filter, local_hash_table_size))); + RETURN_IF_ERROR(filter->init_bloom_filter( + get_real_size(filter.get(), local_hash_table_size))); } } return Status::OK(); @@ -175,7 +176,7 @@ class VRuntimeFilterSlots { private: const std::vector>& _build_expr_context; - std::vector _runtime_filters; + std::vector> _runtime_filters; // prob_contition index -> [IRuntimeFilter] std::map> _runtime_filters_map; }; diff --git a/be/src/exprs/runtime_filter_slots_cross.h b/be/src/exprs/runtime_filter_slots_cross.h index 1d496ddf5571e6..01ae21a75992de 100644 --- a/be/src/exprs/runtime_filter_slots_cross.h +++ b/be/src/exprs/runtime_filter_slots_cross.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "common/status.h" @@ -34,14 +35,14 @@ namespace doris { // this class used in cross join node class VRuntimeFilterSlotsCross { public: - VRuntimeFilterSlotsCross(const std::vector& runtime_filters, - const vectorized::VExprContextSPtrs& src_expr_ctxs) - : _runtime_filters(runtime_filters), 
filter_src_expr_ctxs(src_expr_ctxs) {} + VRuntimeFilterSlotsCross(const std::vector>& runtime_filters, + vectorized::VExprContextSPtrs src_expr_ctxs) + : _runtime_filters(runtime_filters), filter_src_expr_ctxs(std::move(src_expr_ctxs)) {} ~VRuntimeFilterSlotsCross() = default; Status init(RuntimeState* state) { - for (auto* runtime_filter : _runtime_filters) { + for (auto runtime_filter : _runtime_filters) { if (runtime_filter == nullptr) { return Status::InternalError("runtime filter is nullptr"); } @@ -56,7 +57,7 @@ class VRuntimeFilterSlotsCross { Status insert(vectorized::Block* block) { for (int i = 0; i < _runtime_filters.size(); ++i) { - auto* filter = _runtime_filters[i]; + auto filter = _runtime_filters[i]; const auto& vexpr_ctx = filter_src_expr_ctxs[i]; int result_column_id = -1; @@ -72,7 +73,7 @@ class VRuntimeFilterSlotsCross { } Status publish() { - for (auto& filter : _runtime_filters) { + for (auto filter : _runtime_filters) { RETURN_IF_ERROR(filter->publish()); } return Status::OK(); @@ -81,7 +82,7 @@ class VRuntimeFilterSlotsCross { bool empty() const { return _runtime_filters.empty(); } private: - const std::vector& _runtime_filters; + const std::vector>& _runtime_filters; const vectorized::VExprContextSPtrs filter_src_expr_ctxs; }; diff --git a/be/src/http/action/calc_file_crc_action.cpp b/be/src/http/action/calc_file_crc_action.cpp index c713184ddfda9b..66ec96a2a9ac65 100644 --- a/be/src/http/action/calc_file_crc_action.cpp +++ b/be/src/http/action/calc_file_crc_action.cpp @@ -25,6 +25,7 @@ #include #include +#include "cloud/cloud_storage_engine.h" #include "common/logging.h" #include "common/status.h" #include "http/http_channel.h" @@ -38,7 +39,7 @@ namespace doris { using namespace ErrorCode; -CalcFileCrcAction::CalcFileCrcAction(ExecEnv* exec_env, StorageEngine& engine, +CalcFileCrcAction::CalcFileCrcAction(ExecEnv* exec_env, BaseStorageEngine& engine, TPrivilegeHier::type hier, TPrivilegeType::type ptype) : HttpHandlerWithAuth(exec_env, 
hier, ptype), _engine(engine) {} @@ -58,16 +59,28 @@ Status CalcFileCrcAction::_handle_calc_crc(HttpRequest* req, uint32_t* crc_value return Status::InternalError("convert tablet id or failed, {}", e.what()); } - TabletSharedPtr tablet = _engine.tablet_manager()->get_tablet(tablet_id); + BaseTabletSPtr tablet = nullptr; + + if (auto cloudEngine = dynamic_cast(&_engine)) { + tablet = DORIS_TRY(cloudEngine->get_tablet(tablet_id)); + // sync all rowsets + RETURN_IF_ERROR(std::dynamic_pointer_cast(tablet)->sync_rowsets(-1)); + } else if (auto storageEngine = dynamic_cast(&_engine)) { + auto tabletPtr = storageEngine->tablet_manager()->get_tablet(tablet_id); + tablet = std::dynamic_pointer_cast(tabletPtr); + } else { + return Status::InternalError("convert _engine failed"); + } + if (tablet == nullptr) { - return Status::NotFound("Tablet not found. tablet_id={}", tablet_id); + return Status::NotFound("failed to get tablet {}", tablet_id); } const auto& req_start_version = req->param(PARAM_START_VERSION); const auto& req_end_version = req->param(PARAM_END_VERSION); *start_version = 0; - *end_version = tablet->max_version().second; + *end_version = tablet->max_version_unlocked(); if (!req_start_version.empty()) { try { @@ -85,8 +98,8 @@ Status CalcFileCrcAction::_handle_calc_crc(HttpRequest* req, uint32_t* crc_value } } - auto st = tablet->calc_local_file_crc(crc_value, *start_version, *end_version, rowset_count, - file_count); + auto st = tablet->calc_file_crc(crc_value, *start_version, *end_version, rowset_count, + file_count); if (!st.ok()) { return st; } diff --git a/be/src/http/action/calc_file_crc_action.h b/be/src/http/action/calc_file_crc_action.h index 2c0d19f0ca089e..30df8bfe629cf3 100644 --- a/be/src/http/action/calc_file_crc_action.h +++ b/be/src/http/action/calc_file_crc_action.h @@ -26,7 +26,7 @@ namespace doris { class HttpRequest; -class StorageEngine; +class BaseStorageEngine; class ExecEnv; const std::string PARAM_START_VERSION = "start_version"; @@ 
-35,7 +35,7 @@ const std::string PARAM_END_VERSION = "end_version"; // This action is used to calculate the crc value of the files in the tablet. class CalcFileCrcAction : public HttpHandlerWithAuth { public: - CalcFileCrcAction(ExecEnv* exec_env, StorageEngine& engine, TPrivilegeHier::type hier, + CalcFileCrcAction(ExecEnv* exec_env, BaseStorageEngine& engine, TPrivilegeHier::type hier, TPrivilegeType::type ptype); ~CalcFileCrcAction() override = default; @@ -47,7 +47,7 @@ class CalcFileCrcAction : public HttpHandlerWithAuth { int64_t* end_version, int32_t* rowset_count, int64_t* file_count); private: - StorageEngine& _engine; + BaseStorageEngine& _engine; }; } // end namespace doris diff --git a/be/src/http/action/clear_cache_action.cpp b/be/src/http/action/clear_cache_action.cpp index f42499090c42ae..cb183a99cf1502 100644 --- a/be/src/http/action/clear_cache_action.cpp +++ b/be/src/http/action/clear_cache_action.cpp @@ -30,10 +30,37 @@ namespace doris { const static std::string HEADER_JSON = "application/json"; -void ClearDataCacheAction::handle(HttpRequest* req) { +void ClearCacheAction::handle(HttpRequest* req) { req->add_output_header(HttpHeaders::CONTENT_TYPE, "text/plain; version=0.0.4"); - CacheManager::instance()->clear_once(); - HttpChannel::send_reply(req, HttpStatus::OK, ""); + std::string cache_type_str = req->param("type"); + fmt::memory_buffer return_string_buffer; + int64_t freed_size = 0; + if (cache_type_str == "all") { + freed_size = CacheManager::instance()->for_each_cache_prune_all(nullptr, true); + } else { + CachePolicy::CacheType cache_type = CachePolicy::string_to_type(cache_type_str); + if (cache_type == CachePolicy::CacheType::NONE) { + fmt::format_to(return_string_buffer, + "ClearCacheAction not match type:{} of cache policy", cache_type_str); + LOG(WARNING) << fmt::to_string(return_string_buffer); + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, + fmt::to_string(return_string_buffer)); + return; + } + freed_size = 
CacheManager::instance()->cache_prune_all(cache_type, true); + if (freed_size == -1) { + fmt::format_to(return_string_buffer, + "ClearCacheAction cache:{} is not allowed to be pruned", cache_type_str); + LOG(WARNING) << fmt::to_string(return_string_buffer); + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, + fmt::to_string(return_string_buffer)); + return; + } + } + fmt::format_to(return_string_buffer, "ClearCacheAction cache:{} prune win, freed size {}", + cache_type_str, freed_size); + LOG(WARNING) << fmt::to_string(return_string_buffer); + HttpChannel::send_reply(req, HttpStatus::OK, fmt::to_string(return_string_buffer)); } } // end namespace doris diff --git a/be/src/http/action/clear_cache_action.h b/be/src/http/action/clear_cache_action.h index 3840f63593f98f..3795a87b5d76ff 100644 --- a/be/src/http/action/clear_cache_action.h +++ b/be/src/http/action/clear_cache_action.h @@ -23,11 +23,11 @@ namespace doris { class HttpRequest; -class ClearDataCacheAction : public HttpHandler { +class ClearCacheAction : public HttpHandler { public: - ClearDataCacheAction() = default; + ClearCacheAction() = default; - ~ClearDataCacheAction() override = default; + ~ClearCacheAction() override = default; void handle(HttpRequest* req) override; }; diff --git a/be/src/http/action/clear_file_cache_action.cpp b/be/src/http/action/clear_file_cache_action.cpp deleted file mode 100644 index 6a4a2517508824..00000000000000 --- a/be/src/http/action/clear_file_cache_action.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "http/action/clear_file_cache_action.h" - -#include - -#include "common/logging.h" -#include "http/http_channel.h" -#include "http/http_headers.h" -#include "http/http_request.h" -#include "io/cache/block_file_cache_factory.h" - -namespace doris { - -const std::string SYNC = "sync"; - -void ClearFileCacheAction::handle(HttpRequest* req) { - req->add_output_header(HttpHeaders::CONTENT_TYPE, "application/json"); - std::string sync = req->param(SYNC); - auto ret = - io::FileCacheFactory::instance()->clear_file_caches(sync == "TRUE" || sync == "true"); - HttpChannel::send_reply(req, HttpStatus::OK, ret); -} - -} // namespace doris diff --git a/be/src/http/action/clear_file_cache_action.h b/be/src/http/action/clear_file_cache_action.h deleted file mode 100644 index 25ebdd7cb5efab..00000000000000 --- a/be/src/http/action/clear_file_cache_action.h +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "http/http_handler.h" - -namespace doris { -class ExecEnv; -class ClearFileCacheAction : public HttpHandler { -public: - ClearFileCacheAction() = default; - - ~ClearFileCacheAction() override = default; - - void handle(HttpRequest* req) override; -}; -} // namespace doris diff --git a/be/src/http/action/compaction_action.cpp b/be/src/http/action/compaction_action.cpp index 787bc23471b186..c3fb5638c7ca1b 100644 --- a/be/src/http/action/compaction_action.cpp +++ b/be/src/http/action/compaction_action.cpp @@ -202,7 +202,7 @@ Status CompactionAction::_handle_run_status_compaction(HttpRequest* req, std::st if (tablet_id == 0) { // overall compaction status - RETURN_IF_ERROR(_engine.get_compaction_status_json(json_result)); + _engine.get_compaction_status_json(json_result); return Status::OK(); } else { // fetch the tablet by tablet_id diff --git a/be/src/http/action/file_cache_action.cpp b/be/src/http/action/file_cache_action.cpp index cee37f2115d6c7..acad2b3b7bf96c 100644 --- a/be/src/http/action/file_cache_action.cpp +++ b/be/src/http/action/file_cache_action.cpp @@ -33,25 +33,59 @@ namespace doris { -const static std::string HEADER_JSON = "application/json"; -const static std::string OP = "op"; +constexpr static std::string_view HEADER_JSON = "application/json"; +constexpr static std::string_view OP = "op"; +constexpr static std::string_view SYNC = "sync"; +constexpr static std::string_view PATH = "path"; +constexpr static std::string_view CLEAR = "clear"; +constexpr static std::string_view RESET = "reset"; +constexpr 
static std::string_view CAPACITY = "capacity"; +constexpr static std::string_view RELEASE = "release"; +constexpr static std::string_view BASE_PATH = "base_path"; +constexpr static std::string_view RELEASED_ELEMENTS = "released_elements"; Status FileCacheAction::_handle_header(HttpRequest* req, std::string* json_metrics) { - req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str()); - std::string operation = req->param(OP); - if (operation == "release") { + req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.data()); + std::string operation = req->param(OP.data()); + Status st = Status::OK(); + if (operation == RELEASE) { size_t released = 0; - if (req->param("base_path") != "") { - released = io::FileCacheFactory::instance()->try_release(req->param("base_path")); + const std::string& base_path = req->param(BASE_PATH.data()); + if (!base_path.empty()) { + released = io::FileCacheFactory::instance()->try_release(base_path); } else { released = io::FileCacheFactory::instance()->try_release(); } EasyJson json; - json["released_elements"] = released; + json[RELEASED_ELEMENTS.data()] = released; *json_metrics = json.ToString(); - return Status::OK(); + } else if (operation == CLEAR) { + const std::string& sync = req->param(SYNC.data()); + auto ret = io::FileCacheFactory::instance()->clear_file_caches(to_lower(sync) == "true"); + } else if (operation == RESET) { + Status st; + std::string capacity = req->param(CAPACITY.data()); + int64_t new_capacity = 0; + bool parse = true; + try { + new_capacity = std::stoll(capacity); + } catch (...) 
{ + parse = false; + } + if (!parse || new_capacity <= 0) { + st = Status::InvalidArgument( + "The capacity {} failed to be parsed, the capacity needs to be in " + "the interval (0, INT64_MAX]", + capacity); + } else { + const std::string& path = req->param(PATH.data()); + auto ret = io::FileCacheFactory::instance()->reset_capacity(path, new_capacity); + LOG(INFO) << ret; + } + } else { + st = Status::InternalError("invalid operation: {}", operation); } - return Status::InternalError("invalid operation: {}", operation); + return st; } void FileCacheAction::handle(HttpRequest* req) { diff --git a/be/src/http/action/http_stream.cpp b/be/src/http/action/http_stream.cpp index a3439969e60ba2..87cc2f694eb102 100644 --- a/be/src/http/action/http_stream.cpp +++ b/be/src/http/action/http_stream.cpp @@ -30,6 +30,7 @@ #include #include +#include "cloud/cloud_storage_engine.h" #include "cloud/config.h" #include "common/config.h" #include "common/consts.h" @@ -119,7 +120,7 @@ void HttpStreamAction::handle(HttpRequest* req) { // add new line at end str = str + '\n'; HttpChannel::send_reply(req, str); - if (config::enable_stream_load_record && !config::is_cloud_mode()) { + if (config::enable_stream_load_record) { str = ctx->prepare_stream_load_record(str); _save_stream_load_record(ctx, str); } @@ -364,8 +365,9 @@ Status HttpStreamAction::process_put(HttpRequest* http_req, void HttpStreamAction::_save_stream_load_record(std::shared_ptr ctx, const std::string& str) { - auto stream_load_recorder = - ExecEnv::GetInstance()->storage_engine().to_local().get_stream_load_recorder(); + std::shared_ptr stream_load_recorder = + ExecEnv::GetInstance()->storage_engine().get_stream_load_recorder(); + if (stream_load_recorder != nullptr) { std::string key = std::to_string(ctx->start_millis + ctx->load_cost_millis) + "_" + ctx->label; diff --git a/be/src/http/action/reset_rpc_channel_action.cpp b/be/src/http/action/reset_rpc_channel_action.cpp index e1b180a61d420a..b14c0f65e7f4f1 100644 --- 
a/be/src/http/action/reset_rpc_channel_action.cpp +++ b/be/src/http/action/reset_rpc_channel_action.cpp @@ -34,7 +34,7 @@ namespace doris { ResetRPCChannelAction::ResetRPCChannelAction(ExecEnv* exec_env, TPrivilegeHier::type hier, TPrivilegeType::type type) - : HttpHandlerWithAuth(exec_env, hier, type) {} + : HttpHandlerWithAuth(exec_env, hier, type), _exec_env(exec_env) {} void ResetRPCChannelAction::handle(HttpRequest* req) { std::string endpoints = req->param("endpoints"); if (iequal(endpoints, "all")) { diff --git a/be/src/http/action/show_nested_index_file_action.cpp b/be/src/http/action/show_nested_index_file_action.cpp new file mode 100644 index 00000000000000..ba5e0fc699b874 --- /dev/null +++ b/be/src/http/action/show_nested_index_file_action.cpp @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "http/action/show_nested_index_file_action.h" + +#include + +#include +#include + +#include "common/status.h" +#include "http/http_channel.h" +#include "http/http_headers.h" +#include "http/http_request.h" +#include "http/http_status.h" +#include "olap/storage_engine.h" +#include "olap/tablet_manager.h" +#include "util/stopwatch.hpp" + +namespace doris { +using namespace ErrorCode; + +const static std::string HEADER_JSON = "application/json"; + +ShowNestedIndexFileAction::ShowNestedIndexFileAction(ExecEnv* exec_env, TPrivilegeHier::type hier, + TPrivilegeType::type ptype) + : HttpHandlerWithAuth(exec_env, hier, ptype) {} + +// show the nested inverted index file in the tablet +Status ShowNestedIndexFileAction::_handle_show_nested_index_file(HttpRequest* req, + std::string* json_meta) { + req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str()); + std::string req_tablet_id = req->param(TABLET_ID_KEY); + uint64_t tablet_id = 0; + try { + tablet_id = std::stoull(req_tablet_id); + } catch (const std::exception& e) { + LOG(WARNING) << "invalid argument.tablet_id:" << req_tablet_id; + return Status::InternalError("convert failed, {}", e.what()); + } + + auto tablet = DORIS_TRY(ExecEnv::get_tablet(tablet_id)); + RETURN_IF_ERROR(tablet->show_nested_index_file(json_meta)); + return Status::OK(); +} + +void ShowNestedIndexFileAction::handle(HttpRequest* req) { + MonotonicStopWatch timer; + timer.start(); + + std::string json_meta; + Status status = _handle_show_nested_index_file(req, &json_meta); + std::string status_result = status.to_json(); + timer.stop(); + LOG(INFO) << "handle show_nested_index_file request finished, result:" << status_result + << ", use time = " << timer.elapsed_time() / 1000000 << "ms"; + if (status.ok()) { + HttpChannel::send_reply(req, HttpStatus::OK, json_meta); + } else { + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, status_result); + } +} + +} // end namespace doris diff --git 
a/be/src/http/action/show_nested_index_file_action.h b/be/src/http/action/show_nested_index_file_action.h new file mode 100644 index 00000000000000..913eec0aa27a7e --- /dev/null +++ b/be/src/http/action/show_nested_index_file_action.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include + +#include "common/status.h" +#include "http/http_handler_with_auth.h" + +namespace doris { +class HttpRequest; +class BaseStorageEngine; +class ExecEnv; + +// This action is used to show nested inverted index file in tablet +class ShowNestedIndexFileAction : public HttpHandlerWithAuth { +public: + ShowNestedIndexFileAction(ExecEnv* exec_env, TPrivilegeHier::type hier, + TPrivilegeType::type ptype); + + ~ShowNestedIndexFileAction() override = default; + + void handle(HttpRequest* req) override; + +private: + Status _handle_show_nested_index_file(HttpRequest* req, std::string* json_header); +}; + +} // end namespace doris diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index 93fde511898dd3..64becf8d7e3369 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -39,6 +39,7 @@ #include #include +#include "cloud/cloud_storage_engine.h" #include "cloud/config.h" #include "common/config.h" #include "common/consts.h" @@ -145,7 +146,7 @@ Status StreamLoadAction::_handle(std::shared_ptr ctx) { if (ctx->body_bytes > 0 && ctx->receive_bytes != ctx->body_bytes) { LOG(WARNING) << "recevie body don't equal with body bytes, body_bytes=" << ctx->body_bytes << ", receive_bytes=" << ctx->receive_bytes << ", id=" << ctx->id; - return Status::InternalError("receive body don't equal with body bytes"); + return Status::InternalError("receive body don't equal with body bytes"); } // if we use non-streaming, MessageBodyFileSink.finish will close the file @@ -217,7 +218,7 @@ int StreamLoadAction::on_header(HttpRequest* req) { str = str + '\n'; HttpChannel::send_reply(req, str); #ifndef BE_TEST - if (config::enable_stream_load_record && !config::is_cloud_mode()) { + if (config::enable_stream_load_record) { str = ctx->prepare_stream_load_record(str); _save_stream_load_record(ctx, str); } @@ -231,13 +232,13 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, 
std::shared_ptrauth)) { LOG(WARNING) << "parse basic authorization failed." << ctx->brief(); - return Status::InternalError("no valid Basic authorization"); + return Status::InternalError("no valid Basic authorization"); } // get format of this put if (!http_req->header(HTTP_COMPRESS_TYPE).empty() && iequal(http_req->header(HTTP_FORMAT_KEY), "JSON")) { - return Status::InternalError("compress data of JSON format is not supported."); + return Status::InternalError("compress data of JSON format is not supported."); } std::string format_str = http_req->header(HTTP_FORMAT_KEY); if (iequal(format_str, BeConsts::CSV_WITH_NAMES) || @@ -253,8 +254,8 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptrheader(HTTP_COMPRESS_TYPE), &ctx->format, &ctx->compress_type); if (ctx->format == TFileFormatType::FORMAT_UNKNOWN) { - return Status::InternalError("unknown data format, format={}", - http_req->header(HTTP_FORMAT_KEY)); + return Status::InternalError("unknown data format, format={}", + http_req->header(HTTP_FORMAT_KEY)); } // check content length @@ -272,7 +273,7 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptrformat == TFileFormatType::FORMAT_JSON) && (ctx->body_bytes > json_max_body_bytes) && !read_json_by_line) { - return Status::InternalError( + return Status::InternalError( "The size of this batch exceed the max size [{}] of json type data " " data [ {} ]. Split the file, or use 'read_json_by_line'", json_max_body_bytes, ctx->body_bytes); @@ -280,8 +281,8 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptrbody_bytes > csv_max_body_bytes) { LOG(WARNING) << "body exceed max size." 
<< ctx->brief(); - return Status::InternalError("body exceed max size: {}, data: {}", csv_max_body_bytes, - ctx->body_bytes); + return Status::InternalError("body exceed max size: {}, data: {}", + csv_max_body_bytes, ctx->body_bytes); } } else { #ifndef BE_TEST @@ -299,13 +300,14 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptris_chunked_transfer))) { LOG(WARNING) << "content_length is empty and transfer-encoding!=chunked, please set " "content_length or transfer-encoding=chunked"; - return Status::InternalError( + return Status::InternalError( "content_length is empty and transfer-encoding!=chunked, please set content_length " "or transfer-encoding=chunked"); } else if (UNLIKELY(!http_req->header(HttpHeaders::CONTENT_LENGTH).empty() && ctx->is_chunked_transfer)) { LOG(WARNING) << "please do not set both content_length and transfer-encoding"; - return Status::InternalError("please do not set both content_length and transfer-encoding"); + return Status::InternalError( + "please do not set both content_length and transfer-encoding"); } if (!http_req->header(HTTP_TIMEOUT).empty()) { @@ -705,8 +707,9 @@ Status StreamLoadAction::_data_saved_path(HttpRequest* req, std::string* file_pa void StreamLoadAction::_save_stream_load_record(std::shared_ptr ctx, const std::string& str) { - auto stream_load_recorder = - ExecEnv::GetInstance()->storage_engine().to_local().get_stream_load_recorder(); + std::shared_ptr stream_load_recorder = + ExecEnv::GetInstance()->storage_engine().get_stream_load_recorder(); + if (stream_load_recorder != nullptr) { std::string key = std::to_string(ctx->start_millis + ctx->load_cost_millis) + "_" + ctx->label; @@ -725,7 +728,8 @@ Status StreamLoadAction::_handle_group_commit(HttpRequest* req, std::string group_commit_mode = req->header(HTTP_GROUP_COMMIT); if (!group_commit_mode.empty() && !iequal(group_commit_mode, "sync_mode") && !iequal(group_commit_mode, "async_mode") && !iequal(group_commit_mode, "off_mode")) { - 
return Status::InternalError("group_commit can only be [async_mode, sync_mode, off_mode]"); + return Status::InternalError( + "group_commit can only be [async_mode, sync_mode, off_mode]"); } if (config::wait_internal_group_commit_finish) { group_commit_mode = "sync_mode"; @@ -738,7 +742,7 @@ Status StreamLoadAction::_handle_group_commit(HttpRequest* req, ss << "This stream load content length <0 (" << content_length << "), please check your content length."; LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); + return Status::InternalError(ss.str()); } // allow chunked stream load in flink auto is_chunk = !req->header(HttpHeaders::TRANSFER_ENCODING).empty() && @@ -759,7 +763,8 @@ Status StreamLoadAction::_handle_group_commit(HttpRequest* req, auto partitions = !req->header(HTTP_PARTITIONS).empty(); if (!partial_columns && !partitions && !temp_partitions && !ctx->two_phase_commit) { if (!config::wait_internal_group_commit_finish && !ctx->label.empty()) { - return Status::InternalError("label and group_commit can't be set at the same time"); + return Status::InternalError( + "label and group_commit can't be set at the same time"); } ctx->group_commit = true; if (iequal(group_commit_mode, "async_mode")) { diff --git a/be/src/http/action/tablet_migration_action.cpp b/be/src/http/action/tablet_migration_action.cpp index 721f5ca7af70bb..11478563b47176 100644 --- a/be/src/http/action/tablet_migration_action.cpp +++ b/be/src/http/action/tablet_migration_action.cpp @@ -74,7 +74,8 @@ void TabletMigrationAction::handle(HttpRequest* req) { } _migration_tasks[current_task] = "submitted"; } - auto st = _migration_thread_pool->submit_func([&, dest_disk, current_task]() { + auto st = _migration_thread_pool->submit_func([&, tablet, dest_store, + current_task]() { { std::unique_lock lock(_migration_status_mutex); _migration_tasks[current_task] = "running"; diff --git a/be/src/io/CMakeLists.txt b/be/src/io/CMakeLists.txt index 09b5bbd94e9541..02b34f2f0ea861 100644 --- 
a/be/src/io/CMakeLists.txt +++ b/be/src/io/CMakeLists.txt @@ -22,6 +22,9 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/io") set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/io") file(GLOB_RECURSE IO_FILES CONFIGURE_DEPENDS *.cpp) +if(BUILD_AZURE STREQUAL "OFF") + list(REMOVE_ITEM IO_FILES "${CMAKE_CURRENT_SOURCE_DIR}/fs/azure_obj_storage_client.cpp") +endif() list(REMOVE_ITEM IO_FILES "${CMAKE_CURRENT_SOURCE_DIR}/fs/benchmark/fs_benchmark_tool.cpp") add_library(IO STATIC ${IO_FILES}) diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 6c320a21bb3630..d7dfc743a8766e 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -157,7 +157,7 @@ void BlockFileCache::remove_query_context(const TUniqueId& query_id) { std::lock_guard cache_lock(_mutex); const auto& query_iter = _query_map.find(query_id); - if (query_iter != _query_map.end() && query_iter->second.unique()) { + if (query_iter != _query_map.end() && query_iter->second.use_count() <= 1) { _query_map.erase(query_iter); } } @@ -280,6 +280,12 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte DCHECK(!file_blocks.empty()); // change to ttl if the blocks aren't ttl if (context.cache_type == FileCacheType::TTL && _key_to_time.find(hash) == _key_to_time.end()) { + for (auto& [_, cell] : file_blocks) { + Status st = cell.file_block->update_expiration_time(context.expiration_time); + if (!st.ok()) { + LOG_WARNING("Failed to change key meta").error(st); + } + } for (auto& [_, cell] : file_blocks) { FileCacheType origin_type = cell.file_block->cache_type(); if (origin_type == FileCacheType::TTL) continue; @@ -295,9 +301,7 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte } else { cell.queue_iterator.reset(); } - st = cell.file_block->update_expiration_time(context.expiration_time); - } - if (!st.ok()) { + } else { LOG_WARNING("Failed to change key meta").error(st); } } @@ -324,7 
+328,10 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte } if (context.expiration_time == 0) { for (auto& [_, cell] : file_blocks) { - if (cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL)) { + auto cache_type = cell.file_block->cache_type(); + if (cache_type != FileCacheType::TTL) continue; + auto st = cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL); + if (st.ok()) { if (config::enable_ttl_cache_evict_using_lru) { auto& ttl_queue = get_queue(FileCacheType::TTL); ttl_queue.remove(cell.queue_iterator.value(), cache_lock); @@ -333,6 +340,8 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte cell.queue_iterator = queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), cell.file_block->range().size(), cache_lock); + } else { + LOG_WARNING("Failed to change key meta").error(st); } } _key_to_time.erase(iter); @@ -637,10 +646,15 @@ void BlockFileCache::fill_holes_with_empty_file_blocks(FileBlocks& file_blocks, } FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t offset, size_t size, - const CacheContext& context) { + CacheContext& context) { FileBlock::Range range(offset, offset + size - 1); std::lock_guard cache_lock(_mutex); + if (auto iter = _key_to_time.find(hash); + context.cache_type == FileCacheType::INDEX && iter != _key_to_time.end()) { + context.cache_type = FileCacheType::TTL; + context.expiration_time = iter->second; + } /// Get all blocks which intersect with the given range. 
auto file_blocks = get_impl(hash, context, range, cache_lock); @@ -676,10 +690,6 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha << ".\nCurrent cache structure: " << dump_structure_unlocked(hash, cache_lock); auto& offsets = _files[hash]; - DCHECK((context.expiration_time == 0 && context.cache_type != FileCacheType::TTL) || - (context.cache_type == FileCacheType::TTL && context.expiration_time != 0)) - << fmt::format("expiration time {}, cache type {}", context.expiration_time, - context.cache_type); FileCacheKey key; key.hash = hash; @@ -687,11 +697,23 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha key.meta.type = context.cache_type; key.meta.expiration_time = context.expiration_time; FileBlockCell cell(std::make_shared(key, size, this, state), cache_lock); - if (context.cache_type != FileCacheType::TTL || config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(context.cache_type); + Status st; + if (context.expiration_time == 0 && context.cache_type == FileCacheType::TTL) { + st = cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL); + } else if (context.cache_type != FileCacheType::TTL && context.expiration_time != 0) { + st = cell.file_block->change_cache_type_by_mgr(FileCacheType::TTL); + } + if (!st.ok()) { + LOG(WARNING) << "Cannot change cache type. 
expiration_time=" << context.expiration_time + << " cache_type=" << cache_type_to_string(context.cache_type) + << " error=" << st.msg(); + } + if (cell.file_block->cache_type() != FileCacheType::TTL || + config::enable_ttl_cache_evict_using_lru) { + auto& queue = get_queue(cell.file_block->cache_type()); cell.queue_iterator = queue.add(hash, offset, size, cache_lock); } - if (context.cache_type == FileCacheType::TTL) { + if (cell.file_block->cache_type() == FileCacheType::TTL) { if (_key_to_time.find(hash) == _key_to_time.end()) { _key_to_time[hash] = context.expiration_time; _time_to_key.insert(std::make_pair(context.expiration_time, hash)); @@ -1000,19 +1022,18 @@ bool BlockFileCache::remove_if_ttl_file_unlock(const UInt128Wrapper& file_key, b } } for (auto& [_, cell] : _files[file_key]) { - if (cell.file_block->cache_type() == FileCacheType::TTL) { - auto st = cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL); - if (st.ok()) { - if (config::enable_ttl_cache_evict_using_lru) { - ttl_queue.remove(cell.queue_iterator.value(), cache_lock); - } - auto& queue = get_queue(FileCacheType::NORMAL); - cell.queue_iterator = queue.add( - cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - } else { - LOG_WARNING("Failed to change cache type to normal").error(st); + if (cell.file_block->cache_type() == FileCacheType::NORMAL) continue; + auto st = cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL); + if (st.ok()) { + if (config::enable_ttl_cache_evict_using_lru) { + ttl_queue.remove(cell.queue_iterator.value(), cache_lock); } + auto& queue = get_queue(FileCacheType::NORMAL); + cell.queue_iterator = + queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), + cell.file_block->range().size(), cache_lock); + } else { + LOG_WARNING("Failed to change cache type to normal").error(st); } } } else { @@ -1403,11 +1424,72 @@ int disk_used_percentage(const std::string& path, std::pair* 
percent) return 0; } -void BlockFileCache::check_disk_resource_limit(const std::string& path) { +std::string BlockFileCache::reset_capacity(size_t new_capacity) { + using namespace std::chrono; + int64_t space_released = 0; + size_t old_capacity = 0; + std::stringstream ss; + ss << "finish reset_capacity, path=" << _cache_base_path; + auto start_time = steady_clock::time_point(); + { + std::lock_guard cache_lock(_mutex); + if (new_capacity < _capacity && new_capacity < _cur_cache_size) { + int64_t need_remove_size = _cur_cache_size - new_capacity; + auto remove_blocks = [&](LRUQueue& queue) -> int64_t { + int64_t queue_released = 0; + for (const auto& [entry_key, entry_offset, entry_size] : queue) { + if (need_remove_size <= 0) return queue_released; + auto* cell = get_cell(entry_key, entry_offset, cache_lock); + if (!cell->releasable()) continue; + cell->is_deleted = true; + need_remove_size -= entry_size; + space_released += entry_size; + queue_released += entry_size; + } + return queue_released; + }; + int64_t queue_released = remove_blocks(_disposable_queue); + ss << " disposable_queue released " << queue_released; + queue_released = remove_blocks(_normal_queue); + ss << " normal_queue released " << queue_released; + queue_released = remove_blocks(_index_queue); + ss << " index_queue released " << queue_released; + if (need_remove_size >= 0) { + queue_released = 0; + for (auto& [_, key] : _time_to_key) { + for (auto& [_, cell] : _files[key]) { + if (need_remove_size <= 0) break; + cell.is_deleted = true; + need_remove_size -= cell.file_block->range().size(); + space_released += cell.file_block->range().size(); + queue_released += cell.file_block->range().size(); + } + } + ss << " ttl_queue released " << queue_released; + } + _disk_resource_limit_mode = true; + _async_clear_file_cache = true; + ss << " total_space_released=" << space_released; + } + old_capacity = _capacity; + _capacity = new_capacity; + } + auto use_time = 
duration_cast(steady_clock::time_point() - start_time); + LOG(INFO) << "Finish tag deleted block. path=" << _cache_base_path + << " use_time=" << static_cast(use_time.count()); + ss << " old_capacity=" << old_capacity << " new_capacity=" << new_capacity; + LOG(INFO) << ss.str(); + return ss.str(); +} + +void BlockFileCache::check_disk_resource_limit() { + if (_capacity > _cur_cache_size) { + _disk_resource_limit_mode = false; + } std::pair percent; - int ret = disk_used_percentage(path, &percent); + int ret = disk_used_percentage(_cache_base_path, &percent); if (ret != 0) { - LOG_ERROR("").tag("file cache path", path).tag("error", strerror(errno)); + LOG_ERROR("").tag("file cache path", _cache_base_path).tag("error", strerror(errno)); return; } auto [capacity_percentage, inode_percentage] = percent; @@ -1447,7 +1529,7 @@ void BlockFileCache::run_background_operation() { int64_t interval_time_seconds = 20; while (!_close) { TEST_SYNC_POINT_CALLBACK("BlockFileCache::set_sleep_time", &interval_time_seconds); - check_disk_resource_limit(_cache_base_path); + check_disk_resource_limit(); { std::unique_lock close_lock(_close_mtx); _close_cv.wait_for(close_lock, std::chrono::seconds(interval_time_seconds)); @@ -1513,6 +1595,7 @@ void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, for (auto& [_, cell] : _files[hash]) { Status st = cell.file_block->update_expiration_time(new_expiration_time); if (!st.ok()) { + LOG_WARNING("Failed to modify expiration time").error(st); } } @@ -1522,12 +1605,13 @@ void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, if (auto iter = _files.find(hash); iter != _files.end()) { for (auto& [_, cell] : iter->second) { Status st = cell.file_block->update_expiration_time(new_expiration_time); - if (!st.ok() && !st.is()) { + if (!st.ok()) { LOG_WARNING("").error(st); } } for (auto& [_, cell] : iter->second) { FileCacheType origin_type = cell.file_block->cache_type(); + if (origin_type == FileCacheType::TTL) 
continue; auto st = cell.file_block->change_cache_type_by_mgr(FileCacheType::TTL); if (st.ok()) { auto& queue = get_queue(origin_type); diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index e7de154432eaa2..cafb57f9a1ec3c 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -86,7 +86,7 @@ class BlockFileCache { * it is guaranteed that these file blocks are not removed from cache. */ FileBlocksHolder get_or_set(const UInt128Wrapper& hash, size_t offset, size_t size, - const CacheContext& context); + CacheContext& context); /** * Clear all cached data for this cache instance async @@ -95,6 +95,14 @@ class BlockFileCache { */ std::string clear_file_cache_async(); std::string clear_file_cache_directly(); + + /** + * Reset the cache capacity. If the new_capacity is smaller than _capacity, the redundant data will be remove async. + * + * @returns summary message + */ + std::string reset_capacity(size_t new_capacity); + std::map get_blocks_by_key(const UInt128Wrapper& hash); /// For debug. 
std::string dump_structure(const UInt128Wrapper& hash); @@ -358,7 +366,7 @@ class BlockFileCache { size_t get_used_cache_size_unlocked(FileCacheType type, std::lock_guard& cache_lock) const; - void check_disk_resource_limit(const std::string& path); + void check_disk_resource_limit(); size_t get_available_cache_size_unlocked(FileCacheType type, std::lock_guard& cache_lock) const; diff --git a/be/src/io/cache/block_file_cache_downloader.cpp b/be/src/io/cache/block_file_cache_downloader.cpp index 9ab172fedd0b91..02e8f736828cb1 100644 --- a/be/src/io/cache/block_file_cache_downloader.cpp +++ b/be/src/io/cache/block_file_cache_downloader.cpp @@ -191,6 +191,7 @@ void FileCacheBlockDownloader::download_segment_file(const DownloadFileMeta& met FileReaderOptions opts { .cache_type = FileCachePolicy::FILE_BLOCK_CACHE, .is_doris_table = true, + .cache_base_path {}, .file_size = meta.file_size, }; auto st = meta.file_system->open_file(meta.path, &file_reader, &opts); diff --git a/be/src/io/cache/block_file_cache_factory.cpp b/be/src/io/cache/block_file_cache_factory.cpp index a6df98c686dcce..2c15d440be1aa8 100644 --- a/be/src/io/cache/block_file_cache_factory.cpp +++ b/be/src/io/cache/block_file_cache_factory.cpp @@ -83,8 +83,8 @@ Status FileCacheFactory::create_file_cache(const std::string& cache_base_path, size_t disk_capacity = static_cast( static_cast(stat.f_blocks) * static_cast(stat.f_bsize) * (static_cast(config::file_cache_enter_disk_resource_limit_mode_percent) / 100)); - if (disk_capacity < file_cache_settings.capacity) { - LOG_INFO("The cache {} config size {} is larger than {}% disk size {}, recalc it.", + if (file_cache_settings.capacity == 0 || disk_capacity < file_cache_settings.capacity) { + LOG_INFO("The cache {} config size {} is larger than {}% disk size {} or zero, recalc it.", cache_base_path, file_cache_settings.capacity, config::file_cache_enter_disk_resource_limit_mode_percent, disk_capacity); file_cache_settings = @@ -143,5 +143,20 @@ std::vector 
FileCacheFactory::get_base_paths() { return paths; } +std::string FileCacheFactory::reset_capacity(const std::string& path, int64_t new_capacity) { + if (path.empty()) { + std::stringstream ss; + for (auto& [_, cache] : _path_to_cache) { + ss << cache->reset_capacity(new_capacity); + } + return ss.str(); + } else { + if (auto iter = _path_to_cache.find(path); iter != _path_to_cache.end()) { + return iter->second->reset_capacity(new_capacity); + } + } + return "Unknown the cache path " + path; +} + } // namespace io } // namespace doris diff --git a/be/src/io/cache/block_file_cache_factory.h b/be/src/io/cache/block_file_cache_factory.h index 696dae6fdc5582..6365fab31057ac 100644 --- a/be/src/io/cache/block_file_cache_factory.h +++ b/be/src/io/cache/block_file_cache_factory.h @@ -70,6 +70,15 @@ class FileCacheFactory { std::vector get_base_paths(); + /** + * Clears data of all file cache instances + * + * @param path file cache absolute path + * @param new_capacity + * @return summary message + */ + std::string reset_capacity(const std::string& path, int64_t new_capacity); + FileCacheFactory() = default; FileCacheFactory& operator=(const FileCacheFactory&) = delete; FileCacheFactory(const FileCacheFactory&) = delete; diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index d4b4157388aa26..0a46c98390e70f 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -224,7 +224,7 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* st = block->finalize(); } if (!st.ok()) { - LOG_WARNING("Write data to file cache failed").error(st); + LOG_EVERY_N(WARNING, 100) << "Write data to file cache failed. 
err=" << st.msg(); } else { _insert_file_reader(block); } diff --git a/be/src/io/cache/file_block.cpp b/be/src/io/cache/file_block.cpp index 5985aa95f7abdc..6586dcf589bdde 100644 --- a/be/src/io/cache/file_block.cpp +++ b/be/src/io/cache/file_block.cpp @@ -25,6 +25,7 @@ #include #include "common/status.h" +#include "cpp/sync_point.h" #include "io/cache/block_file_cache.h" namespace doris { @@ -162,14 +163,14 @@ Status FileBlock::read(Slice buffer, size_t read_offset) { Status FileBlock::change_cache_type_by_mgr(FileCacheType new_type) { std::lock_guard block_lock(_mutex); - if (new_type == _key.meta.type) { - return Status::OK(); - } + DCHECK(new_type != _key.meta.type); if (_download_state == State::DOWNLOADED) { KeyMeta new_meta; new_meta.expiration_time = _key.meta.expiration_time; new_meta.type = new_type; - RETURN_IF_ERROR(_mgr->_storage->change_key_meta(_key, new_meta)); + auto st = _mgr->_storage->change_key_meta(_key, new_meta); + TEST_SYNC_POINT_CALLBACK("FileBlock::change_cache_type", &st); + if (!st.ok()) return st; } _key.meta.type = new_type; return Status::OK(); @@ -198,7 +199,10 @@ Status FileBlock::update_expiration_time(uint64_t expiration_time) { KeyMeta new_meta; new_meta.expiration_time = expiration_time; new_meta.type = _key.meta.type; - RETURN_IF_ERROR(_mgr->_storage->change_key_meta(_key, new_meta)); + auto st = _mgr->_storage->change_key_meta(_key, new_meta); + if (!st.ok() && !st.is()) { + return st; + } } _key.meta.expiration_time = expiration_time; return Status::OK(); diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 3ce647b4a0d704..61e873e04c6cbe 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -30,6 +30,7 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach size_t normal_percent, size_t disposable_percent, size_t index_percent) { io::FileCacheSettings settings; + if (capacity == 0) return settings; 
settings.capacity = capacity; settings.max_file_block_size = config::file_cache_each_block_size; settings.max_query_cache_size = max_query_cache_size; diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index 431f703129fe8d..34e62d6fe6f672 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -171,6 +171,7 @@ Status FSFileCacheStorage::read(const FileCacheKey& key, size_t value_offset, Sl Status FSFileCacheStorage::remove(const FileCacheKey& key) { std::string dir = get_path_in_local_cache(key.hash, key.meta.expiration_time); std::string file = get_path_in_local_cache(dir, key.offset, key.meta.type); + FDCache::instance()->remove_file_reader(std::make_pair(key.hash, key.offset)); RETURN_IF_ERROR(fs->delete_file(file)); std::vector files; bool exists {false}; @@ -179,7 +180,6 @@ Status FSFileCacheStorage::remove(const FileCacheKey& key) { if (files.empty()) { RETURN_IF_ERROR(fs->delete_directory(dir)); } - FDCache::instance()->remove_file_reader(std::make_pair(key.hash, key.offset)); return Status::OK(); } diff --git a/be/src/io/file_factory.cpp b/be/src/io/file_factory.cpp index 0c84c2eb74c7d8..7f64ea50710268 100644 --- a/be/src/io/file_factory.cpp +++ b/be/src/io/file_factory.cpp @@ -55,7 +55,11 @@ constexpr std::string_view RANDOM_CACHE_BASE_PATH = "random"; io::FileReaderOptions FileFactory::get_reader_options(RuntimeState* state, const io::FileDescription& fd) { - io::FileReaderOptions opts {.file_size = fd.file_size, .mtime = fd.mtime}; + io::FileReaderOptions opts { + .cache_base_path {}, + .file_size = fd.file_size, + .mtime = fd.mtime, + }; if (config::enable_file_cache && state != nullptr && state->query_options().__isset.enable_file_cache && state->query_options().enable_file_cache) { diff --git a/be/src/io/fs/azure_obj_storage_client.cpp b/be/src/io/fs/azure_obj_storage_client.cpp index 043886672a2af3..9f33db3400acdc 100644 --- 
a/be/src/io/fs/azure_obj_storage_client.cpp +++ b/be/src/io/fs/azure_obj_storage_client.cpp @@ -42,6 +42,7 @@ #include "common/logging.h" #include "common/status.h" #include "io/fs/obj_storage_client.h" +#include "util/bvar_helper.h" #include "util/s3_util.h" using namespace Azure::Storage::Blobs; @@ -57,6 +58,28 @@ auto base64_encode_part_num(int part_num) { {reinterpret_cast(&part_num), sizeof(part_num)}); } +template +auto s3_rate_limit(doris::S3RateLimitType op, Func callback) -> decltype(callback()) { + if (!doris::config::enable_s3_rate_limiter) { + return callback(); + } + auto sleep_duration = doris::S3ClientFactory::instance().rate_limiter(op)->add(1); + if (sleep_duration < 0) { + throw std::runtime_error("Azure exceeds request limit"); + } + return callback(); +} + +template +auto s3_get_rate_limit(Func callback) -> decltype(callback()) { + return s3_rate_limit(doris::S3RateLimitType::GET, std::move(callback)); +} + +template +auto s3_put_rate_limit(Func callback) -> decltype(callback()) { + return s3_rate_limit(doris::S3RateLimitType::PUT, std::move(callback)); +} + constexpr char SAS_TOKEN_URL_TEMPLATE[] = "https://{}.blob.core.windows.net/{}/{}{}"; constexpr char BlobNotFound[] = "BlobNotFound"; } // namespace @@ -101,7 +124,14 @@ struct AzureBatchDeleter { if (deferred_resps.empty()) { return ObjectStorageResponse::OK(); } - auto resp = do_azure_client_call([&]() { _client->SubmitBatch(_batch); }, _opts); + auto resp = do_azure_client_call( + [&]() { + s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_objects_latency); + _client->SubmitBatch(_batch); + }); + }, + _opts); if (resp.status.code != ErrorCode::OK) { return resp; } @@ -156,7 +186,11 @@ ObjectStorageResponse AzureObjStorageClient::put_object(const ObjectStoragePathO auto client = _client->GetBlockBlobClient(opts.key); return do_azure_client_call( [&]() { - client.UploadFrom(reinterpret_cast(stream.data()), stream.size()); + s3_put_rate_limit([&]() { + 
SCOPED_BVAR_LATENCY(s3_bvar::s3_put_latency); + client.UploadFrom(reinterpret_cast(stream.data()), + stream.size()); + }); }, opts); } @@ -169,7 +203,10 @@ ObjectStorageUploadResponse AzureObjStorageClient::upload_part(const ObjectStora Azure::Core::IO::MemoryBodyStream memory_body( reinterpret_cast(stream.data()), stream.size()); // The blockId must be base64 encoded - client.StageBlock(base64_encode_part_num(part_num), memory_body); + s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); + client.StageBlock(base64_encode_part_num(part_num), memory_body); + }); } catch (Azure::Core::RequestFailedException& e) { auto msg = fmt::format( "Azure request failed because {}, error msg {}, http code {}, path msg {}", @@ -200,13 +237,22 @@ ObjectStorageResponse AzureObjStorageClient::complete_multipart_upload( std::ranges::transform( completed_parts, std::back_inserter(string_block_ids), [](const ObjectCompleteMultiPart& i) { return base64_encode_part_num(i.part_num); }); - return do_azure_client_call([&]() { client.CommitBlockList(string_block_ids); }, opts); + return do_azure_client_call( + [&]() { + s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); + client.CommitBlockList(string_block_ids); + }); + }, + opts); } ObjectStorageHeadResponse AzureObjStorageClient::head_object(const ObjectStoragePathOptions& opts) { try { - Models::BlobProperties properties = - _client->GetBlockBlobClient(opts.key).GetProperties().Value; + Models::BlobProperties properties = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_head_latency); + return _client->GetBlockBlobClient(opts.key).GetProperties().Value; + }); return {.file_size = properties.BlobSize}; } catch (Azure::Core::RequestFailedException& e) { if (e.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { @@ -238,8 +284,11 @@ ObjectStorageResponse AzureObjStorageClient::get_object(const ObjectStoragePathO DownloadBlobToOptions download_opts; 
Azure::Core::Http::HttpRange range {static_cast(offset), bytes_read}; download_opts.Range = range; - auto resp = client.DownloadTo(reinterpret_cast(buffer), bytes_read, - download_opts); + auto resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_get_latency); + return client.DownloadTo(reinterpret_cast(buffer), bytes_read, + download_opts); + }); *size_return = resp.Value.ContentRange.Length.Value(); }, opts); @@ -257,11 +306,17 @@ ObjectStorageResponse AzureObjStorageClient::list_objects(const ObjectStoragePat [&]() { ListBlobsOptions list_opts; list_opts.Prefix = opts.prefix; - auto resp = _client->ListBlobs(list_opts); + auto resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); + return _client->ListBlobs(list_opts); + }); get_file_file(resp); while (!resp.NextPageToken->empty()) { list_opts.ContinuationToken = resp.NextPageToken; - resp = _client->ListBlobs(list_opts); + resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); + return _client->ListBlobs(list_opts); + }); get_file_file(resp); } }, @@ -297,7 +352,10 @@ ObjectStorageResponse AzureObjStorageClient::delete_objects(const ObjectStorageP ObjectStorageResponse AzureObjStorageClient::delete_object(const ObjectStoragePathOptions& opts) { return do_azure_client_call( [&]() { - auto resp = _client->DeleteBlob(opts.key); + auto resp = s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_object_latency); + return _client->DeleteBlob(opts.key); + }); if (!resp.Value.Deleted) { throw Exception(Status::IOError("Delete azure blob failed")); } @@ -321,14 +379,20 @@ ObjectStorageResponse AzureObjStorageClient::delete_objects_recursively( } return ObjectStorageResponse::OK(); }; - auto resp = _client->ListBlobs(list_opts); + auto resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); + return _client->ListBlobs(list_opts); + }); if (auto response = delete_func(resp.Blobs); response.status.code != 
ErrorCode::OK) { return response; } while (!resp.NextPageToken->empty()) { list_opts.ContinuationToken = resp.NextPageToken; - resp = _client->ListBlobs(list_opts); + resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); + return _client->ListBlobs(list_opts); + }); if (auto response = delete_func(resp.Blobs); response.status.code != ErrorCode::OK) { return response; diff --git a/be/src/io/fs/file_reader.cpp b/be/src/io/fs/file_reader.cpp index 050c81950ed323..966df6ec7ee33b 100644 --- a/be/src/io/fs/file_reader.cpp +++ b/be/src/io/fs/file_reader.cpp @@ -22,14 +22,28 @@ #include "io/cache/cached_remote_file_reader.h" #include "io/fs/file_system.h" +#include "runtime/thread_context.h" +#include "runtime/workload_management/io_throttle.h" #include "util/async_io.h" namespace doris::io { +const std::string FileReader::VIRTUAL_REMOTE_DATA_DIR = "virtual_remote_data_dir"; + Status FileReader::read_at(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) { DCHECK(bthread_self() == 0); + std::shared_ptr iot = nullptr; + if (auto* t_ctx = doris::thread_context(true)) { + iot = t_ctx->io_throttle(get_data_dir_path()); + } + if (iot) { + iot->acquire(-1); + } Status st = read_at_impl(offset, result, bytes_read, io_ctx); + if (iot) { + iot->update_next_io_time(*bytes_read); + } if (!st) { LOG(WARNING) << st; } diff --git a/be/src/io/fs/file_reader.h b/be/src/io/fs/file_reader.h index a6c0f897d4367c..79efa500c0677f 100644 --- a/be/src/io/fs/file_reader.h +++ b/be/src/io/fs/file_reader.h @@ -71,6 +71,8 @@ class FileReader : public doris::ProfileCollector { FileReader(const FileReader&) = delete; const FileReader& operator=(const FileReader&) = delete; + static const std::string VIRTUAL_REMOTE_DATA_DIR; + /// If io_ctx is not null, /// the caller must ensure that the IOContext exists during the left cycle of read_at() Status read_at(size_t offset, Slice result, size_t* bytes_read, @@ -84,6 +86,8 @@ class FileReader : public 
doris::ProfileCollector { virtual bool closed() const = 0; + virtual const std::string& get_data_dir_path() { return VIRTUAL_REMOTE_DATA_DIR; } + protected: virtual Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) = 0; diff --git a/be/src/io/fs/hdfs_file_writer.cpp b/be/src/io/fs/hdfs_file_writer.cpp index ceff2cc429a7d5..ff68d1c837ae92 100644 --- a/be/src/io/fs/hdfs_file_writer.cpp +++ b/be/src/io/fs/hdfs_file_writer.cpp @@ -50,6 +50,9 @@ bvar::Adder hdfs_file_writer_total("hdfs_file_writer_total_num"); bvar::Adder hdfs_bytes_written_total("hdfs_file_writer_bytes_written"); bvar::Adder hdfs_file_created_total("hdfs_file_writer_file_created"); bvar::Adder inflight_hdfs_file_writer("inflight_hdfs_file_writer"); +bvar::Adder hdfs_file_writer_async_close_queuing("hdfs_file_writer_async_close_queuing"); +bvar::Adder hdfs_file_writer_async_close_processing( + "hdfs_file_writer_async_close_processing"); static constexpr size_t MB = 1024 * 1024; #ifndef USE_LIBHDFS3 @@ -122,7 +125,11 @@ class HdfsWriteMemUsageRecorder { } private: - size_t max_jvm_heap_size() const { return JniUtil::get_max_jni_heap_memory_size(); } + // clang-format off + size_t max_jvm_heap_size() const { + return JniUtil::get_max_jni_heap_memory_size(); + } + // clang-format on [[maybe_unused]] std::size_t cur_memory_comsuption {0}; std::mutex cur_memory_latch; std::condition_variable cv; @@ -230,8 +237,13 @@ Status HdfsFileWriter::close(bool non_block) { _state = State::ASYNC_CLOSING; _async_close_pack = std::make_unique(); _async_close_pack->future = _async_close_pack->promise.get_future(); - return ExecEnv::GetInstance()->non_block_close_thread_pool()->submit_func( - [&]() { _async_close_pack->promise.set_value(_close_impl()); }); + hdfs_file_writer_async_close_queuing << 1; + return ExecEnv::GetInstance()->non_block_close_thread_pool()->submit_func([&]() { + hdfs_file_writer_async_close_queuing << -1; + hdfs_file_writer_async_close_processing << 1; + 
_async_close_pack->promise.set_value(_close_impl()); + hdfs_file_writer_async_close_processing << -1; + }); } _st = _close_impl(); _state = State::CLOSED; diff --git a/be/src/io/fs/local_file_reader.cpp b/be/src/io/fs/local_file_reader.cpp index a1902b687bd6da..17937bcbd6f41c 100644 --- a/be/src/io/fs/local_file_reader.cpp +++ b/be/src/io/fs/local_file_reader.cpp @@ -34,14 +34,52 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "cpp/sync_point.h" #include "io/fs/err_utils.h" +#include "olap/olap_common.h" +#include "olap/options.h" #include "util/async_io.h" #include "util/doris_metrics.h" namespace doris { namespace io { +std::vector BeConfDataDirReader::be_config_data_dir_list; + +void BeConfDataDirReader::get_data_dir_by_file_path(io::Path* file_path, + std::string* data_dir_arg) { + for (const auto& data_dir_info : be_config_data_dir_list) { + if (data_dir_info.path.size() >= file_path->string().size()) { + continue; + } + if (file_path->string().compare(0, data_dir_info.path.size(), data_dir_info.path) == 0) { + *data_dir_arg = data_dir_info.path; + } + } +} + +void BeConfDataDirReader::init_be_conf_data_dir( + const std::vector& store_paths, + const std::vector& spill_store_paths) { + for (int i = 0; i < store_paths.size(); i++) { + DataDirInfo data_dir_info; + data_dir_info.path = store_paths[i].path; + data_dir_info.storage_medium = store_paths[i].storage_medium; + data_dir_info.data_dir_type = DataDirType::OLAP_DATA_DIR; + be_config_data_dir_list.push_back(data_dir_info); + } + + for (int i = 0; i < spill_store_paths.size(); i++) { + doris::DataDirInfo data_dir_info; + data_dir_info.path = spill_store_paths[i].path; + data_dir_info.storage_medium = spill_store_paths[i].storage_medium; + data_dir_info.data_dir_type = doris::DataDirType::SPILL_DISK_DIR; + be_config_data_dir_list.push_back(data_dir_info); + } +} + LocalFileReader::LocalFileReader(Path path, size_t file_size, int fd) : _fd(fd), _path(std::move(path)), _file_size(file_size) { 
+ _data_dir_path = ""; + BeConfDataDirReader::get_data_dir_by_file_path(&_path, &_data_dir_path); DorisMetrics::instance()->local_file_open_reading->increment(1); DorisMetrics::instance()->local_file_reader_total->increment(1); } diff --git a/be/src/io/fs/local_file_reader.h b/be/src/io/fs/local_file_reader.h index 7e327ba6606723..4191374111b8af 100644 --- a/be/src/io/fs/local_file_reader.h +++ b/be/src/io/fs/local_file_reader.h @@ -26,7 +26,22 @@ #include "io/fs/path.h" #include "util/slice.h" +namespace doris { +struct StorePath; +struct DataDirInfo; +} // namespace doris + namespace doris::io { + +struct BeConfDataDirReader { + static std::vector be_config_data_dir_list; + + static void get_data_dir_by_file_path(Path* file_path, std::string* data_dir_arg); + + static void init_be_conf_data_dir(const std::vector& store_paths, + const std::vector& spill_store_paths); +}; + struct IOContext; class LocalFileReader final : public FileReader { @@ -43,6 +58,8 @@ class LocalFileReader final : public FileReader { bool closed() const override { return _closed.load(std::memory_order_acquire); } + const std::string& get_data_dir_path() override { return _data_dir_path; } + private: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) override; @@ -52,6 +69,7 @@ class LocalFileReader final : public FileReader { Path _path; size_t _file_size; std::atomic _closed = false; + std::string _data_dir_path; // be conf's data dir path }; } // namespace doris::io diff --git a/be/src/io/fs/multi_table_pipe.cpp b/be/src/io/fs/multi_table_pipe.cpp index 789903cc1ca86b..d7fdd8a738b274 100644 --- a/be/src/io/fs/multi_table_pipe.cpp +++ b/be/src/io/fs/multi_table_pipe.cpp @@ -326,6 +326,19 @@ void MultiTablePipe::_handle_consumer_finished() { _ctx->number_filtered_rows = _number_filtered_rows; _ctx->number_unselected_rows = _number_unselected_rows; _ctx->commit_infos = _tablet_commit_infos; + + // remove ctx to avoid memory leak. 
+ for (const auto& pair : _planned_tables) { + if (pair.second) { + doris::ExecEnv::GetInstance()->new_load_stream_mgr()->remove(pair.second->id); + } + } + for (const auto& pair : _unplanned_tables) { + if (pair.second) { + doris::ExecEnv::GetInstance()->new_load_stream_mgr()->remove(pair.second->id); + } + } + LOG(INFO) << "all plan for multi-table load complete. number_total_rows=" << _ctx->number_total_rows << " number_loaded_rows=" << _ctx->number_loaded_rows << " number_filtered_rows=" << _ctx->number_filtered_rows diff --git a/be/src/io/fs/s3_file_reader.cpp b/be/src/io/fs/s3_file_reader.cpp index a5c6ec09162cf4..ab9033e586d41d 100644 --- a/be/src/io/fs/s3_file_reader.cpp +++ b/be/src/io/fs/s3_file_reader.cpp @@ -120,18 +120,6 @@ Status S3FileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_rea if (!client) { return Status::InternalError("init s3 client error"); } - // // clang-format off - // auto resp = client->get_object( { .bucket = _bucket, .key = _key, }, - // to, offset, bytes_req, bytes_read); - // // clang-format on - // if (resp.status.code != ErrorCode::OK) { - // return std::move(Status(resp.status.code, std::move(resp.status.msg)) - // .append(fmt::format("failed to read from {}", _path.native()))); - // } - // if (*bytes_read != bytes_req) { - // return Status::InternalError("failed to read from {}(bytes read: {}, bytes req: {})", - // _path.native(), *bytes_read, bytes_req); - SCOPED_BVAR_LATENCY(s3_bvar::s3_get_latency); int retry_count = 0; const int base_wait_time = config::s3_read_base_wait_time_ms; // Base wait time in milliseconds diff --git a/be/src/io/fs/s3_file_system.cpp b/be/src/io/fs/s3_file_system.cpp index 93f36429485c32..3905c4ddb1eed7 100644 --- a/be/src/io/fs/s3_file_system.cpp +++ b/be/src/io/fs/s3_file_system.cpp @@ -18,9 +18,8 @@ #include "io/fs/s3_file_system.h" #include -#include -#include +#include #include "common/compiler_util.h" // IWYU pragma: keep // IWYU pragma: no_include @@ -32,7 +31,6 @@ #include 
// IWYU pragma: keep #include #include -#include #include "common/config.h" #include "common/logging.h" @@ -46,7 +44,7 @@ #include "io/fs/s3_file_reader.h" #include "io/fs/s3_file_writer.h" #include "io/fs/s3_obj_storage_client.h" -#include "util/bvar_helper.h" +#include "runtime/exec_env.h" #include "util/s3_uri.h" #include "util/s3_util.h" @@ -69,13 +67,6 @@ Result get_key(const Path& full_path) { return uri.get_key(); } -// TODO(plat1ko): AwsTransferManager will be deprecated -std::shared_ptr& default_executor() { - static auto executor = Aws::MakeShared( - "default", config::s3_transfer_executor_pool_size); - return executor; -} - } // namespace ObjClientHolder::ObjClientHolder(S3ClientConf conf) : _conf(std::move(conf)) {} @@ -383,13 +374,19 @@ Status S3FileSystem::batch_upload_impl(const std::vector& local_files, return Status::OK(); }; + Status s = Status::OK(); std::vector> futures; for (int i = 0; i < local_files.size(); ++i) { auto task = std::make_shared>(upload_task); futures.emplace_back(task->get_future()); - default_executor()->Submit([t = std::move(task), idx = i]() mutable { (*t)(idx); }); + auto st = ExecEnv::GetInstance()->s3_file_system_thread_pool()->submit_func( + [t = std::move(task), idx = i]() mutable { (*t)(idx); }); + // We shouldn't return immediately since the previous submitted tasks might still be running in the thread pool + if (!st.ok()) { + s = st; + break; + } } - Status s = Status::OK(); for (auto&& f : futures) { auto cur_s = f.get(); if (!cur_s.ok()) { diff --git a/be/src/io/fs/s3_file_writer.cpp b/be/src/io/fs/s3_file_writer.cpp index 655a94a6eb4c0d..a9fb1c96fda180 100644 --- a/be/src/io/fs/s3_file_writer.cpp +++ b/be/src/io/fs/s3_file_writer.cpp @@ -43,10 +43,13 @@ namespace doris::io { -bvar::Adder s3_file_writer_total("s3_file_writer", "total_num"); -bvar::Adder s3_bytes_written_total("s3_file_writer", "bytes_written"); -bvar::Adder s3_file_created_total("s3_file_writer", "file_created"); -bvar::Adder 
s3_file_being_written("s3_file_writer", "file_being_written"); +bvar::Adder s3_file_writer_total("s3_file_writer_total_num"); +bvar::Adder s3_bytes_written_total("s3_file_writer_bytes_written"); +bvar::Adder s3_file_created_total("s3_file_writer_file_created"); +bvar::Adder s3_file_being_written("s3_file_writer_file_being_written"); +bvar::Adder s3_file_writer_async_close_queuing("s3_file_writer_async_close_queuing"); +bvar::Adder s3_file_writer_async_close_processing( + "s3_file_writer_async_close_processing"); S3FileWriter::S3FileWriter(std::shared_ptr client, std::string bucket, std::string key, const FileWriterOptions* opts) @@ -141,8 +144,13 @@ Status S3FileWriter::close(bool non_block) { _state = State::ASYNC_CLOSING; _async_close_pack = std::make_unique(); _async_close_pack->future = _async_close_pack->promise.get_future(); - return ExecEnv::GetInstance()->non_block_close_thread_pool()->submit_func( - [&]() { _async_close_pack->promise.set_value(_close_impl()); }); + s3_file_writer_async_close_queuing << 1; + return ExecEnv::GetInstance()->non_block_close_thread_pool()->submit_func([&]() { + s3_file_writer_async_close_queuing << -1; + s3_file_writer_async_close_processing << 1; + _async_close_pack->promise.set_value(_close_impl()); + s3_file_writer_async_close_processing << -1; + }); } _st = _close_impl(); _state = State::CLOSED; diff --git a/be/src/io/fs/s3_obj_storage_client.cpp b/be/src/io/fs/s3_obj_storage_client.cpp index 2bed3241e30278..2c66e819833b94 100644 --- a/be/src/io/fs/s3_obj_storage_client.cpp +++ b/be/src/io/fs/s3_obj_storage_client.cpp @@ -71,6 +71,35 @@ #include "io/fs/s3_common.h" #include "util/bvar_helper.h" +namespace { +inline ::Aws::Client::AWSError<::Aws::S3::S3Errors> s3_error_factory() { + return {::Aws::S3::S3Errors::INTERNAL_FAILURE, "exceeds limit", "exceeds limit", false}; +} + +template +auto s3_rate_limit(doris::S3RateLimitType op, Func callback) -> decltype(callback()) { + using T = decltype(callback()); + if 
(!doris::config::enable_s3_rate_limiter) { + return callback(); + } + auto sleep_duration = doris::S3ClientFactory::instance().rate_limiter(op)->add(1); + if (sleep_duration < 0) { + return T(s3_error_factory()); + } + return callback(); +} + +template +auto s3_get_rate_limit(Func callback) -> decltype(callback()) { + return s3_rate_limit(doris::S3RateLimitType::GET, std::move(callback)); +} + +template +auto s3_put_rate_limit(Func callback) -> decltype(callback()) { + return s3_rate_limit(doris::S3RateLimitType::PUT, std::move(callback)); +} +} // namespace + namespace Aws::S3::Model { class DeleteObjectRequest; } // namespace Aws::S3::Model @@ -92,9 +121,9 @@ ObjectStorageUploadResponse S3ObjStorageClient::create_multipart_upload( create_request.SetContentType("application/octet-stream"); SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); - auto outcome = SYNC_POINT_HOOK_RETURN_VALUE(_client->CreateMultipartUpload(create_request), - "s3_file_writer::create_multi_part_upload", - std::cref(create_request).get()); + auto outcome = SYNC_POINT_HOOK_RETURN_VALUE( + s3_put_rate_limit([&]() { return _client->CreateMultipartUpload(create_request); }), + "s3_file_writer::create_multi_part_upload", std::cref(create_request).get()); SYNC_POINT_CALLBACK("s3_file_writer::_open", &outcome); if (outcome.IsSuccess()) { @@ -122,9 +151,9 @@ ObjectStorageResponse S3ObjStorageClient::put_object(const ObjectStoragePathOpti request.SetContentLength(stream.size()); request.SetContentType("application/octet-stream"); SCOPED_BVAR_LATENCY(s3_bvar::s3_put_latency); - auto response = - SYNC_POINT_HOOK_RETURN_VALUE(_client->PutObject(request), "s3_file_writer::put_object", - std::cref(request).get(), &stream); + auto response = SYNC_POINT_HOOK_RETURN_VALUE( + s3_put_rate_limit([&]() { return _client->PutObject(request); }), + "s3_file_writer::put_object", std::cref(request).get(), &stream); if (!response.IsSuccess()) { auto st = s3fs_error(response.GetError(), fmt::format("failed to 
put object {}", opts.path.native())); @@ -157,8 +186,8 @@ ObjectStorageUploadResponse S3ObjStorageClient::upload_part(const ObjectStorageP { SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); upload_part_outcome = SYNC_POINT_HOOK_RETURN_VALUE( - _client->UploadPart(upload_request), "s3_file_writer::upload_part", - std::cref(upload_request).get(), &stream); + s3_put_rate_limit([&]() { return _client->UploadPart(upload_request); }), + "s3_file_writer::upload_part", std::cref(upload_request).get(), &stream); } TEST_SYNC_POINT_CALLBACK("S3FileWriter::_upload_one_part", &upload_part_outcome); if (!upload_part_outcome.IsSuccess()) { @@ -199,7 +228,7 @@ ObjectStorageResponse S3ObjStorageClient::complete_multipart_upload( TEST_SYNC_POINT_RETURN_WITH_VALUE("S3FileWriter::_complete:3", ObjectStorageResponse(), this); SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); auto complete_outcome = SYNC_POINT_HOOK_RETURN_VALUE( - _client->CompleteMultipartUpload(complete_request), + s3_put_rate_limit([&]() { return _client->CompleteMultipartUpload(complete_request); }), "s3_file_writer::complete_multi_part", std::cref(complete_request).get()); if (!complete_outcome.IsSuccess()) { @@ -220,7 +249,8 @@ ObjectStorageHeadResponse S3ObjStorageClient::head_object(const ObjectStoragePat SCOPED_BVAR_LATENCY(s3_bvar::s3_head_latency); auto outcome = SYNC_POINT_HOOK_RETURN_VALUE( - _client->HeadObject(request), "s3_file_system::head_object", std::ref(request).get()); + s3_get_rate_limit([&]() { return _client->HeadObject(request); }), + "s3_file_system::head_object", std::ref(request).get()); if (outcome.IsSuccess()) { return {.resp = {convert_to_obj_response(Status::OK())}, .file_size = outcome.GetResult().GetContentLength()}; @@ -247,7 +277,7 @@ ObjectStorageResponse S3ObjStorageClient::get_object(const ObjectStoragePathOpti request.SetResponseStreamFactory(AwsWriteableStreamFactory(buffer, bytes_read)); SCOPED_BVAR_LATENCY(s3_bvar::s3_get_latency); - auto outcome = 
_client->GetObject(request); + auto outcome = s3_get_rate_limit([&]() { return _client->GetObject(request); }); if (!outcome.IsSuccess()) { return {convert_to_obj_response( s3fs_error(outcome.GetError(), @@ -273,7 +303,7 @@ ObjectStorageResponse S3ObjStorageClient::list_objects(const ObjectStoragePathOp Aws::S3::Model::ListObjectsV2Outcome outcome; { SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); - outcome = _client->ListObjectsV2(request); + outcome = s3_get_rate_limit([&]() { return _client->ListObjectsV2(request); }); } if (!outcome.IsSuccess()) { files->clear(); @@ -310,8 +340,9 @@ ObjectStorageResponse S3ObjStorageClient::delete_objects(const ObjectStoragePath }); del.WithObjects(std::move(objects)).SetQuiet(true); delete_request.SetDelete(std::move(del)); - SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_latency); - auto delete_outcome = _client->DeleteObjects(delete_request); + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_objects_latency); + auto delete_outcome = + s3_put_rate_limit([&]() { return _client->DeleteObjects(delete_request); }); if (!delete_outcome.IsSuccess()) { return {convert_to_obj_response( s3fs_error(delete_outcome.GetError(), @@ -331,8 +362,8 @@ ObjectStorageResponse S3ObjStorageClient::delete_object(const ObjectStoragePathO Aws::S3::Model::DeleteObjectRequest request; request.WithBucket(opts.bucket).WithKey(opts.key); - SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_latency); - auto outcome = _client->DeleteObject(request); + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_object_latency); + auto outcome = s3_put_rate_limit([&]() { return _client->DeleteObject(request); }); if (outcome.IsSuccess() || outcome.GetError().GetResponseCode() == Aws::Http::HttpResponseCode::NOT_FOUND) { return ObjectStorageResponse::OK(); @@ -354,7 +385,7 @@ ObjectStorageResponse S3ObjStorageClient::delete_objects_recursively( Aws::S3::Model::ListObjectsV2Outcome outcome; { SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); - outcome = _client->ListObjectsV2(request); + outcome = 
s3_get_rate_limit([&]() { return _client->ListObjectsV2(request); }); } if (!outcome.IsSuccess()) { return {convert_to_obj_response(s3fs_error( @@ -373,8 +404,9 @@ ObjectStorageResponse S3ObjStorageClient::delete_objects_recursively( Aws::S3::Model::Delete del; del.WithObjects(std::move(objects)).SetQuiet(true); delete_request.SetDelete(std::move(del)); - SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_latency); - auto delete_outcome = _client->DeleteObjects(delete_request); + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_objects_latency); + auto delete_outcome = + s3_put_rate_limit([&]() { return _client->DeleteObjects(delete_request); }); if (!delete_outcome.IsSuccess()) { return {convert_to_obj_response( s3fs_error(delete_outcome.GetError(), diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index a396f4750b202d..141e302af8c420 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -33,6 +33,7 @@ #include "olap/txn_manager.h" #include "service/point_query_executor.h" #include "util/bvar_helper.h" +#include "util/crc32c.h" #include "util/debug_points.h" #include "util/doris_metrics.h" #include "vec/common/schema_util.h" @@ -830,7 +831,7 @@ Status BaseTablet::sort_block(vectorized::Block& in_block, vectorized::Block& ou vectorized::MutableBlock::build_mutable_block(&output_block); std::shared_ptr vec_row_comparator = - std::make_shared(_tablet_meta->tablet_schema().get()); + std::make_shared(_tablet_meta->tablet_schema()); vec_row_comparator->set_block(&mutable_input_block); std::vector> row_in_blocks; @@ -1200,7 +1201,9 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf std::unique_ptr transient_rs_writer; DeleteBitmapPtr delete_bitmap = txn_info->delete_bitmap; - if (txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update) { + bool is_partial_update = + txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update; + if (is_partial_update) { 
transient_rs_writer = DORIS_TRY(self->create_transient_rowset_writer( *rowset, txn_info->partial_update_info, txn_expiration)); // Partial update might generate new segments when there is conflicts while publish, and mark @@ -1241,6 +1244,37 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf } auto t3 = watch.get_elapse_time_us(); + // If a rowset is produced by compaction before the commit phase of the partial update load + // and is not included in txn_info->rowset_ids, we can skip the alignment process of that rowset + // because data remains the same before and after compaction. But we still need to calculate the + // the delete bitmap for that rowset. + std::vector rowsets_skip_alignment; + if (is_partial_update) { + int64_t max_version_in_flush_phase = + txn_info->partial_update_info->max_version_in_flush_phase; + DCHECK(max_version_in_flush_phase != -1); + std::vector remained_rowsets; + for (const auto& rowset : specified_rowsets) { + if (rowset->end_version() <= max_version_in_flush_phase && + rowset->produced_by_compaction()) { + rowsets_skip_alignment.emplace_back(rowset); + } else { + remained_rowsets.emplace_back(rowset); + } + } + if (!rowsets_skip_alignment.empty()) { + specified_rowsets = std::move(remained_rowsets); + } + } + + if (!rowsets_skip_alignment.empty()) { + auto token = self->calc_delete_bitmap_executor()->create_token(); + // set rowset_writer to nullptr to skip the alignment process + RETURN_IF_ERROR(calc_delete_bitmap(self, rowset, segments, rowsets_skip_alignment, + delete_bitmap, cur_version - 1, token.get(), nullptr)); + RETURN_IF_ERROR(token->wait()); + } + // When there is only one segment, it will be calculated in the current thread. // Otherwise, it will be submitted to the thread pool for calculation. 
if (segments.size() <= 1) { @@ -1432,7 +1466,8 @@ Status BaseTablet::update_delete_bitmap_without_lock( return Status::InternalError( "debug tablet update delete bitmap without lock random failed"); } else { - LOG(INFO) << "BaseTablet.update_delete_bitmap_without_lock.random_failed not triggered" + LOG(INFO) << "BaseTablet.update_delete_bitmap_without_lock.random_failed not " + "triggered" << ", rnd:" << rnd << ", percent: " << percent; } }); @@ -1555,4 +1590,71 @@ void BaseTablet::calc_consecutive_empty_rowsets( } } +Status BaseTablet::calc_file_crc(uint32_t* crc_value, int64_t start_version, int64_t end_version, + int32_t* rowset_count, int64_t* file_count) { + Version v(start_version, end_version); + std::vector rowsets; + traverse_rowsets([&rowsets, &v](const auto& rs) { + // get all rowsets + if (v.contains(rs->version())) { + rowsets.emplace_back(rs); + } + }); + std::sort(rowsets.begin(), rowsets.end(), Rowset::comparator); + *rowset_count = rowsets.size(); + + *crc_value = 0; + *file_count = 0; + for (const auto& rs : rowsets) { + uint32_t rs_crc_value = 0; + int64_t rs_file_count = 0; + auto rowset = std::static_pointer_cast(rs); + auto st = rowset->calc_file_crc(&rs_crc_value, &rs_file_count); + if (!st.ok()) { + return st; + } + // crc_value is calculated based on the crc_value of each rowset. 
+ *crc_value = crc32c::Extend(*crc_value, reinterpret_cast(&rs_crc_value), + sizeof(rs_crc_value)); + *file_count += rs_file_count; + } + return Status::OK(); +} + +Status BaseTablet::show_nested_index_file(std::string* json_meta) { + Version v(0, max_version_unlocked()); + std::vector rowsets; + traverse_rowsets([&rowsets, &v](const auto& rs) { + // get all rowsets + if (v.contains(rs->version())) { + rowsets.emplace_back(rs); + } + }); + std::sort(rowsets.begin(), rowsets.end(), Rowset::comparator); + + rapidjson::Document doc; + doc.SetObject(); + rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); + rapidjson::Value tabletIdValue(tablet_id()); + doc.AddMember("tablet_id", tabletIdValue, allocator); + + rapidjson::Value rowsets_value(rapidjson::kArrayType); + + for (const auto& rs : rowsets) { + rapidjson::Value rowset_value(rapidjson::kObjectType); + + auto rowset = std::static_pointer_cast(rs); + RETURN_IF_ERROR(rowset->show_nested_index_file(&rowset_value, allocator)); + rowsets_value.PushBack(rowset_value, allocator); + } + doc.AddMember("rowsets", rowsets_value, allocator); + + rapidjson::StringBuffer buffer; + rapidjson::PrettyWriter writer(buffer); + doc.Accept(writer); + *json_meta = std::string(buffer.GetString()); + + return Status::OK(); +} + } // namespace doris diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index f625ecf4a0a98e..f958d398fd5d00 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -28,6 +28,7 @@ #include "olap/rowset/segment_v2/segment.h" #include "olap/tablet_fwd.h" #include "olap/tablet_meta.h" +#include "olap/tablet_schema.h" #include "olap/version_graph.h" #include "util/metrics.h" @@ -252,6 +253,26 @@ class BaseTablet { const std::vector& candidate_rowsets, int limit); + // Return the merged schema of all rowsets + virtual TabletSchemaSPtr merged_tablet_schema() const { return _max_version_schema; } + + void traverse_rowsets(std::function visitor, + bool include_stale = 
false) { + std::shared_lock rlock(_meta_lock); + for (auto& [v, rs] : _rs_version_map) { + visitor(rs); + } + if (!include_stale) return; + for (auto& [v, rs] : _stale_rs_version_map) { + visitor(rs); + } + } + + Status calc_file_crc(uint32_t* crc_value, int64_t start_version, int64_t end_version, + int32_t* rowset_count, int64_t* file_count); + + Status show_nested_index_file(std::string* json_meta); + protected: // Find the missed versions until the spec_version. // diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index eb807d2a4abaa0..8c109eec1c1c4d 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -103,7 +103,7 @@ bool is_rowset_tidy(std::string& pre_max_key, const RowsetSharedPtr& rhs) { if (!ret) { return false; } - if (min_key < pre_max_key) { + if (min_key <= pre_max_key) { return false; } CHECK(rhs->max_key(&pre_max_key)); @@ -197,9 +197,6 @@ Status Compaction::merge_input_rowsets() { _tablet->last_compaction_status = res; if (!res.ok()) { - LOG(WARNING) << "fail to do " << compaction_name() << ". 
res=" << res - << ", tablet=" << _tablet->tablet_id() - << ", output_version=" << _output_version; return res; } @@ -352,7 +349,9 @@ bool CompactionMixin::handle_ordered_data_compaction() { // check delete version: if compaction type is base compaction and // has a delete version, use original compaction - if (compaction_type() == ReaderType::READER_BASE_COMPACTION) { + if (compaction_type() == ReaderType::READER_BASE_COMPACTION || + (_allow_delete_in_cumu_compaction && + compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION)) { for (auto& rowset : _input_rowsets) { if (rowset->rowset_meta()->has_delete_predicate()) { return false; @@ -400,15 +399,14 @@ Status CompactionMixin::execute_compact() { data_dir->disks_compaction_score_increment(permits); data_dir->disks_compaction_num_increment(1); - Status st = execute_compact_impl(permits); - _tablet->compaction_count.fetch_add(1, std::memory_order_relaxed); - - data_dir->disks_compaction_score_increment(-permits); - data_dir->disks_compaction_num_increment(-1); + auto record_compaction_stats = [&](const doris::Exception& ex) { + _tablet->compaction_count.fetch_add(1, std::memory_order_relaxed); + data_dir->disks_compaction_score_increment(-permits); + data_dir->disks_compaction_num_increment(-1); + }; - if (!st.ok()) { - return st; - } + HANDLE_EXCEPTION_IF_CATCH_EXCEPTION(execute_compact_impl(permits), record_compaction_stats); + record_compaction_stats(doris::Exception()); if (enable_compaction_checksum) { EngineChecksumTask checksum_task(_engine, _tablet->tablet_id(), _tablet->schema_hash(), @@ -511,8 +509,8 @@ Status Compaction::do_inverted_index_compaction() { } else { DCHECK(false) << err_msg; } + // log here just for debugging, do not return error LOG(WARNING) << err_msg; - return Status::InternalError(err_msg); } } @@ -642,9 +640,13 @@ Status Compaction::do_inverted_index_compaction() { // format: rowsetId_segmentId std::vector> inverted_index_file_writers( dest_segment_num); - for (int i = 0; i < 
dest_segment_num; ++i) { + + // Some columns have already been indexed + // key: seg_id, value: inverted index file size + std::unordered_map compacted_idx_file_size; + for (int seg_id = 0; seg_id < dest_segment_num; ++seg_id) { std::string index_path_prefix { - InvertedIndexDescriptor::get_index_file_path_prefix(ctx.segment_path(i))}; + InvertedIndexDescriptor::get_index_file_path_prefix(ctx.segment_path(seg_id))}; auto inverted_index_file_reader = std::make_unique( ctx.fs(), index_path_prefix, _cur_tablet_schema->get_inverted_index_storage_format()); @@ -654,16 +656,31 @@ Status Compaction::do_inverted_index_compaction() { if (st.ok()) { auto index_not_need_to_compact = DORIS_TRY(inverted_index_file_reader->get_all_directories()); + // V1: each index is a separate file + // V2: all indexes are in a single file + if (_cur_tablet_schema->get_inverted_index_storage_format() != + doris::InvertedIndexStorageFormatPB::V1) { + int64_t fsize = 0; + st = ctx.fs()->file_size( + InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix), &fsize); + if (!st.ok()) { + LOG(ERROR) << "file size error in index compaction, error:" << st.msg(); + return st; + } + compacted_idx_file_size[seg_id] = fsize; + } auto inverted_index_file_writer = std::make_unique( - ctx.fs(), index_path_prefix, ctx.rowset_id.to_string(), i, + ctx.fs(), index_path_prefix, ctx.rowset_id.to_string(), seg_id, _cur_tablet_schema->get_inverted_index_storage_format()); RETURN_IF_ERROR(inverted_index_file_writer->initialize(index_not_need_to_compact)); - inverted_index_file_writers[i] = std::move(inverted_index_file_writer); + inverted_index_file_writers[seg_id] = std::move(inverted_index_file_writer); } else if (st.is()) { auto inverted_index_file_writer = std::make_unique( - ctx.fs(), index_path_prefix, ctx.rowset_id.to_string(), i, + ctx.fs(), index_path_prefix, ctx.rowset_id.to_string(), seg_id, _cur_tablet_schema->get_inverted_index_storage_format()); - inverted_index_file_writers[i] = 
std::move(inverted_index_file_writer); + inverted_index_file_writers[seg_id] = std::move(inverted_index_file_writer); + // no index file + compacted_idx_file_size[seg_id] = 0; } else { LOG(ERROR) << "inverted_index_file_reader init failed in index compaction, error:" << st; @@ -744,11 +761,13 @@ Status Compaction::do_inverted_index_compaction() { } uint64_t inverted_index_file_size = 0; - for (auto& inverted_index_file_writer : inverted_index_file_writers) { + for (int seg_id = 0; seg_id < dest_segment_num; ++seg_id) { + auto inverted_index_file_writer = inverted_index_file_writers[seg_id].get(); if (Status st = inverted_index_file_writer->close(); !st.ok()) { status = Status::Error(st.msg()); } else { inverted_index_file_size += inverted_index_file_writer->get_index_file_size(); + inverted_index_file_size -= compacted_idx_file_size[seg_id]; } } // check index compaction status. If status is not ok, we should return error and end this compaction round. @@ -886,7 +905,9 @@ Status CompactionMixin::construct_output_rowset_writer(RowsetWriterContext& ctx) if (config::inverted_index_compaction_enable && (((_tablet->keys_type() == KeysType::UNIQUE_KEYS && _tablet->enable_unique_key_merge_on_write()) || - _tablet->keys_type() == KeysType::DUP_KEYS))) { + _tablet->keys_type() == KeysType::DUP_KEYS)) && + _cur_tablet_schema->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { construct_skip_inverted_index(ctx); } ctx.version = _output_version; @@ -1160,13 +1181,10 @@ Status CloudCompactionMixin::execute_compact_impl(int64_t permits) { Status CloudCompactionMixin::execute_compact() { TEST_INJECTION_POINT("Compaction::do_compaction"); int64_t permits = get_compaction_permits(); - Status st = execute_compact_impl(permits); - if (!st.ok()) { - garbage_collection(); - return st; - } + HANDLE_EXCEPTION_IF_CATCH_EXCEPTION(execute_compact_impl(permits), + [&](const doris::Exception& ex) { garbage_collection(); }); _load_segment_to_cache(); - return st; + 
return Status::OK(); } Status CloudCompactionMixin::modify_rowsets() { @@ -1178,7 +1196,9 @@ Status CloudCompactionMixin::construct_output_rowset_writer(RowsetWriterContext& if (config::inverted_index_compaction_enable && (((_tablet->keys_type() == KeysType::UNIQUE_KEYS && _tablet->enable_unique_key_merge_on_write()) || - _tablet->keys_type() == KeysType::DUP_KEYS))) { + _tablet->keys_type() == KeysType::DUP_KEYS)) && + _cur_tablet_schema->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { construct_skip_inverted_index(ctx); } @@ -1198,8 +1218,10 @@ Status CloudCompactionMixin::construct_output_rowset_writer(RowsetWriterContext& ctx.write_type = DataWriteType::TYPE_COMPACTION; auto compaction_policy = _tablet->tablet_meta()->compaction_policy(); - ctx.compaction_level = - _engine.cumu_compaction_policy(compaction_policy)->new_compaction_level(_input_rowsets); + if (_tablet->tablet_meta()->time_series_compaction_level_threshold() >= 2) { + ctx.compaction_level = _engine.cumu_compaction_policy(compaction_policy) + ->new_compaction_level(_input_rowsets); + } ctx.write_file_cache = compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION; ctx.file_cache_ttl_sec = _tablet->ttl_seconds(); diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index 2c7e654787a650..40f6579dd995dc 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -35,16 +35,19 @@ namespace doris { using namespace ErrorCode; -namespace { - -void find_longest_consecutive_version(std::vector* rowsets, - std::vector* missing_version) { +void CumulativeCompaction::find_longest_consecutive_version(std::vector* rowsets, + std::vector* missing_version) { if (rowsets->empty()) { return; } RowsetSharedPtr prev_rowset = rowsets->front(); size_t i = 1; + int max_start = 0; + int max_length = 1; + + int start = 0; + int length = 1; for (; i < rowsets->size(); ++i) { RowsetSharedPtr rowset = (*rowsets)[i]; 
if (rowset->start_version() != prev_rowset->end_version() + 1) { @@ -52,16 +55,22 @@ void find_longest_consecutive_version(std::vector* rowsets, missing_version->push_back(prev_rowset->version()); missing_version->push_back(rowset->version()); } - break; + start = i; + length = 1; + } else { + length++; + } + + if (length > max_length) { + max_start = start; + max_length = length; } + prev_rowset = rowset; } - - rowsets->resize(i); + *rowsets = {rowsets->begin() + max_start, rowsets->begin() + max_start + max_length}; } -} // namespace - CumulativeCompaction::CumulativeCompaction(StorageEngine& engine, const TabletSharedPtr& tablet) : CompactionMixin(engine, tablet, "CumulativeCompaction:" + std::to_string(tablet->tablet_id())) {} @@ -100,9 +109,10 @@ Status CumulativeCompaction::execute_compact() { RETURN_IF_ERROR(CompactionMixin::execute_compact()); DCHECK_EQ(_state, CompactionState::SUCCESS); - - tablet()->cumulative_compaction_policy()->update_compaction_level(tablet(), _input_rowsets, - _output_rowset); + if (tablet()->tablet_meta()->time_series_compaction_level_threshold() >= 2) { + tablet()->cumulative_compaction_policy()->update_compaction_level(tablet(), _input_rowsets, + _output_rowset); + } tablet()->cumulative_compaction_policy()->update_cumulative_point( tablet(), _input_rowsets, _output_rowset, _last_delete_version); @@ -127,10 +137,11 @@ Status CumulativeCompaction::pick_rowsets_to_compact() { std::vector missing_versions; find_longest_consecutive_version(&candidate_rowsets, &missing_versions); if (!missing_versions.empty()) { - DCHECK(missing_versions.size() == 2); + DCHECK(missing_versions.size() % 2 == 0); LOG(WARNING) << "There are missed versions among rowsets. 
" - << "prev rowset verison=" << missing_versions[0] - << ", next rowset version=" << missing_versions[1] + << "total missed version size: " << missing_versions.size() / 2 + << " first missed version prev rowset verison=" << missing_versions[0] + << ", first missed version next rowset version=" << missing_versions[1] << ", tablet=" << _tablet->tablet_id(); } diff --git a/be/src/olap/cumulative_compaction.h b/be/src/olap/cumulative_compaction.h index 14527bf2faba60..276e3b3490311c 100644 --- a/be/src/olap/cumulative_compaction.h +++ b/be/src/olap/cumulative_compaction.h @@ -44,6 +44,9 @@ class CumulativeCompaction final : public CompactionMixin { Status pick_rowsets_to_compact(); + void find_longest_consecutive_version(std::vector* rowsets, + std::vector* missing_version); + Version _last_delete_version {-1, -1}; }; diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index d40e7faafc23e0..6819d7d90f3ef7 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -96,7 +96,7 @@ Status DeleteHandler::generate_delete_predicate(const TabletSchema& schema, dp->param("error_msg")); }) if (conditions.empty()) { - return Status::Error( + return Status::Error( "invalid parameters for store_cond. 
condition_size={}", conditions.size()); } @@ -127,7 +127,7 @@ Status DeleteHandler::generate_delete_predicate(const TabletSchema& schema, if (TCondition tmp; !DeleteHandler::parse_condition(condition_str, &tmp)) { LOG(WARNING) << "failed to parse condition_str, condtion=" << ThriftDebugString(condition); - return Status::Error( + return Status::Error( "failed to parse condition_str, condtion={}", ThriftDebugString(condition)); } VLOG_NOTICE << __PRETTY_FUNCTION__ << " condition_str: " << condition_str; @@ -235,8 +235,8 @@ Status DeleteHandler::check_condition_valid(const TabletSchema& schema, const TC // Check whether the column exists int32_t field_index = schema.field_index(cond.column_name); if (field_index < 0) { - return Status::Error("field is not existent. [field_index={}]", - field_index); + return Status::Error("field is not existent. [field_index={}]", + field_index); } // Delete condition should only applied on key columns or duplicate key table, and @@ -245,21 +245,21 @@ Status DeleteHandler::check_condition_valid(const TabletSchema& schema, const TC if (column.type() == FieldType::OLAP_FIELD_TYPE_DOUBLE || column.type() == FieldType::OLAP_FIELD_TYPE_FLOAT) { - return Status::Error("data type is float or double."); + return Status::Error("data type is float or double."); } // Check operator and operands size are matched. if ("*=" != cond.condition_op && "!*=" != cond.condition_op && cond.condition_values.size() != 1) { - return Status::Error("invalid condition value size. [size={}]", - cond.condition_values.size()); + return Status::Error("invalid condition value size. [size={}]", + cond.condition_values.size()); } // Check each operand is valid for (const auto& condition_value : cond.condition_values) { if (!is_condition_value_valid(column, cond.condition_op, condition_value)) { - return Status::Error("invalid condition value. [value={}]", - condition_value); + return Status::Error("invalid condition value. 
[value={}]", + condition_value); } } @@ -273,15 +273,16 @@ Status DeleteHandler::check_condition_valid(const TabletSchema& schema, const TC const auto& err_msg = fmt::format("column id does not exists in table={}, schema version={},", schema.table_id(), schema.schema_version()); - return Status::Error(err_msg); + return Status::Error(err_msg); } if (!iequal(schema.column_by_uid(cond.column_unique_id).name(), cond.column_name)) { const auto& err_msg = fmt::format( - "colum name={} does not belongs to column uid={}, which column name={}, " + "colum name={} does not belongs to column uid={}, which " + "column name={}, " "delete_cond.column_name ={}", cond.column_name, cond.column_unique_id, schema.column_by_uid(cond.column_unique_id).name(), cond.column_name); - return Status::Error(err_msg); + return Status::Error(err_msg); } return Status::OK(); @@ -289,7 +290,7 @@ Status DeleteHandler::check_condition_valid(const TabletSchema& schema, const TC Status DeleteHandler::parse_condition(const DeleteSubPredicatePB& sub_cond, TCondition* condition) { if (!sub_cond.has_column_name() || !sub_cond.has_op() || !sub_cond.has_cond_value()) { - return Status::Error( + return Status::Error( "fail to parse condition. condition={} {} {}", sub_cond.column_name(), sub_cond.op(), sub_cond.cond_value()); } @@ -335,8 +336,8 @@ Status DeleteHandler::parse_condition(const std::string& condition_str, TConditi << "]"; } if (!matched) { - return Status::Error("fail to sub condition. condition={}", - condition_str); + return Status::Error("fail to sub condition. 
condition={}", + condition_str); } condition->column_name = what[1].str(); diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index 0332e3f2e319d7..683e38775f34c2 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -51,9 +51,9 @@ Status MatchPredicate::evaluate(const vectorized::IndexFieldNameAndTypePair& nam if (iterator == nullptr) { return Status::OK(); } - if (_skip_evaluate(iterator)) { - return Status::Error( - "match predicate evaluate skipped."); + if (_check_evaluate(iterator)) { + return Status::Error( + "phrase queries require setting support_phrase = true"); } auto type = name_with_type.second; const std::string& name = name_with_type.first; @@ -122,13 +122,14 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m return ret; } -bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const { - if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || - _match_type == MatchType::MATCH_PHRASE_EDGE) && - iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && - get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == - INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { - return true; +bool MatchPredicate::_check_evaluate(InvertedIndexIterator* iterator) const { + if (_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || + _match_type == MatchType::MATCH_PHRASE_EDGE) { + if (iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && + get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { + return true; + } } return false; } diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h index 17d8e76ac88e11..ad202b7b2427cf 100644 --- a/be/src/olap/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -79,7 +79,7 @@ class 
MatchPredicate : public ColumnPredicate { std::string info = "MatchPredicate"; return info; } - bool _skip_evaluate(InvertedIndexIterator* iterator) const; + bool _check_evaluate(InvertedIndexIterator* iterator) const; private: std::string _value; diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index e55fd678bd21bc..207778becae4d6 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -48,7 +48,7 @@ bvar::Adder g_memtable_input_block_allocated_size("memtable_input_block using namespace ErrorCode; -MemTable::MemTable(int64_t tablet_id, const TabletSchema* tablet_schema, +MemTable::MemTable(int64_t tablet_id, std::shared_ptr tablet_schema, const std::vector* slot_descs, TupleDescriptor* tuple_desc, bool enable_unique_key_mow, PartialUpdateInfo* partial_update_info, const std::shared_ptr& insert_mem_tracker, @@ -182,19 +182,21 @@ Status MemTable::insert(const vectorized::Block* input_block, const std::vector& row_idxs) { if (_is_first_insertion) { _is_first_insertion = false; - auto cloneBlock = input_block->clone_without_columns(&_column_offset); - _input_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + auto clone_block = input_block->clone_without_columns(&_column_offset); + _input_mutable_block = vectorized::MutableBlock::build_mutable_block(&clone_block); _vec_row_comparator->set_block(&_input_mutable_block); - _output_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + _output_mutable_block = vectorized::MutableBlock::build_mutable_block(&clone_block); if (_keys_type != KeysType::DUP_KEYS) { - _init_agg_functions(input_block); + // there may be additional intermediate columns in input_block + // we only need columns indicated by column offset in the output + _init_agg_functions(&clone_block); } if (_tablet_schema->has_sequence_col()) { if (_is_partial_update) { // for unique key partial update, sequence column index in block // may be different with the index in 
`_tablet_schema` - for (size_t i = 0; i < cloneBlock.columns(); i++) { - if (cloneBlock.get_by_position(i).name == SEQUENCE_COL) { + for (size_t i = 0; i < clone_block.columns(); i++) { + if (clone_block.get_by_position(i).name == SEQUENCE_COL) { _seq_col_idx_in_block = i; break; } diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index d2dfafd972a20d..916067ba1193d2 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -129,7 +129,8 @@ class Tie { class RowInBlockComparator { public: - RowInBlockComparator(const TabletSchema* tablet_schema) : _tablet_schema(tablet_schema) {} + RowInBlockComparator(std::shared_ptr tablet_schema) + : _tablet_schema(tablet_schema) {} // call set_block before operator(). // only first time insert block to create _input_mutable_block, // so can not Comparator of construct to set pblock @@ -137,7 +138,7 @@ class RowInBlockComparator { int operator()(const RowInBlock* left, const RowInBlock* right) const; private: - const TabletSchema* _tablet_schema = nullptr; + std::shared_ptr _tablet_schema; vectorized::MutableBlock* _pblock = nullptr; // corresponds to Memtable::_input_mutable_block }; @@ -168,7 +169,7 @@ class MemTableStat { class MemTable { public: - MemTable(int64_t tablet_id, const TabletSchema* tablet_schema, + MemTable(int64_t tablet_id, std::shared_ptr tablet_schema, const std::vector* slot_descs, TupleDescriptor* tuple_desc, bool enable_unique_key_mow, PartialUpdateInfo* partial_update_info, const std::shared_ptr& insert_mem_tracker, @@ -209,7 +210,7 @@ class MemTable { bool _enable_unique_key_mow = false; bool _is_partial_update = false; const KeysType _keys_type; - const TabletSchema* _tablet_schema = nullptr; + std::shared_ptr _tablet_schema; std::shared_ptr _vec_row_comparator; diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index dc9545d0b34d1c..cfcac0bcb8f8b3 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ 
b/be/src/olap/memtable_flush_executor.cpp @@ -43,8 +43,10 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(flush_thread_pool_thread_num, MetricUnit::NOU bvar::Adder g_flush_task_num("memtable_flush_task_num"); class MemtableFlushTask final : public Runnable { + ENABLE_FACTORY_CREATOR(MemtableFlushTask); + public: - MemtableFlushTask(FlushToken* flush_token, std::unique_ptr memtable, + MemtableFlushTask(std::shared_ptr flush_token, std::unique_ptr memtable, int32_t segment_id, int64_t submit_task_time) : _flush_token(flush_token), _memtable(std::move(memtable)), @@ -56,11 +58,16 @@ class MemtableFlushTask final : public Runnable { ~MemtableFlushTask() override { g_flush_task_num << -1; } void run() override { - _flush_token->_flush_memtable(std::move(_memtable), _segment_id, _submit_task_time); + auto token = _flush_token.lock(); + if (token) { + token->_flush_memtable(std::move(_memtable), _segment_id, _submit_task_time); + } else { + LOG(WARNING) << "flush token is deconstructed, ignore the flush task"; + } } private: - FlushToken* _flush_token; + std::weak_ptr _flush_token; std::unique_ptr _memtable; int32_t _segment_id; int64_t _submit_task_time; @@ -91,8 +98,9 @@ Status FlushToken::submit(std::unique_ptr mem_table) { return Status::OK(); } int64_t submit_task_time = MonotonicNanos(); - auto task = std::make_shared( - this, std::move(mem_table), _rowset_writer->allocate_segment_id(), submit_task_time); + auto task = MemtableFlushTask::create_shared(shared_from_this(), std::move(mem_table), + _rowset_writer->allocate_segment_id(), + submit_task_time); Status ret = _thread_pool->submit(std::move(task)); if (ret.ok()) { // _wait_running_task_finish was executed after this function, so no need to notify _cond here @@ -132,6 +140,7 @@ Status FlushToken::_do_flush_memtable(MemTable* memtable, int32_t segment_id, in SCOPED_RAW_TIMER(&duration_ns); SCOPED_ATTACH_TASK(memtable->query_thread_context()); signal::set_signal_task_id(_rowset_writer->load_id()); + signal::tablet_id = 
memtable->tablet_id(); { SCOPED_CONSUME_MEM_TRACKER(memtable->flush_mem_tracker()); std::unique_ptr block; @@ -224,8 +233,8 @@ void MemTableFlushExecutor::init(int num_disk) { } // NOTE: we use SERIAL mode here to ensure all mem-tables from one tablet are flushed in order. -Status MemTableFlushExecutor::create_flush_token(std::unique_ptr& flush_token, - RowsetWriter* rowset_writer, +Status MemTableFlushExecutor::create_flush_token(std::shared_ptr& flush_token, + std::shared_ptr rowset_writer, bool is_high_priority) { switch (rowset_writer->type()) { case ALPHA_ROWSET: @@ -234,7 +243,7 @@ Status MemTableFlushExecutor::create_flush_token(std::unique_ptr& fl case BETA_ROWSET: { // beta rowset can be flush in CONCURRENT, because each memtable using a new segment writer. ThreadPool* pool = is_high_priority ? _high_prio_flush_pool.get() : _flush_pool.get(); - flush_token = std::make_unique(pool); + flush_token = FlushToken::create_shared(pool); flush_token->set_rowset_writer(rowset_writer); return Status::OK(); } @@ -243,11 +252,11 @@ Status MemTableFlushExecutor::create_flush_token(std::unique_ptr& fl } } -Status MemTableFlushExecutor::create_flush_token(std::unique_ptr& flush_token, - RowsetWriter* rowset_writer, +Status MemTableFlushExecutor::create_flush_token(std::shared_ptr& flush_token, + std::shared_ptr rowset_writer, ThreadPool* wg_flush_pool_ptr) { if (rowset_writer->type() == BETA_ROWSET) { - flush_token = std::make_unique(wg_flush_pool_ptr); + flush_token = FlushToken::create_shared(wg_flush_pool_ptr); } else { return Status::InternalError("not support alpha rowset load now."); } diff --git a/be/src/olap/memtable_flush_executor.h b/be/src/olap/memtable_flush_executor.h index b647c23deb9aae..2d20298f800a37 100644 --- a/be/src/olap/memtable_flush_executor.h +++ b/be/src/olap/memtable_flush_executor.h @@ -55,10 +55,11 @@ std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat); // 1. Immediately disallow submission of any subsequent memtable // 2. 
For the memtables that have already been submitted, there is no need to flush, // because the entire job will definitely fail; -class FlushToken { +class FlushToken : public std::enable_shared_from_this { + ENABLE_FACTORY_CREATOR(FlushToken); + public: - explicit FlushToken(ThreadPool* thread_pool) - : _flush_status(Status::OK()), _thread_pool(thread_pool) {} + FlushToken(ThreadPool* thread_pool) : _flush_status(Status::OK()), _thread_pool(thread_pool) {} Status submit(std::unique_ptr mem_table); @@ -72,7 +73,9 @@ class FlushToken { // get flush operations' statistics const FlushStatistic& get_stats() const { return _stats; } - void set_rowset_writer(RowsetWriter* rowset_writer) { _rowset_writer = rowset_writer; } + void set_rowset_writer(std::shared_ptr rowset_writer) { + _rowset_writer = rowset_writer; + } const MemTableStat& memtable_stat() { return _memtable_stat; } @@ -96,7 +99,7 @@ class FlushToken { FlushStatistic _stats; - RowsetWriter* _rowset_writer = nullptr; + std::shared_ptr _rowset_writer = nullptr; MemTableStat _memtable_stat; @@ -129,10 +132,11 @@ class MemTableFlushExecutor { // because it needs path hash of each data dir. 
void init(int num_disk); - Status create_flush_token(std::unique_ptr& flush_token, RowsetWriter* rowset_writer, - bool is_high_priority); + Status create_flush_token(std::shared_ptr& flush_token, + std::shared_ptr rowset_writer, bool is_high_priority); - Status create_flush_token(std::unique_ptr& flush_token, RowsetWriter* rowset_writer, + Status create_flush_token(std::shared_ptr& flush_token, + std::shared_ptr rowset_writer, ThreadPool* wg_flush_pool_ptr); private: diff --git a/be/src/olap/memtable_writer.cpp b/be/src/olap/memtable_writer.cpp index 13bbff325394a3..114a7841b92204 100644 --- a/be/src/olap/memtable_writer.cpp +++ b/be/src/olap/memtable_writer.cpp @@ -78,17 +78,17 @@ Status MemTableWriter::init(std::shared_ptr rowset_writer, // by assigning segment_id to memtable before submiting to flush executor, // we can make sure same keys sort in the same order in all replicas. if (wg_flush_pool_ptr) { - RETURN_IF_ERROR(ExecEnv::GetInstance() - ->storage_engine() - .memtable_flush_executor() - ->create_flush_token(_flush_token, _rowset_writer.get(), - wg_flush_pool_ptr)); + RETURN_IF_ERROR( + ExecEnv::GetInstance() + ->storage_engine() + .memtable_flush_executor() + ->create_flush_token(_flush_token, _rowset_writer, wg_flush_pool_ptr)); } else { - RETURN_IF_ERROR(ExecEnv::GetInstance() - ->storage_engine() - .memtable_flush_executor() - ->create_flush_token(_flush_token, _rowset_writer.get(), - _req.is_high_priority)); + RETURN_IF_ERROR( + ExecEnv::GetInstance() + ->storage_engine() + .memtable_flush_executor() + ->create_flush_token(_flush_token, _rowset_writer, _req.is_high_priority)); } _is_init = true; @@ -213,8 +213,8 @@ void MemTableWriter::_reset_mem_table() { } { std::lock_guard l(_mem_table_ptr_lock); - _mem_table.reset(new MemTable(_req.tablet_id, _tablet_schema.get(), _req.slots, - _req.tuple_desc, _unique_key_mow, _partial_update_info.get(), + _mem_table.reset(new MemTable(_req.tablet_id, _tablet_schema, _req.slots, _req.tuple_desc, + 
_unique_key_mow, _partial_update_info.get(), mem_table_insert_tracker, mem_table_flush_tracker)); } diff --git a/be/src/olap/memtable_writer.h b/be/src/olap/memtable_writer.h index b34fe0baee4a76..ee7c8e1538a19b 100644 --- a/be/src/olap/memtable_writer.h +++ b/be/src/olap/memtable_writer.h @@ -127,7 +127,9 @@ class MemTableWriter { TabletSchemaSPtr _tablet_schema; bool _unique_key_mow = false; - std::unique_ptr _flush_token; + // This variable is accessed from writer thread and token flush thread + // use a shared ptr to avoid use after free problem. + std::shared_ptr _flush_token; std::vector> _mem_table_insert_trackers; std::vector> _mem_table_flush_trackers; SpinLock _mem_table_tracker_lock; diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index ad70241ad87cea..87792db93a6645 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -336,16 +336,12 @@ Status Merger::vertical_compact_one_group( } // for segcompaction -Status Merger::vertical_compact_one_group(int64_t tablet_id, ReaderType reader_type, - const TabletSchema& tablet_schema, bool is_key, - const std::vector& column_group, - vectorized::RowSourcesBuffer* row_source_buf, - vectorized::VerticalBlockReader& src_block_reader, - segment_v2::SegmentWriter& dst_segment_writer, - int64_t max_rows_per_segment, Statistics* stats_output, - uint64_t* index_size, KeyBoundsPB& key_bounds) { - // build tablet reader - VLOG_NOTICE << "vertical compact one group, max_rows_per_segment=" << max_rows_per_segment; +Status Merger::vertical_compact_one_group( + int64_t tablet_id, ReaderType reader_type, const TabletSchema& tablet_schema, bool is_key, + const std::vector& column_group, vectorized::RowSourcesBuffer* row_source_buf, + vectorized::VerticalBlockReader& src_block_reader, + segment_v2::SegmentWriter& dst_segment_writer, Statistics* stats_output, + uint64_t* index_size, KeyBoundsPB& key_bounds, SimpleRowIdConversion* rowid_conversion) { // TODO: record_rowids vectorized::Block block = 
tablet_schema.create_block(column_group); size_t output_rows = 0; @@ -362,6 +358,9 @@ Status Merger::vertical_compact_one_group(int64_t tablet_id, ReaderType reader_t "failed to write block when merging rowsets of tablet " + std::to_string(tablet_id)); + if (is_key && rowid_conversion != nullptr) { + rowid_conversion->add(src_block_reader.current_block_row_locations()); + } output_rows += block.rows(); block.clear_column_data(); } diff --git a/be/src/olap/merger.h b/be/src/olap/merger.h index 7513c90fbd1217..cb05162b3bc9a1 100644 --- a/be/src/olap/merger.h +++ b/be/src/olap/merger.h @@ -23,6 +23,7 @@ #include "io/io_common.h" #include "olap/iterators.h" #include "olap/rowset/rowset_fwd.h" +#include "olap/simple_rowid_conversion.h" #include "olap/tablet_fwd.h" namespace doris { @@ -82,8 +83,9 @@ class Merger { vectorized::RowSourcesBuffer* row_source_buf, vectorized::VerticalBlockReader& src_block_reader, segment_v2::SegmentWriter& dst_segment_writer, - int64_t max_rows_per_segment, Statistics* stats_output, - uint64_t* index_size, KeyBoundsPB& key_bounds); + Statistics* stats_output, uint64_t* index_size, + KeyBoundsPB& key_bounds, + SimpleRowIdConversion* rowid_conversion); }; } // namespace doris diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index c1a2e3c18b5b66..fee58a87501d66 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -54,6 +54,11 @@ using TabletUid = UniqueId; enum CompactionType { BASE_COMPACTION = 1, CUMULATIVE_COMPACTION = 2, FULL_COMPACTION = 3 }; +enum DataDirType { + SPILL_DISK_DIR, + OLAP_DATA_DIR, +}; + struct DataDirInfo { std::string path; size_t path_hash = 0; @@ -64,6 +69,7 @@ struct DataDirInfo { int64_t trash_used_capacity = 0; bool is_used = false; // whether available mark TStorageMedium::type storage_medium = TStorageMedium::HDD; // Storage medium type: SSD|HDD + DataDirType data_dir_type = DataDirType::OLAP_DATA_DIR; }; struct PredicateFilterInfo { int type = 0; @@ -361,10 +367,13 @@ struct 
OlapReaderStatistics { int64_t inverted_index_query_timer = 0; int64_t inverted_index_query_cache_hit = 0; int64_t inverted_index_query_cache_miss = 0; + int64_t inverted_index_query_null_bitmap_timer = 0; int64_t inverted_index_query_bitmap_copy_timer = 0; int64_t inverted_index_query_bitmap_op_timer = 0; int64_t inverted_index_searcher_open_timer = 0; int64_t inverted_index_searcher_search_timer = 0; + int64_t inverted_index_searcher_cache_hit = 0; + int64_t inverted_index_searcher_cache_miss = 0; int64_t output_index_result_column_timer = 0; // number of segment filtered by column stat when creating seg iterator diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp index 9cd15ff334b3af..7c88156f74cef5 100644 --- a/be/src/olap/olap_server.cpp +++ b/be/src/olap/olap_server.cpp @@ -18,7 +18,10 @@ #include #include #include +#include +#include #include +#include #include #include @@ -26,6 +29,7 @@ #include // IWYU pragma: keep #include #include +#include #include #include #include @@ -35,7 +39,7 @@ #include #include #include -#include +#include #include #include #include @@ -45,9 +49,7 @@ #include "common/logging.h" #include "common/status.h" #include "cpp/sync_point.h" -#include "gen_cpp/BackendService.h" #include "gen_cpp/FrontendService.h" -#include "gen_cpp/Types_constants.h" #include "gen_cpp/internal_service.pb.h" #include "gutil/ref_counted.h" #include "io/fs/file_writer.h" // IWYU pragma: keep @@ -75,9 +77,6 @@ #include "runtime/client_cache.h" #include "runtime/memory/cache_manager.h" #include "runtime/memory/global_memory_arbitrator.h" -#include "service/brpc.h" -#include "service/point_query_executor.h" -#include "util/brpc_client_cache.h" #include "util/countdown_latch.h" #include "util/doris_metrics.h" #include "util/mem_info.h" @@ -100,6 +99,89 @@ volatile uint32_t g_schema_change_active_threads = 0; static const uint64_t DEFAULT_SEED = 104729; static const uint64_t MOD_PRIME = 7652413; 
+CompactionSubmitRegistry::CompactionSubmitRegistry(CompactionSubmitRegistry&& r) { + std::swap(_tablet_submitted_cumu_compaction, r._tablet_submitted_cumu_compaction); + std::swap(_tablet_submitted_base_compaction, r._tablet_submitted_base_compaction); + std::swap(_tablet_submitted_full_compaction, r._tablet_submitted_full_compaction); +} + +CompactionSubmitRegistry CompactionSubmitRegistry::create_snapshot() { + // full compaction is not engaged in this method + std::unique_lock l(_tablet_submitted_compaction_mutex); + CompactionSubmitRegistry registry; + registry._tablet_submitted_base_compaction = _tablet_submitted_base_compaction; + registry._tablet_submitted_cumu_compaction = _tablet_submitted_cumu_compaction; + return registry; +} + +void CompactionSubmitRegistry::reset(const std::vector& stores) { + // full compaction is not engaged in this method + for (const auto& store : stores) { + _tablet_submitted_cumu_compaction[store] = {}; + _tablet_submitted_base_compaction[store] = {}; + } +} + +uint32_t CompactionSubmitRegistry::count_executing_compaction(DataDir* dir, + CompactionType compaction_type) { + // non-lock, used in snapshot + const auto& compaction_tasks = _get_tablet_set(dir, compaction_type); + return std::count_if(compaction_tasks.begin(), compaction_tasks.end(), [](const auto& task) { + return task->compaction_stage == CompactionStage::EXECUTING; + }); +} + +uint32_t CompactionSubmitRegistry::count_executing_cumu_and_base(DataDir* dir) { + // non-lock, used in snapshot + return count_executing_compaction(dir, CompactionType::BASE_COMPACTION) + + count_executing_compaction(dir, CompactionType::CUMULATIVE_COMPACTION); +} + +bool CompactionSubmitRegistry::has_compaction_task(DataDir* dir, CompactionType compaction_type) { + // non-lock, used in snapshot + return !_get_tablet_set(dir, compaction_type).empty(); +} + +std::vector CompactionSubmitRegistry::pick_topn_tablets_for_compaction( + TabletManager* tablet_mgr, DataDir* data_dir, CompactionType 
compaction_type, + const CumuCompactionPolicyTable& cumu_compaction_policies, uint32_t* disk_max_score) { + // non-lock, used in snapshot + return tablet_mgr->find_best_tablets_to_compaction(compaction_type, data_dir, + _get_tablet_set(data_dir, compaction_type), + disk_max_score, cumu_compaction_policies); +} + +bool CompactionSubmitRegistry::insert(TabletSharedPtr tablet, CompactionType compaction_type) { + std::unique_lock l(_tablet_submitted_compaction_mutex); + auto& tablet_set = _get_tablet_set(tablet->data_dir(), compaction_type); + bool already_exist = !(tablet_set.insert(tablet).second); + return already_exist; +} + +void CompactionSubmitRegistry::remove(TabletSharedPtr tablet, CompactionType compaction_type, + std::function wakeup_cb) { + std::unique_lock l(_tablet_submitted_compaction_mutex); + auto& tablet_set = _get_tablet_set(tablet->data_dir(), compaction_type); + size_t removed = tablet_set.erase(tablet); + if (removed == 1) { + wakeup_cb(); + } +} + +CompactionSubmitRegistry::TabletSet& CompactionSubmitRegistry::_get_tablet_set( + DataDir* dir, CompactionType compaction_type) { + switch (compaction_type) { + case CompactionType::BASE_COMPACTION: + return _tablet_submitted_base_compaction[dir]; + case CompactionType::CUMULATIVE_COMPACTION: + return _tablet_submitted_cumu_compaction[dir]; + case CompactionType::FULL_COMPACTION: + return _tablet_submitted_full_compaction[dir]; + default: + CHECK(false) << "invalid compaction type"; + } +} + static int32_t get_cumu_compaction_threads_num(size_t data_dirs_num) { int32_t threads_num = config::max_cumu_compaction_threads; if (threads_num == -1) { @@ -219,12 +301,6 @@ Status StorageEngine::start_bg_threads() { [this]() { this->_tablet_path_check_callback(); }, &_tablet_path_check_thread)); LOG(INFO) << "tablet path check thread started"; - // cache clean thread - RETURN_IF_ERROR(Thread::create( - "StorageEngine", "cache_clean_thread", [this]() { this->_cache_clean_callback(); }, - &_cache_clean_thread)); - 
LOG(INFO) << "cache clean thread started"; - // path scan and gc thread if (config::path_gc_check) { for (auto data_dir : get_stores()) { @@ -277,42 +353,6 @@ Status StorageEngine::start_bg_threads() { return Status::OK(); } -void StorageEngine::_cache_clean_callback() { - int32_t interval = config::cache_periodic_prune_stale_sweep_sec; - while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))) { - if (interval <= 0) { - LOG(WARNING) << "config of cache clean interval is illegal: [" << interval - << "], force set to 3600 "; - interval = 3600; - } - if (config::disable_memory_gc) { - continue; - } - - CacheManager::instance()->for_each_cache_prune_stale(); - - // Dynamically modify the config to clear the cache, each time the disable cache will only be cleared once. - if (config::disable_segment_cache) { - if (!_clear_segment_cache) { - CacheManager::instance()->clear_once(CachePolicy::CacheType::SEGMENT_CACHE); - _clear_segment_cache = true; - } - } else { - _clear_segment_cache = false; - } - if (config::disable_storage_page_cache) { - if (!_clear_page_cache) { - CacheManager::instance()->clear_once(CachePolicy::CacheType::DATA_PAGE_CACHE); - CacheManager::instance()->clear_once(CachePolicy::CacheType::INDEXPAGE_CACHE); - CacheManager::instance()->clear_once(CachePolicy::CacheType::PK_INDEX_PAGE_CACHE); - _clear_page_cache = true; - } - } else { - _clear_page_cache = false; - } - } -} - void StorageEngine::_garbage_sweeper_thread_callback() { uint32_t max_interval = config::max_garbage_sweep_interval; uint32_t min_interval = config::min_garbage_sweep_interval; @@ -371,23 +411,6 @@ void StorageEngine::_disk_stat_monitor_thread_callback() { } while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))); } -void StorageEngine::check_cumulative_compaction_config() { - int64_t promotion_size = config::compaction_promotion_size_mbytes; - int64_t promotion_min_size = config::compaction_promotion_min_size_mbytes; - int64_t 
compaction_min_size = config::compaction_min_size_mbytes; - - // check size_based_promotion_size must be greater than size_based_promotion_min_size and 2 * size_based_compaction_lower_bound_size - int64_t should_min_promotion_size = std::max(promotion_min_size, 2 * compaction_min_size); - - if (promotion_size < should_min_promotion_size) { - promotion_size = should_min_promotion_size; - LOG(WARNING) << "the config promotion_size is adjusted to " - "promotion_min_size or 2 * " - "compaction_min_size " - << should_min_promotion_size << ", because size_based_promotion_size is small"; - } -} - void StorageEngine::_unused_rowset_monitor_thread_callback() { int32_t interval = config::unused_rowset_monitor_interval; do { @@ -605,13 +628,8 @@ void StorageEngine::_adjust_compaction_thread_num() { void StorageEngine::_compaction_tasks_producer_callback() { LOG(INFO) << "try to start compaction producer process!"; - std::unordered_set tablet_submitted_cumu; - std::unordered_set tablet_submitted_base; std::vector data_dirs = get_stores(); - for (auto& data_dir : data_dirs) { - _tablet_submitted_cumu_compaction[data_dir] = tablet_submitted_cumu; - _tablet_submitted_base_compaction[data_dir] = tablet_submitted_base; - } + _compaction_submit_registry.reset(data_dirs); int round = 0; CompactionType compaction_type; @@ -783,13 +801,13 @@ Status StorageEngine::_submit_single_replica_compaction_task(TabletSharedPtr tab // Therefore, it is currently not possible to determine whether it should be a base compaction or cumulative compaction. // As a result, the tablet needs to be pushed to both the _tablet_submitted_cumu_compaction and the _tablet_submitted_base_compaction simultaneously. 
bool already_exist = - _push_tablet_into_submitted_compaction(tablet, CompactionType::CUMULATIVE_COMPACTION); + _compaction_submit_registry.insert(tablet, CompactionType::CUMULATIVE_COMPACTION); if (already_exist) { return Status::AlreadyExist( "compaction task has already been submitted, tablet_id={}", tablet->tablet_id()); } - already_exist = _push_tablet_into_submitted_compaction(tablet, CompactionType::BASE_COMPACTION); + already_exist = _compaction_submit_registry.insert(tablet, CompactionType::BASE_COMPACTION); if (already_exist) { _pop_tablet_from_submitted_compaction(tablet, CompactionType::CUMULATIVE_COMPACTION); return Status::AlreadyExist( @@ -847,12 +865,17 @@ void StorageEngine::get_tablet_rowset_versions(const PGetTabletVersionsRequest* response->mutable_status()->set_status_code(0); } -bool need_generate_compaction_tasks(int count, int thread_per_disk, CompactionType compaction_type, - bool all_base) { - if (count >= thread_per_disk) { +bool need_generate_compaction_tasks(int task_cnt_per_disk, int thread_per_disk, + CompactionType compaction_type, bool all_base) { + // We need to reserve at least one Slot for cumulative compaction. + // So when there is only one Slot, we have to judge whether there is a cumulative compaction + // in the current submitted tasks. + // If so, the last Slot can be assigned to Base compaction, + // otherwise, this Slot needs to be reserved for cumulative compaction. + if (task_cnt_per_disk >= thread_per_disk) { // Return if no available slot return false; - } else if (count >= thread_per_disk - 1) { + } else if (task_cnt_per_disk >= thread_per_disk - 1) { // Only one slot left, check if it can be assigned to base compaction task. 
if (compaction_type == CompactionType::BASE_COMPACTION) { if (all_base) { @@ -886,15 +909,17 @@ int get_concurrent_per_disk(int max_score, int thread_per_disk) { return thread_per_disk; } -int StorageEngine::_get_executing_compaction_num( - std::unordered_set& compaction_tasks) { - int num = 0; - for (const auto& task : compaction_tasks) { - if (task->compaction_stage == CompactionStage::EXECUTING) { - num++; - } - } - return num; +int32_t disk_compaction_slot_num(const DataDir& data_dir) { + return data_dir.is_ssd_disk() ? config::compaction_task_num_per_fast_disk + : config::compaction_task_num_per_disk; +} + +bool has_free_compaction_slot(CompactionSubmitRegistry* registry, DataDir* dir, + CompactionType compaction_type, uint32_t executing_cnt) { + int32_t thread_per_disk = disk_compaction_slot_num(*dir); + return need_generate_compaction_tasks( + executing_cnt, thread_per_disk, compaction_type, + !registry->has_compaction_task(dir, CompactionType::CUMULATIVE_COMPACTION)); } std::vector StorageEngine::_generate_compaction_tasks( @@ -909,27 +934,13 @@ std::vector StorageEngine::_generate_compaction_tasks( // Copy _tablet_submitted_xxx_compaction map so that we don't need to hold _tablet_submitted_compaction_mutex // when traversing the data dir - std::map> copied_cumu_map; - std::map> copied_base_map; - { - std::unique_lock lock(_tablet_submitted_compaction_mutex); - copied_cumu_map = _tablet_submitted_cumu_compaction; - copied_base_map = _tablet_submitted_base_compaction; - } - for (auto data_dir : data_dirs) { + auto compaction_registry_snapshot = _compaction_submit_registry.create_snapshot(); + for (auto* data_dir : data_dirs) { bool need_pick_tablet = true; - // We need to reserve at least one Slot for cumulative compaction. - // So when there is only one Slot, we have to judge whether there is a cumulative compaction - // in the current submitted tasks. 
- // If so, the last Slot can be assigned to Base compaction, - // otherwise, this Slot needs to be reserved for cumulative compaction. - int count = _get_executing_compaction_num(copied_cumu_map[data_dir]) + - _get_executing_compaction_num(copied_base_map[data_dir]); - int thread_per_disk = data_dir->is_ssd_disk() ? config::compaction_task_num_per_fast_disk - : config::compaction_task_num_per_disk; - - need_pick_tablet = need_generate_compaction_tasks(count, thread_per_disk, compaction_type, - copied_cumu_map[data_dir].empty()); + uint32_t executing_task_num = + compaction_registry_snapshot.count_executing_cumu_and_base(data_dir); + need_pick_tablet = has_free_compaction_slot(&compaction_registry_snapshot, data_dir, + compaction_type, executing_task_num); if (!need_pick_tablet && !check_score) { continue; } @@ -938,15 +949,15 @@ std::vector StorageEngine::_generate_compaction_tasks( // So that we can update the max_compaction_score metric. if (!data_dir->reach_capacity_limit(0)) { uint32_t disk_max_score = 0; - auto tablets = _tablet_manager->find_best_tablets_to_compaction( - compaction_type, data_dir, - compaction_type == CompactionType::CUMULATIVE_COMPACTION - ? 
copied_cumu_map[data_dir] - : copied_base_map[data_dir], - &disk_max_score, _cumulative_compaction_policies); - int concurrent_num = get_concurrent_per_disk(disk_max_score, thread_per_disk); + auto tablets = compaction_registry_snapshot.pick_topn_tablets_for_compaction( + _tablet_manager.get(), data_dir, compaction_type, + _cumulative_compaction_policies, &disk_max_score); + int concurrent_num = + get_concurrent_per_disk(disk_max_score, disk_compaction_slot_num(*data_dir)); need_pick_tablet = need_generate_compaction_tasks( - count, concurrent_num, compaction_type, copied_cumu_map[data_dir].empty()); + executing_task_num, concurrent_num, compaction_type, + !compaction_registry_snapshot.has_compaction_task( + data_dir, CompactionType::CUMULATIVE_COMPACTION)); for (const auto& tablet : tablets) { if (tablet != nullptr) { if (need_pick_tablet) { @@ -981,48 +992,13 @@ void StorageEngine::_update_cumulative_compaction_policy() { } } -bool StorageEngine::_push_tablet_into_submitted_compaction(TabletSharedPtr tablet, - CompactionType compaction_type) { - std::unique_lock lock(_tablet_submitted_compaction_mutex); - bool already_existed = false; - switch (compaction_type) { - case CompactionType::CUMULATIVE_COMPACTION: - already_existed = - !(_tablet_submitted_cumu_compaction[tablet->data_dir()].insert(tablet).second); - break; - case CompactionType::BASE_COMPACTION: - already_existed = - !(_tablet_submitted_base_compaction[tablet->data_dir()].insert(tablet).second); - break; - case CompactionType::FULL_COMPACTION: - already_existed = - !(_tablet_submitted_full_compaction[tablet->data_dir()].insert(tablet).second); - break; - } - return already_existed; -} - void StorageEngine::_pop_tablet_from_submitted_compaction(TabletSharedPtr tablet, CompactionType compaction_type) { - std::unique_lock lock(_tablet_submitted_compaction_mutex); - int removed = 0; - switch (compaction_type) { - case CompactionType::CUMULATIVE_COMPACTION: - removed = 
_tablet_submitted_cumu_compaction[tablet->data_dir()].erase(tablet); - break; - case CompactionType::BASE_COMPACTION: - removed = _tablet_submitted_base_compaction[tablet->data_dir()].erase(tablet); - break; - case CompactionType::FULL_COMPACTION: - removed = _tablet_submitted_full_compaction[tablet->data_dir()].erase(tablet); - break; - } - - if (removed == 1) { + _compaction_submit_registry.remove(tablet, compaction_type, [this]() { std::unique_lock lock(_compaction_producer_sleep_mutex); _wakeup_producer_flag = 1; _compaction_producer_sleep_cv.notify_one(); - } + }); } Status StorageEngine::_submit_compaction_task(TabletSharedPtr tablet, @@ -1039,7 +1015,7 @@ Status StorageEngine::_submit_compaction_task(TabletSharedPtr tablet, return Status::OK(); } - bool already_exist = _push_tablet_into_submitted_compaction(tablet, compaction_type); + bool already_exist = _compaction_submit_registry.insert(tablet, compaction_type); if (already_exist) { return Status::AlreadyExist( "compaction task has already been submitted, tablet_id={}, compaction_type={}.", @@ -1097,7 +1073,26 @@ Status StorageEngine::_submit_compaction_task(TabletSharedPtr tablet, } Status StorageEngine::submit_compaction_task(TabletSharedPtr tablet, CompactionType compaction_type, - bool force) { + bool force, bool eager) { + if (!eager) { + DCHECK(compaction_type == CompactionType::BASE_COMPACTION || + compaction_type == CompactionType::CUMULATIVE_COMPACTION); + auto compaction_registry_snapshot = _compaction_submit_registry.create_snapshot(); + auto stores = get_stores(); + + bool is_busy = std::none_of( + stores.begin(), stores.end(), + [&compaction_registry_snapshot, compaction_type](auto* data_dir) { + return has_free_compaction_slot( + &compaction_registry_snapshot, data_dir, compaction_type, + compaction_registry_snapshot.count_executing_cumu_and_base(data_dir)); + }); + if (is_busy) { + LOG_EVERY_N(WARNING, 100) + << "Too busy to submit a compaction task, tablet=" << tablet->get_table_id(); + 
return Status::OK(); + } + } _update_cumulative_compaction_policy(); // alter table tableName set ("compaction_policy"="time_series") // if atler table's compaction policy, we need to modify tablet compaction policy shared ptr diff --git a/be/src/olap/options.cpp b/be/src/olap/options.cpp index bf472b6ef52bd6..cd53e6c0b1ffa9 100644 --- a/be/src/olap/options.cpp +++ b/be/src/olap/options.cpp @@ -221,7 +221,7 @@ Status parse_conf_cache_paths(const std::string& config_path, std::vector& partial_update_cols, bool is_strict_mode, int64_t timestamp_ms, const std::string& timezone, - const std::string& auto_increment_column) { + const std::string& auto_increment_column, int64_t cur_max_version = -1) { is_partial_update = partial_update; partial_update_input_columns = partial_update_cols; - + max_version_in_flush_phase = cur_max_version; this->timestamp_ms = timestamp_ms; this->timezone = timezone; missing_cids.clear(); @@ -91,6 +91,7 @@ struct PartialUpdateInfo { public: bool is_partial_update {false}; + int64_t max_version_in_flush_phase {-1}; std::set partial_update_input_columns; std::vector missing_cids; std::vector update_cids; diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 248ed10d05cc00..feb7d24dda2705 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -117,6 +117,10 @@ Status PushHandler::_do_streaming_ingestion(TabletSharedPtr tablet, const TPushR } std::shared_lock base_migration_rlock(tablet->get_migration_lock(), std::try_to_lock); + DBUG_EXECUTE_IF("PushHandler::_do_streaming_ingestion.try_lock_fail", { + return Status::Error( + "PushHandler::_do_streaming_ingestion get lock failed"); + }) if (!base_migration_rlock.owns_lock()) { return Status::Error( "PushHandler::_do_streaming_ingestion get lock failed"); diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index a76cbe636eef90..6d917c78d956cc 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ 
b/be/src/olap/rowset/beta_rowset.cpp @@ -40,6 +40,7 @@ #include "olap/rowset/beta_rowset_reader.h" #include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" +#include "olap/rowset/segment_v2/inverted_index_file_reader.h" #include "olap/tablet_schema.h" #include "olap/utils.h" #include "util/crc32c.h" @@ -174,6 +175,7 @@ Status BetaRowset::load_segment(int64_t seg_id, segment_v2::SegmentSharedPtr* se .cache_type = config::enable_file_cache ? io::FileCachePolicy::FILE_BLOCK_CACHE : io::FileCachePolicy::NO_CACHE, .is_doris_table = true, + .cache_base_path = "", .file_size = _rowset_meta->segment_file_size(seg_id), }; auto s = segment_v2::Segment::open(fs, seg_path, seg_id, rowset_id(), _schema, reader_options, @@ -532,6 +534,7 @@ Status BetaRowset::check_current_rowset_segment() { .cache_type = config::enable_file_cache ? io::FileCachePolicy::FILE_BLOCK_CACHE : io::FileCachePolicy::NO_CACHE, .is_doris_table = true, + .cache_base_path {}, .file_size = _rowset_meta->segment_file_size(seg_id), }; auto s = segment_v2::Segment::open(fs, seg_path, seg_id, rowset_id(), _schema, @@ -634,54 +637,51 @@ Status BetaRowset::add_to_binlog() { return Status::OK(); } -Status BetaRowset::calc_local_file_crc(uint32_t* crc_value, int64_t* file_count) { - if (!is_local()) { - DCHECK(false) << _rowset_meta->tablet_id() << ' ' << rowset_id(); - return Status::OK(); - } - +Status BetaRowset::calc_file_crc(uint32_t* crc_value, int64_t* file_count) { + const auto& fs = _rowset_meta->fs(); + DBUG_EXECUTE_IF("fault_inject::BetaRowset::calc_file_crc", + { return Status::Error("fault_inject calc_file_crc error"); }); if (num_segments() < 1) { *crc_value = 0x92a8fc17; // magic code from crc32c table return Status::OK(); } // 1. 
pick up all the files including dat file and idx file - std::vector local_paths; - for (int i = 0; i < num_segments(); ++i) { - auto local_seg_path = local_segment_path(_tablet_path, rowset_id().to_string(), i); - local_paths.emplace_back(local_seg_path); + std::vector file_paths; + for (int seg_id = 0; seg_id < num_segments(); ++seg_id) { + auto seg_path = DORIS_TRY(segment_path(seg_id)); + file_paths.emplace_back(seg_path); if (_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { for (auto& column : _schema->columns()) { const TabletIndex* index_meta = _schema->get_inverted_index(*column); if (index_meta) { - std::string local_inverted_index_file = + std::string inverted_index_file = InvertedIndexDescriptor::get_index_file_path_v1( - InvertedIndexDescriptor::get_index_file_path_prefix( - local_seg_path), + InvertedIndexDescriptor::get_index_file_path_prefix(seg_path), index_meta->index_id(), index_meta->get_index_suffix()); - local_paths.emplace_back(std::move(local_inverted_index_file)); + file_paths.emplace_back(std::move(inverted_index_file)); } } } else { if (_schema->has_inverted_index()) { - std::string local_inverted_index_file = - InvertedIndexDescriptor::get_index_file_path_v2( - InvertedIndexDescriptor::get_index_file_path_prefix( - local_seg_path)); - local_paths.emplace_back(std::move(local_inverted_index_file)); + std::string inverted_index_file = InvertedIndexDescriptor::get_index_file_path_v2( + InvertedIndexDescriptor::get_index_file_path_prefix(seg_path)); + file_paths.emplace_back(std::move(inverted_index_file)); } } } + *crc_value = 0; + *file_count = file_paths.size(); + if (!is_local()) { + return Status::OK(); + } // 2. 
calculate the md5sum of each file const auto& local_fs = io::global_local_filesystem(); - DCHECK(!local_paths.empty()); + DCHECK(!file_paths.empty()); std::vector all_file_md5; - all_file_md5.reserve(local_paths.size()); - for (const auto& file_path : local_paths) { - DBUG_EXECUTE_IF("fault_inject::BetaRowset::calc_local_file_crc", { - return Status::Error("fault_inject calc_local_file_crc error"); - }); + all_file_md5.reserve(file_paths.size()); + for (const auto& file_path : file_paths) { std::string file_md5sum; auto status = local_fs->md5sum(file_path, &file_md5sum); if (!status.ok()) { @@ -694,9 +694,7 @@ Status BetaRowset::calc_local_file_crc(uint32_t* crc_value, int64_t* file_count) std::sort(all_file_md5.begin(), all_file_md5.end()); // 3. calculate the crc_value based on all_file_md5 - DCHECK(local_paths.size() == all_file_md5.size()); - *crc_value = 0; - *file_count = local_paths.size(); + DCHECK(file_paths.size() == all_file_md5.size()); for (auto& i : all_file_md5) { *crc_value = crc32c::Extend(*crc_value, i.data(), i.size()); } @@ -704,4 +702,129 @@ Status BetaRowset::calc_local_file_crc(uint32_t* crc_value, int64_t* file_count) return Status::OK(); } +Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, + rapidjson::Document::AllocatorType& allocator) { + const auto& fs = _rowset_meta->fs(); + auto storage_format = _schema->get_inverted_index_storage_format(); + auto format_str = storage_format == InvertedIndexStorageFormatPB::V1 ? 
"V1" : "V2"; + auto rs_id = rowset_id().to_string(); + rowset_value->AddMember("rowset_id", rapidjson::Value(rs_id.c_str(), allocator), allocator); + rowset_value->AddMember("index_storage_format", rapidjson::Value(format_str, allocator), + allocator); + rapidjson::Value segments(rapidjson::kArrayType); + for (int seg_id = 0; seg_id < num_segments(); ++seg_id) { + rapidjson::Value segment(rapidjson::kObjectType); + segment.AddMember("segment_id", rapidjson::Value(seg_id).Move(), allocator); + + auto seg_path = DORIS_TRY(segment_path(seg_id)); + auto index_file_path_prefix = InvertedIndexDescriptor::get_index_file_path_prefix(seg_path); + auto inverted_index_file_reader = std::make_unique( + fs, std::string(index_file_path_prefix), storage_format); + RETURN_IF_ERROR(inverted_index_file_reader->init()); + auto dirs = inverted_index_file_reader->get_all_directories(); + + auto add_file_info_to_json = [&](const std::string& path, + rapidjson::Value& json_value) -> Status { + json_value.AddMember("idx_file_path", rapidjson::Value(path.c_str(), allocator), + allocator); + int64_t idx_file_size = 0; + auto st = fs->file_size(path, &idx_file_size); + if (st != Status::OK()) { + LOG(WARNING) << "show nested index file get file size error, file: " << path + << ", error: " << st.msg(); + return st; + } + json_value.AddMember("idx_file_size", rapidjson::Value(idx_file_size).Move(), + allocator); + return Status::OK(); + }; + + auto process_files = [&allocator, &inverted_index_file_reader]( + auto& index_meta, rapidjson::Value& indices, + rapidjson::Value& index) -> Status { + rapidjson::Value files_value(rapidjson::kArrayType); + std::vector files; + auto ret = inverted_index_file_reader->open(&index_meta); + if (!ret.has_value()) { + LOG(INFO) << "InvertedIndexFileReader open error:" << ret.error(); + return Status::InternalError("InvertedIndexFileReader open error"); + } + using T = std::decay_t; + auto reader = std::forward(ret).value(); + reader->list(&files); + for (auto& 
file : files) { + rapidjson::Value file_value(rapidjson::kObjectType); + auto size = reader->fileLength(file.c_str()); + file_value.AddMember("name", rapidjson::Value(file.c_str(), allocator), allocator); + file_value.AddMember("size", rapidjson::Value(size).Move(), allocator); + files_value.PushBack(file_value, allocator); + } + index.AddMember("files", files_value, allocator); + indices.PushBack(index, allocator); + return Status::OK(); + }; + + if (storage_format != InvertedIndexStorageFormatPB::V1) { + auto path = InvertedIndexDescriptor::get_index_file_path_v2(index_file_path_prefix); + auto st = add_file_info_to_json(path, segment); + if (!st.ok()) { + return st; + } + rapidjson::Value indices(rapidjson::kArrayType); + for (auto& dir : *dirs) { + rapidjson::Value index(rapidjson::kObjectType); + auto index_id = dir.first.first; + auto index_suffix = dir.first.second; + index.AddMember("index_id", rapidjson::Value(index_id).Move(), allocator); + index.AddMember("index_suffix", rapidjson::Value(index_suffix.c_str(), allocator), + allocator); + + rapidjson::Value files_value(rapidjson::kArrayType); + std::vector files; + doris::TabletIndexPB index_pb; + index_pb.set_index_id(index_id); + index_pb.set_index_suffix_name(index_suffix); + TabletIndex index_meta; + index_meta.init_from_pb(index_pb); + + auto status = process_files(index_meta, indices, index); + if (!status.ok()) { + return status; + } + } + segment.AddMember("indices", indices, allocator); + segments.PushBack(segment, allocator); + } else { + rapidjson::Value indices(rapidjson::kArrayType); + for (auto column : _rowset_meta->tablet_schema()->columns()) { + const auto* index_meta = _rowset_meta->tablet_schema()->get_inverted_index(*column); + if (index_meta == nullptr) { + continue; + } + rapidjson::Value index(rapidjson::kObjectType); + auto index_id = index_meta->index_id(); + auto index_suffix = index_meta->get_index_suffix(); + index.AddMember("index_id", rapidjson::Value(index_id).Move(), 
allocator); + index.AddMember("index_suffix", rapidjson::Value(index_suffix.c_str(), allocator), + allocator); + auto path = InvertedIndexDescriptor::get_index_file_path_v1(index_file_path_prefix, + index_id, index_suffix); + auto st = add_file_info_to_json(path, index); + if (!st.ok()) { + return st; + } + + auto status = process_files(*index_meta, indices, index); + if (!status.ok()) { + return status; + } + } + segment.AddMember("indices", indices, allocator); + segments.PushBack(segment, allocator); + } + } + rowset_value->AddMember("segments", segments, allocator); + return Status::OK(); +} + } // namespace doris diff --git a/be/src/olap/rowset/beta_rowset.h b/be/src/olap/rowset/beta_rowset.h index bf7daf8bdfa6f9..52d5ac5c8a8742 100644 --- a/be/src/olap/rowset/beta_rowset.h +++ b/be/src/olap/rowset/beta_rowset.h @@ -84,7 +84,10 @@ class BetaRowset final : public Rowset { [[nodiscard]] virtual Status add_to_binlog() override; - Status calc_local_file_crc(uint32_t* crc_value, int64_t* file_count); + Status calc_file_crc(uint32_t* crc_value, int64_t* file_count); + + Status show_nested_index_file(rapidjson::Value* rowset_value, + rapidjson::Document::AllocatorType& allocator); protected: BetaRowset(const TabletSchemaSPtr& schema, const RowsetMetaSharedPtr& rowset_meta, diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index 17801ec16fd9da..4481f3b18c0194 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -303,7 +303,9 @@ Status BetaRowsetWriter::_load_noncompacted_segment(segment_v2::SegmentSharedPtr ? (config::enable_file_cache ? 
io::FileCachePolicy::FILE_BLOCK_CACHE : io::FileCachePolicy::NO_CACHE) : io::FileCachePolicy::NO_CACHE, - .is_doris_table = true}; + .is_doris_table = true, + .cache_base_path {}, + }; auto s = segment_v2::Segment::open(io::global_local_filesystem(), path, segment_id, rowset_id(), _context.tablet_schema, reader_options, &segment); if (!s.ok()) { @@ -339,7 +341,12 @@ Status BetaRowsetWriter::_find_longest_consecutive_small_segment( if (is_large_segment) { if (segid == _segcompacted_point) { // skip large segments at the front + auto dst_seg_id = _num_segcompacted.load(); RETURN_IF_ERROR(_rename_compacted_segment_plain(_segcompacted_point++)); + if (_segcompaction_worker->need_convert_delete_bitmap()) { + _segcompaction_worker->convert_segment_delete_bitmap( + _context.mow_context->delete_bitmap, segid, dst_seg_id); + } continue; } else { // stop because we need consecutive segments @@ -364,7 +371,13 @@ Status BetaRowsetWriter::_find_longest_consecutive_small_segment( } if (s == 1) { // poor bachelor, let it go VLOG_DEBUG << "only one candidate segment"; + auto src_seg_id = _segcompacted_point.load(); + auto dst_seg_id = _num_segcompacted.load(); RETURN_IF_ERROR(_rename_compacted_segment_plain(_segcompacted_point++)); + if (_segcompaction_worker->need_convert_delete_bitmap()) { + _segcompaction_worker->convert_segment_delete_bitmap( + _context.mow_context->delete_bitmap, src_seg_id, dst_seg_id); + } segments->clear(); return Status::OK(); } @@ -552,7 +565,7 @@ Status BetaRowsetWriter::_segcompaction_rename_last_segments() { "code: {}", _segcompaction_status.load()); } - if (!_is_segcompacted() || _segcompacted_point == _num_segment) { + if (!is_segcompacted() || _segcompacted_point == _num_segment) { // no need if never segcompact before or all segcompacted return Status::OK(); } @@ -560,7 +573,12 @@ Status BetaRowsetWriter::_segcompaction_rename_last_segments() { // so that transaction can be committed ASAP VLOG_DEBUG << "segcompaction last few segments"; for 
(int32_t segid = _segcompacted_point; segid < _num_segment; segid++) { + auto dst_segid = _num_segcompacted.load(); RETURN_IF_ERROR(_rename_compacted_segment_plain(_segcompacted_point++)); + if (_segcompaction_worker->need_convert_delete_bitmap()) { + _segcompaction_worker->convert_segment_delete_bitmap( + _context.mow_context->delete_bitmap, segid, dst_segid); + } } return Status::OK(); } @@ -680,6 +698,20 @@ Status BetaRowsetWriter::_close_file_writers() { RETURN_NOT_OK_STATUS_WITH_WARN(seg_comp_file_writer->close(), "close segment compaction worker failed"); } + // process delete bitmap for mow table + if (is_segcompacted() && _segcompaction_worker->need_convert_delete_bitmap()) { + auto converted_delete_bitmap = _segcompaction_worker->get_converted_delete_bitmap(); + // which means the segment compaction is triggerd + if (converted_delete_bitmap != nullptr) { + RowsetIdUnorderedSet rowsetids; + rowsetids.insert(rowset_id()); + context().tablet->add_sentinel_mark_to_delete_bitmap(converted_delete_bitmap.get(), + rowsetids); + context().mow_context->delete_bitmap->remove({rowset_id(), 0, 0}, + {rowset_id(), UINT32_MAX, INT64_MAX}); + context().mow_context->delete_bitmap->merge(*converted_delete_bitmap); + } + } } return Status::OK(); } @@ -717,7 +749,7 @@ int64_t BaseBetaRowsetWriter::_num_seg() const { } int64_t BetaRowsetWriter::_num_seg() const { - return _is_segcompacted() ? _num_segcompacted : _num_segment; + return is_segcompacted() ? 
_num_segcompacted : _num_segment; } // update tablet schema when meet variant columns, before commit_txn @@ -859,10 +891,12 @@ Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( writer_options.rowset_ctx = &_context; writer_options.write_type = _context.write_type; writer_options.write_type = DataWriteType::TYPE_COMPACTION; + writer_options.max_rows_per_segment = _context.max_rows_per_segment; + writer_options.mow_ctx = _context.mow_context; - *writer = std::make_unique( - file_writer.get(), _num_segcompacted, _context.tablet_schema, _context.tablet, - _context.data_dir, _context.max_rows_per_segment, writer_options, _context.mow_context); + *writer = std::make_unique(file_writer.get(), _num_segcompacted, + _context.tablet_schema, _context.tablet, + _context.data_dir, writer_options); if (auto& seg_writer = _segcompaction_worker->get_file_writer(); seg_writer != nullptr && seg_writer->state() != io::FileWriter::State::CLOSED) { RETURN_IF_ERROR(_segcompaction_worker->get_file_writer()->close()); diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index 98bb43c6092620..f8033accfca6b7 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -227,6 +227,8 @@ class BetaRowsetWriter : public BaseBetaRowsetWriter { std::unique_ptr* writer, uint64_t index_size, KeyBoundsPB& key_bounds); + bool is_segcompacted() const { return _num_segcompacted > 0; } + private: // segment compaction friend class SegcompactionWorker; @@ -240,7 +242,6 @@ class BetaRowsetWriter : public BaseBetaRowsetWriter { Status _segcompaction_rename_last_segments(); Status _load_noncompacted_segment(segment_v2::SegmentSharedPtr& segment, int32_t segment_id); Status _find_longest_consecutive_small_segment(SegCompactionCandidatesSharedPtr& segments); - bool _is_segcompacted() const { return _num_segcompacted > 0; } bool _check_and_set_is_doing_segcompaction(); Status _rename_compacted_segments(int64_t 
begin, int64_t end); Status _rename_compacted_segment_plain(uint64_t seg_id); diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index 310d0901b2a751..6050a33bfc2f5d 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -169,6 +169,7 @@ class Rowset : public std::enable_shared_from_this { bool is_segments_overlapping() const { return rowset_meta()->is_segments_overlapping(); } KeysType keys_type() { return _schema->keys_type(); } RowsetStatePB rowset_meta_state() const { return rowset_meta()->rowset_state(); } + bool produced_by_compaction() const { return rowset_meta()->produced_by_compaction(); } // remove all files in this rowset // TODO should we rename the method to remove_files() to be more specific? diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h index aa20b5b1ef13ac..c5a573d760c305 100644 --- a/be/src/olap/rowset/rowset_meta.h +++ b/be/src/olap/rowset/rowset_meta.h @@ -255,6 +255,12 @@ class RowsetMeta { return num_segments() > 1 && is_singleton_delta() && segments_overlap() != NONOVERLAPPING; } + bool produced_by_compaction() const { + return has_version() && + (start_version() < end_version() || + (start_version() == end_version() && segments_overlap() == NONOVERLAPPING)); + } + // get the compaction score of this rowset. // if segments are overlapping, the score equals to the number of segments, // otherwise, score is 1. 
diff --git a/be/src/olap/rowset/segcompaction.cpp b/be/src/olap/rowset/segcompaction.cpp index 95f2a945134b4c..374056f7b9dd96 100644 --- a/be/src/olap/rowset/segcompaction.cpp +++ b/be/src/olap/rowset/segcompaction.cpp @@ -76,11 +76,14 @@ Status SegcompactionWorker::_get_segcompaction_reader( std::vector& return_columns, std::unique_ptr* reader) { const auto& ctx = _writer->_context; + bool record_rowids = need_convert_delete_bitmap() && is_key; StorageReadOptions read_options; read_options.stats = stat; read_options.use_page_cache = false; read_options.tablet_schema = ctx.tablet_schema; + read_options.record_rowids = record_rowids; std::vector> seg_iterators; + std::map segment_rows; for (auto& seg_ptr : *segments) { std::unique_ptr iter; auto s = seg_ptr->new_iterator(schema, read_options, &iter); @@ -89,6 +92,10 @@ Status SegcompactionWorker::_get_segcompaction_reader( s.to_string()); } seg_iterators.push_back(std::move(iter)); + segment_rows.emplace(seg_ptr->id(), seg_ptr->num_rows()); + } + if (record_rowids && _rowid_conversion != nullptr) { + _rowid_conversion->reset_segment_map(segment_rows); } *reader = std::make_unique(&row_sources_buf); @@ -101,6 +108,8 @@ Status SegcompactionWorker::_get_segcompaction_reader( reader_params.tablet = tablet; reader_params.return_columns = return_columns; reader_params.is_key_column_group = is_key; + reader_params.use_page_cache = false; + reader_params.record_rowids = record_rowids; return (*reader)->init(reader_params, nullptr); } @@ -234,6 +243,9 @@ Status SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPt DCHECK(ctx.tablet); auto tablet = std::static_pointer_cast(ctx.tablet); + if (need_convert_delete_bitmap() && _rowid_conversion == nullptr) { + _rowid_conversion = std::make_unique(_writer->rowset_id()); + } std::vector> column_groups; Merger::vertical_split_columns(*ctx.tablet_schema, &column_groups); @@ -264,8 +276,8 @@ Status 
SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPt Merger::Statistics merger_stats; RETURN_IF_ERROR(Merger::vertical_compact_one_group( tablet->tablet_id(), ReaderType::READER_SEGMENT_COMPACTION, *ctx.tablet_schema, - is_key, column_ids, &row_sources_buf, *reader, *writer, INT_MAX, &merger_stats, - &index_size, key_bounds)); + is_key, column_ids, &row_sources_buf, *reader, *writer, &merger_stats, &index_size, + key_bounds, _rowid_conversion.get())); total_index_size += index_size; if (is_key) { RETURN_IF_ERROR(row_sources_buf.flush()); @@ -291,6 +303,10 @@ Status SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPt } RETURN_IF_ERROR(_delete_original_segments(begin, end)); + if (_rowid_conversion != nullptr) { + convert_segment_delete_bitmap(ctx.mow_context->delete_bitmap, begin, end, + _writer->_num_segcompacted); + } RETURN_IF_ERROR(_writer->_rename_compacted_segments(begin, end)); if (VLOG_DEBUG_IS_ON) { @@ -351,6 +367,59 @@ void SegcompactionWorker::compact_segments(SegCompactionCandidatesSharedPtr segm _is_compacting_state_mutable = true; } +bool SegcompactionWorker::need_convert_delete_bitmap() { + if (_writer == nullptr) { + return false; + } + auto tablet = _writer->context().tablet; + return tablet != nullptr && tablet->keys_type() == KeysType::UNIQUE_KEYS && + tablet->enable_unique_key_merge_on_write() && + tablet->tablet_schema()->has_sequence_col(); +} + +void SegcompactionWorker::convert_segment_delete_bitmap(DeleteBitmapPtr src_delete_bitmap, + uint32_t src_seg_id, uint32_t dest_seg_id) { + // lazy init + if (nullptr == _converted_delete_bitmap) { + _converted_delete_bitmap = std::make_shared(_writer->context().tablet_id); + } + auto rowset_id = _writer->context().rowset_id; + const auto* seg_map = + src_delete_bitmap->get({rowset_id, src_seg_id, DeleteBitmap::TEMP_VERSION_COMMON}); + if (seg_map != nullptr) { + _converted_delete_bitmap->set({rowset_id, dest_seg_id, DeleteBitmap::TEMP_VERSION_COMMON}, + 
*seg_map); + } +} + +void SegcompactionWorker::convert_segment_delete_bitmap(DeleteBitmapPtr src_delete_bitmap, + uint32_t src_begin, uint32_t src_end, + uint32_t dst_seg_id) { + // lazy init + if (nullptr == _converted_delete_bitmap) { + _converted_delete_bitmap = std::make_shared(_writer->context().tablet_id); + } + auto rowset_id = _writer->context().rowset_id; + RowLocation src(rowset_id, 0, 0); + for (uint32_t seg_id = src_begin; seg_id <= src_end; seg_id++) { + const auto* seg_map = + src_delete_bitmap->get({rowset_id, seg_id, DeleteBitmap::TEMP_VERSION_COMMON}); + if (!seg_map) { + continue; + } + src.segment_id = seg_id; + for (unsigned int row_id : *seg_map) { + src.row_id = row_id; + auto dst_row_id = _rowid_conversion->get(src); + if (dst_row_id < 0) { + continue; + } + _converted_delete_bitmap->add( + {rowset_id, dst_seg_id, DeleteBitmap::TEMP_VERSION_COMMON}, dst_row_id); + } + } +} + bool SegcompactionWorker::cancel() { // return true if the task is canncellable (actual compaction is not started) // return false when the task is not cancellable (it is in the middle of segcompaction) diff --git a/be/src/olap/rowset/segcompaction.h b/be/src/olap/rowset/segcompaction.h index 5aef89992d30b8..67dd6889aadd72 100644 --- a/be/src/olap/rowset/segcompaction.h +++ b/be/src/olap/rowset/segcompaction.h @@ -23,6 +23,7 @@ #include "common/status.h" #include "io/fs/file_reader_writer_fwd.h" #include "olap/merger.h" +#include "olap/simple_rowid_conversion.h" #include "olap/tablet.h" #include "segment_v2/segment.h" @@ -51,6 +52,14 @@ class SegcompactionWorker { void compact_segments(SegCompactionCandidatesSharedPtr segments); + bool need_convert_delete_bitmap(); + + void convert_segment_delete_bitmap(DeleteBitmapPtr src_delete_bitmap, uint32_t src_seg_id, + uint32_t dest_seg_id); + void convert_segment_delete_bitmap(DeleteBitmapPtr src_delete_bitmap, uint32_t src_begin, + uint32_t src_end, uint32_t dest_seg_id); + DeleteBitmapPtr get_converted_delete_bitmap() { return 
_converted_delete_bitmap; } + io::FileWriterPtr& get_file_writer() { return _file_writer; } // set the cancel flag, tasks already started will not be cancelled. @@ -78,6 +87,10 @@ class SegcompactionWorker { BetaRowsetWriter* _writer = nullptr; io::FileWriterPtr _file_writer; + // for unique key mow table + std::unique_ptr _rowid_conversion; + DeleteBitmapPtr _converted_delete_bitmap; + // the state is not mutable when 1)actual compaction operation started or 2) cancelled std::atomic _is_compacting_state_mutable = true; }; diff --git a/be/src/olap/rowset/segment_creator.cpp b/be/src/olap/rowset/segment_creator.cpp index 82313f988cbb2b..40f6e8303fe370 100644 --- a/be/src/olap/rowset/segment_creator.cpp +++ b/be/src/olap/rowset/segment_creator.cpp @@ -150,14 +150,15 @@ Status SegmentFlusher::_create_segment_writer(std::unique_ptr( segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, - _context.data_dir, _context.max_rows_per_segment, writer_options, _context.mow_context, - std::move(inverted_file_writer)); + _context.data_dir, writer_options, std::move(inverted_file_writer)); RETURN_IF_ERROR(_seg_files.add(segment_id, std::move(segment_file_writer))); auto s = writer->init(); if (!s.ok()) { @@ -187,14 +188,14 @@ Status SegmentFlusher::_create_segment_writer( writer_options.enable_unique_key_merge_on_write = _context.enable_unique_key_merge_on_write; writer_options.rowset_ctx = &_context; writer_options.write_type = _context.write_type; + writer_options.mow_ctx = _context.mow_context; if (no_compression) { writer_options.compression_type = NO_COMPRESSION; } writer = std::make_unique( segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, - _context.data_dir, _context.max_rows_per_segment, writer_options, _context.mow_context, - std::move(inverted_file_writer)); + _context.data_dir, writer_options, std::move(inverted_file_writer)); RETURN_IF_ERROR(_seg_files.add(segment_id, std::move(segment_file_writer))); auto s = 
writer->init(); if (!s.ok()) { diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp index 7ad20f210c2c86..d4b0c5fff1b78c 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp @@ -34,6 +34,15 @@ #undef bshuf_compress_lz4 #undef bshuf_decompress_lz4 +#undef BITSHUFFLE_H +#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_neon +#define bshuf_compress_lz4 bshuf_compress_lz4_neon +#define bshuf_decompress_lz4 bshuf_decompress_lz4_neon +#include // NOLINT(*) +#undef bshuf_compress_lz4_bound +#undef bshuf_compress_lz4 +#undef bshuf_decompress_lz4 + using base::CPU; namespace doris { @@ -63,6 +72,10 @@ __attribute__((constructor)) void SelectBitshuffleFunctions() { g_bshuf_compress_lz4 = bshuf_compress_lz4; g_bshuf_decompress_lz4 = bshuf_decompress_lz4; } +#elif defined(__ARM_NEON) && defined(__aarch64__) && !defined(__APPLE__) + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_neon; + g_bshuf_compress_lz4 = bshuf_compress_lz4_neon; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4_neon; #else g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; g_bshuf_compress_lz4 = bshuf_compress_lz4; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index d0f2830712da20..2891e8aaa129fe 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -1601,6 +1601,9 @@ Status VariantRootColumnIterator::next_batch(size_t* n, vectorized::MutableColum if (obj.is_null_root()) { obj.create_root(); } + if (!obj.is_finalized()) { + obj.finalize(); + } auto root_column = obj.get_root(); RETURN_IF_ERROR(_inner_iter->next_batch(n, root_column, has_null)); obj.incr_num_rows(*n); @@ -1634,6 +1637,9 @@ Status VariantRootColumnIterator::read_by_rowids(const rowid_t* rowids, const si if (obj.is_null_root()) { obj.create_root(); } + if 
(!obj.is_finalized()) { + obj.finalize(); + } auto root_column = obj.get_root(); RETURN_IF_ERROR(_inner_iter->read_by_rowids(rowids, count, root_column)); obj.incr_num_rows(count); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 30b7e3b37508f8..3b0cdad5133f66 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -91,7 +91,7 @@ struct ColumnIteratorOptions { // for page cache allocation // page types are divided into DATA_PAGE & INDEX_PAGE // INDEX_PAGE including index_page, dict_page and short_key_page - PageTypePB type; + PageTypePB type = PageTypePB::UNKNOWN_PAGE_TYPE; io::FileReader* file_reader = nullptr; // Ref // reader statistics OlapReaderStatistics* stats = nullptr; // Ref diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp index 428dc05e6f6aa5..ec1b5bdd9e4d35 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp @@ -31,7 +31,9 @@ namespace doris::segment_v2 { PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr& searcher, const TQueryOptions& query_options) - : _searcher(searcher), _query(std::make_unique()) {} + : _searcher(searcher), + _query(std::make_unique()), + _max_expansions(query_options.inverted_index_max_expansions) {} void PhraseEdgeQuery::add(const std::wstring& field_name, const std::vector& terms) { if (terms.empty()) { @@ -50,9 +52,9 @@ void PhraseEdgeQuery::search(roaring::Roaring& roaring) { } void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) { - size_t count = 0; + bool first = true; std::wstring sub_term = StringUtil::string_to_wstring(_terms[0]); - find_words([this, &count, &sub_term, &roaring](Term* term) { + find_words([this, &first, &sub_term, &roaring](Term* term) { 
std::wstring_view ws_term(term->text(), term->textLength()); if (ws_term.find(sub_term) == std::wstring::npos) { return; @@ -70,12 +72,12 @@ void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) { } _CLDELETE(term_doc); - if (count) { + if (!first) { roaring.swap(result); + first = false; } else { roaring |= result; } - count++; }); } @@ -86,15 +88,19 @@ void PhraseEdgeQuery::search_multi_term(roaring::Roaring& roaring) { std::vector suffix_terms; std::vector prefix_terms; - find_words([&suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { + find_words([this, &suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { std::wstring_view ws_term(term->text(), term->textLength()); - if (ws_term.ends_with(suffix_term)) { - suffix_terms.push_back(_CL_POINTER(term)); + if (_max_expansions == 0 || suffix_terms.size() < _max_expansions) { + if (ws_term.ends_with(suffix_term)) { + suffix_terms.push_back(_CL_POINTER(term)); + } } - if (ws_term.starts_with(prefix_term)) { - prefix_terms.push_back(_CL_POINTER(term)); + if (_max_expansions == 0 || prefix_terms.size() < _max_expansions) { + if (ws_term.starts_with(prefix_term)) { + prefix_terms.push_back(_CL_POINTER(term)); + } } }); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h index 823f46285b1d00..5daf382e0d08fa 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h @@ -52,6 +52,7 @@ class PhraseEdgeQuery : public Query { std::wstring _field_name; std::vector _terms; std::unique_ptr _query; + int32_t _max_expansions = 50; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index d89d089de3bf0f..02339c000806ea 100644 --- 
a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -187,8 +187,10 @@ void InvertedIndexReader::get_analyse_result(std::vector& analyse_r } } -Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, +Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, + InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir) { + SCOPED_RAW_TIMER(&stats->inverted_index_query_null_bitmap_timer); lucene::store::IndexInput* null_bitmap_in = nullptr; bool owned_dir = false; try { @@ -244,9 +246,11 @@ Status InvertedIndexReader::handle_searcher_cache( InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); if (InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, inverted_index_cache_handle)) { + stats->inverted_index_searcher_cache_hit++; return Status::OK(); } else { // searcher cache miss + stats->inverted_index_searcher_cache_miss++; auto mem_tracker = std::make_unique("InvertedIndexSearcherCacheWithRead"); SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); IndexSearcherPtr searcher; @@ -256,7 +260,7 @@ Status InvertedIndexReader::handle_searcher_cache( // to avoid open directory additionally for null_bitmap // TODO: handle null bitmap procedure in new format. 
InvertedIndexQueryCacheHandle null_bitmap_cache_handle; - static_cast(read_null_bitmap(&null_bitmap_cache_handle, dir.get())); + static_cast(read_null_bitmap(stats, &null_bitmap_cache_handle, dir.get())); RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, mem_tracker.get(), type())); auto* cache_value = new InvertedIndexSearcherCache::CacheValue( std::move(searcher), mem_tracker->consumption(), UnixMillis()); @@ -284,6 +288,27 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, return Status::OK(); }; +Status InvertedIndexReader::match_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, + const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap) { + TQueryOptions queryOptions = runtime_state->query_options(); + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + auto query = QueryFactory::create(query_type, index_searcher, queryOptions); + if (!query) { + return Status::Error( + "query type " + query_type_to_string(query_type) + ", query is nullptr"); + } + query->add(query_info); + query->search(*term_match_bitmap); + } catch (const CLuceneError& e) { + return Status::Error("CLuceneError occured: {}", + e.what()); + } + return Status::OK(); +} + Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr* iterator) { *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); @@ -384,27 +409,6 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } } -Status FullTextIndexReader::match_index_search( - OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, - const std::shared_ptr& term_match_bitmap) { - TQueryOptions 
queryOptions = runtime_state->query_options(); - try { - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - auto query = QueryFactory::create(query_type, index_searcher, queryOptions); - if (!query) { - return Status::Error( - "query type " + query_type_to_string(query_type) + ", query is nullptr"); - } - query->add(query_info); - query->search(*term_match_bitmap); - } catch (const CLuceneError& e) { - return Status::Error("CLuceneError occured: {}", - e.what()); - } - return Status::OK(); -} - InvertedIndexReaderType FullTextIndexReader::type() { return InvertedIndexReaderType::FULLTEXT; } @@ -461,28 +465,25 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, std::string search_str(search_query->data, act_len); VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; - std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); - std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); - // unique_ptr with custom deleter - std::unique_ptr term { - _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), - [](lucene::index::Term* term) { _CLDECDELETE(term); }}; - std::unique_ptr query; auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); - // try to get query bitmap result from cache and return immediately on cache hit InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type, search_str}; auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { return Status::OK(); } - roaring::Roaring result; + std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); + + InvertedIndexQueryInfo query_info; + query_info.field_name = column_name_ws; + 
query_info.terms.emplace_back(search_str); + + auto result = std::make_shared(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); @@ -494,33 +495,29 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_ANY_QUERY: case InvertedIndexQueryType::MATCH_ALL_QUERY: case InvertedIndexQueryType::EQUAL_QUERY: { - query = std::make_unique(term.get()); - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - (*searcher_ptr)->_search(query.get(), [&result](DocRange* doc_range) { - if (doc_range->type_ == DocRangeType::kMany) { - result.addMany(doc_range->doc_many_size_, doc_range->doc_many->data()); - } else { - result.addRange(doc_range->doc_range.first, doc_range->doc_range.second); - } - }); + RETURN_IF_ERROR(match_index_search(stats, runtime_state, + InvertedIndexQueryType::MATCH_ANY_QUERY, + query_info, *searcher_ptr, result)); break; } - case InvertedIndexQueryType::MATCH_PHRASE_QUERY: { - query = std::make_unique(term.get()); - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - (*searcher_ptr) - ->_search(query.get(), - [&result](const int32_t docid, const float_t /*score*/) { - // docid equal to rowid in segment - result.add(docid); - }); + case InvertedIndexQueryType::MATCH_PHRASE_QUERY: + case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: + case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { + RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, + *searcher_ptr, result)); break; } - case InvertedIndexQueryType::LESS_THAN_QUERY: case InvertedIndexQueryType::LESS_EQUAL_QUERY: case InvertedIndexQueryType::GREATER_THAN_QUERY: case InvertedIndexQueryType::GREATER_EQUAL_QUERY: { + std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); + // unique_ptr with custom deleter + std::unique_ptr term { + _CLNEW 
lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), + [](lucene::index::Term* term) { _CLDECDELETE(term); }}; + std::unique_ptr query; + bool include_upper = query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY; bool include_lower = query_type == InvertedIndexQueryType::GREATER_EQUAL_QUERY; @@ -537,7 +534,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, (*searcher_ptr) ->_search(query.get(), [&result](const int32_t docid, const float_t /*score*/) { - result.add(docid); + result->add(docid); }); break; } @@ -560,12 +557,10 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, } // add to cache - std::shared_ptr term_match_bitmap = - std::make_shared(result); - term_match_bitmap->runOptimize(); - cache->insert(cache_key, term_match_bitmap, &cache_handler); + result->runOptimize(); + cache->insert(cache_key, result, &cache_handler); - bit_map = term_match_bitmap; + bit_map = result; } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 90df92f0728a9b..2377a91845fc4d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -73,7 +73,6 @@ class InvertedIndexIterator; class InvertedIndexQueryCacheHandle; class InvertedIndexFileReader; struct InvertedIndexQueryInfo; - class InvertedIndexReader : public std::enable_shared_from_this { public: explicit InvertedIndexReader( @@ -94,7 +93,8 @@ class InvertedIndexReader : public std::enable_shared_from_this& term_match_bitmap); + friend class InvertedIndexIterator; std::shared_ptr _inverted_index_file_reader; TabletIndex _index_meta; bool _has_null = true; }; +using InvertedIndexReaderPtr = std::shared_ptr; class FullTextIndexReader : public InvertedIndexReader { ENABLE_FACTORY_CREATOR(FullTextIndexReader); @@ -177,13 +184,6 @@ class FullTextIndexReader : public InvertedIndexReader { const 
std::map& properties); static void setup_analyzer_use_stopwords(std::unique_ptr& analyzer, const std::map& properties); - -private: - Status match_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, - InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, - const FulltextIndexSearcherPtr& index_searcher, - const std::shared_ptr& term_match_bitmap); }; class StringTypeInvertedIndexReader : public InvertedIndexReader { @@ -373,13 +373,15 @@ class InvertedIndexIterator { Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr) { - return _reader->read_null_bitmap(cache_handle, dir); + return _reader->read_null_bitmap(_stats, cache_handle, dir); } [[nodiscard]] InvertedIndexReaderType get_inverted_index_reader_type() const; [[nodiscard]] const std::map& get_index_properties() const; [[nodiscard]] bool has_null() { return _reader->has_null(); }; + const InvertedIndexReaderPtr& reader() { return _reader; } + private: OlapReaderStatistics* _stats = nullptr; RuntimeState* _runtime_state = nullptr; diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h index 19041f4c51d1db..93ec03df452b6c 100644 --- a/be/src/olap/rowset/segment_v2/options.h +++ b/be/src/olap/rowset/segment_v2/options.h @@ -24,6 +24,8 @@ namespace segment_v2 { static constexpr size_t DEFAULT_PAGE_SIZE = 1024 * 1024; // default size: 1M +constexpr long ROW_STORE_PAGE_SIZE_DEFAULT_VALUE = 16384; // default row store page size: 16KB + struct PageBuilderOptions { size_t data_page_size = DEFAULT_PAGE_SIZE; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index ea5b5ae01b6bb9..0473ff128fc5eb 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -315,6 +315,7 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { for (auto& expr 
: _remaining_conjunct_roots) { _calculate_pred_in_remaining_conjunct_root(expr); } + _calculate_func_in_remaining_conjunct_root(); _column_predicate_info.reset(new ColumnPredicateInfo()); if (_schema->rowid_col_idx() > 0) { @@ -560,6 +561,7 @@ Status SegmentIterator::_get_row_ranges_by_column_conditions() { } } _col_preds_except_leafnode_of_andnode.clear(); + compound_func_exprs.clear(); // 1. if all conditions in the compound hit the inverted index and there are no other expr to handle. // 2. then there is no need to generate index_result_column. if (_enable_common_expr_pushdown && _remaining_conjunct_roots.empty()) { @@ -697,6 +699,11 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) { RowRanges dict_row_ranges = RowRanges::create_single(num_rows()); for (auto cid : cids) { + if (!_segment->can_apply_predicate_safely(cid, + _opts.col_id_to_predicates.at(cid).get(), + *_schema, _opts.io_ctx.reader_type)) { + continue; + } RowRanges tmp_row_ranges = RowRanges::create_single(num_rows()); DCHECK(_opts.col_id_to_predicates.count(cid) > 0); RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict( @@ -809,25 +816,32 @@ Status SegmentIterator::_execute_predicates_except_leafnode_of_andnode( auto v_literal_expr = std::dynamic_pointer_cast(expr); _column_predicate_info->query_values.insert(v_literal_expr->value()); } else if (node_type == TExprNodeType::BINARY_PRED || node_type == TExprNodeType::MATCH_PRED || - node_type == TExprNodeType::IN_PRED) { - if (node_type == TExprNodeType::MATCH_PRED) { - _column_predicate_info->query_op = "match"; - } else if (node_type == TExprNodeType::IN_PRED) { - if (expr->op() == TExprOpcode::type::FILTER_IN) { - _column_predicate_info->query_op = "in"; + node_type == TExprNodeType::IN_PRED || node_type == TExprNodeType::FUNCTION_CALL) { + std::string result_sign; + if (node_type == TExprNodeType::FUNCTION_CALL) { + result_sign = + 
BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); + } else { + if (node_type == TExprNodeType::MATCH_PRED) { + _column_predicate_info->query_op = "match"; + } else if (node_type == TExprNodeType::IN_PRED) { + if (expr->op() == TExprOpcode::type::FILTER_IN) { + _column_predicate_info->query_op = "in"; + } else { + _column_predicate_info->query_op = "not_in"; + } } else { - _column_predicate_info->query_op = "not_in"; + _column_predicate_info->query_op = expr->fn().name.function_name; } - } else { - _column_predicate_info->query_op = expr->fn().name.function_name; + result_sign = _gen_predicate_result_sign(_column_predicate_info.get()); } + // get child condition result in compound conditions - auto pred_result_sign = _gen_predicate_result_sign(_column_predicate_info.get()); _column_predicate_info.reset(new ColumnPredicateInfo()); - VLOG_DEBUG << "_gen_predicate_result_sign " << pred_result_sign; - if (_rowid_result_for_index.count(pred_result_sign) > 0 && - _rowid_result_for_index[pred_result_sign].first) { - auto apply_result = _rowid_result_for_index[pred_result_sign].second; + VLOG_DEBUG << "result_sign " << result_sign; + if (_rowid_result_for_index.count(result_sign) > 0 && + _rowid_result_for_index[result_sign].first) { + auto apply_result = _rowid_result_for_index[result_sign].second; _pred_except_leafnode_of_andnode_evaluate_result.push_back(apply_result); } } else if (node_type == TExprNodeType::COMPOUND_PRED) { @@ -871,7 +885,7 @@ Status SegmentIterator::_execute_compound_fn(const std::string& function_name) { bool SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() { // no compound predicates push down, so no need to filter - if (_col_preds_except_leafnode_of_andnode.size() == 0) { + if (_col_preds_except_leafnode_of_andnode.empty() && compound_func_exprs.empty()) { return false; } for (auto pred : _col_preds_except_leafnode_of_andnode) { @@ -885,6 +899,14 @@ bool 
SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() { return false; } } + for (const auto& func_expr_pair : compound_func_exprs) { + const auto& expr = func_expr_pair.first; + std::string pred_result_sign = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); + if (!_rowid_result_for_index.contains(pred_result_sign)) { + return false; + } + } return true; } @@ -903,7 +925,8 @@ bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool if (_opts.runtime_state && !_opts.runtime_state->query_options().enable_inverted_index_query) { return false; } - if (_inverted_index_iterators[pred->column_id()] == nullptr) { + auto pred_column_id = pred->column_id(); + if (_inverted_index_iterators[pred_column_id] == nullptr) { //this column without inverted index return false; } @@ -918,13 +941,21 @@ bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool return false; } + // UNTOKENIZED strings exceed ignore_above, they are written as null, causing range query errors + if (PredicateTypeTraits::is_range(pred->type()) && + _inverted_index_iterators[pred_column_id] != nullptr && + _inverted_index_iterators[pred_column_id]->get_inverted_index_reader_type() == + InvertedIndexReaderType::STRING_TYPE) { + return false; + } + // Function filter no apply inverted index if (dynamic_cast*>(pred) != nullptr || dynamic_cast*>(pred) != nullptr) { return false; } - bool handle_by_fulltext = _column_has_fulltext_index(pred->column_id()); + bool handle_by_fulltext = _column_has_fulltext_index(pred_column_id); if (handle_by_fulltext) { // when predicate in compound condition which except leafNode of andNode, // only can apply match query for fulltext index, @@ -992,11 +1023,23 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() { } } + for (const auto& func_expr_pair : compound_func_exprs) { + const auto& expr = func_expr_pair.first; + const auto& expr_ctx = func_expr_pair.second; + auto 
result = std::make_shared(); + RETURN_IF_ERROR(execute_func_expr(expr, expr_ctx, result)); + std::string result_sign = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); + _rowid_result_for_index.emplace(result_sign, std::make_pair(true, std::move(*result))); + } + return Status::OK(); } bool SegmentIterator::_downgrade_without_index(Status res, bool need_remaining) { - if (res.code() == ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND || + bool is_fallback = + _opts.runtime_state->query_options().enable_fallback_on_missing_inverted_index; + if ((res.code() == ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND && is_fallback) || res.code() == ErrorCode::INVERTED_INDEX_BYPASS || res.code() == ErrorCode::INVERTED_INDEX_EVALUATE_SKIPPED || (res.code() == ErrorCode::INVERTED_INDEX_NO_TERMS && need_remaining)) { @@ -1246,18 +1289,6 @@ Status SegmentIterator::_apply_inverted_index() { std::vector remaining_predicates; std::set no_need_to_pass_column_predicate_set; - // TODO:Comment out this code before introducing range query functionality - /*for (const auto& entry : _opts.col_id_to_predicates) { - ColumnId column_id = entry.first; - auto pred = entry.second; - bool continue_apply = true; - RETURN_IF_ERROR(_apply_inverted_index_on_block_column_predicate( - column_id, pred.get(), no_need_to_pass_column_predicate_set, &continue_apply)); - if (!continue_apply) { - break; - } - }*/ - for (auto pred : _col_predicates) { if (no_need_to_pass_column_predicate_set.count(pred) > 0) { continue; @@ -1293,6 +1324,23 @@ Status SegmentIterator::_apply_inverted_index() { } } + for (const auto& func_expr_pair : no_compound_func_exprs) { + const auto& expr = func_expr_pair.first; + const auto& expr_ctx = func_expr_pair.second; + auto result = std::make_shared(); + RETURN_IF_ERROR(execute_func_expr(expr, expr_ctx, result)); + _row_bitmap &= *result; + for (auto it = _remaining_conjunct_roots.begin(); it != _remaining_conjunct_roots.end();) { + if (*it == expr) { + 
std::erase_if(_common_expr_ctxs_push_down, + [&it](const auto& iter) { return iter->root() == *it; }); + it = _remaining_conjunct_roots.erase(it); + } else { + ++it; + } + } + } + // add a switch for inverted index filter if (_opts.runtime_state && _opts.runtime_state->enable_common_expr_pushdown_for_inverted_index()) { @@ -1345,19 +1393,15 @@ Status SegmentIterator::_apply_inverted_index() { return Status::OK(); } -bool SegmentIterator::_check_all_predicates_passed_inverted_index_for_column(ColumnId cid) { +bool SegmentIterator::_check_all_predicates_passed_inverted_index_for_column(ColumnId cid, + bool default_return) { auto it = _column_predicate_inverted_index_status.find(cid); if (it != _column_predicate_inverted_index_status.end()) { const auto& pred_map = it->second; - - bool all_true = std::all_of(pred_map.begin(), pred_map.end(), - [](const auto& pred_entry) { return pred_entry.second; }); - - if (all_true) { - return true; - } + return std::all_of(pred_map.begin(), pred_map.end(), + [](const auto& pred_entry) { return pred_entry.second; }); } - return false; + return default_return; } Status SegmentIterator::_init_return_column_iterators() { @@ -1431,6 +1475,18 @@ Status SegmentIterator::_init_inverted_index_iterators() { return Status::OK(); } +Status SegmentIterator::_init_inverted_index_iterators(ColumnId cid) { + if (_inverted_index_iterators[cid] == nullptr) { + return _init_single_inverted_index_iterator.call([&] { + return _segment->new_inverted_index_iterator( + _opts.tablet_schema->column(cid), + _segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)), + _opts, &_inverted_index_iterators[cid]); + }); + } + return Status::OK(); +} + Status SegmentIterator::_lookup_ordinal(const RowCursor& key, bool is_include, rowid_t upper_bound, rowid_t* rowid) { if (_segment->_tablet_schema->keys_type() == UNIQUE_KEYS && @@ -1545,7 +1601,8 @@ Status SegmentIterator::_lookup_ordinal_from_pk_index(const RowCursor& key, bool // for mow with 
cluster key table, we should get key range from short key index. DCHECK(_segment->_tablet_schema->cluster_key_idxes().empty()); - if (has_seq_col) { + // if full key is exact_match, the primary key without sequence column should also the same + if (has_seq_col && !exact_match) { size_t seq_col_length = _segment->_tablet_schema->column(_segment->_tablet_schema->sequence_col_idx()) .length() + @@ -1925,7 +1982,8 @@ Status SegmentIterator::_read_columns(const std::vector& column_ids, } Status SegmentIterator::_init_current_block( - vectorized::Block* block, std::vector& current_columns) { + vectorized::Block* block, std::vector& current_columns, + uint32_t nrows_read_limit) { block->clear_column_data(_schema->num_column_ids()); for (size_t i = 0; i < _schema->num_column_ids(); i++) { @@ -1945,7 +2003,7 @@ Status SegmentIterator::_init_current_block( column_desc->path() == nullptr ? "" : column_desc->path()->get_path()); // TODO reuse current_columns[cid] = file_column_type->create_column(); - current_columns[cid]->reserve(_opts.block_row_max); + current_columns[cid]->reserve(nrows_read_limit); } else { // the column in block must clear() here to insert new data if (_is_pred_column[cid] || @@ -1964,7 +2022,7 @@ Status SegmentIterator::_init_current_block( } else if (column_desc->type() == FieldType::OLAP_FIELD_TYPE_DATETIME) { current_columns[cid]->set_datetime_type(); } - current_columns[cid]->reserve(_opts.block_row_max); + current_columns[cid]->reserve(nrows_read_limit); } } } @@ -2025,11 +2083,12 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 auto debug_col_name = DebugPoints::instance()->get_debug_param_or_default( "segment_iterator._read_columns_by_index", "column_name", ""); if (debug_col_name.empty()) { - return Status::Error("{} does not need to read data"); + return Status::Error("does not need to read data"); } auto col_name = _opts.tablet_schema->column(cid).name(); if (debug_col_name.find(col_name) != std::string::npos) 
{ - return Status::Error("{} does not need to read data"); + return Status::Error("does not need to read data, {}", + debug_col_name); } }) @@ -2224,9 +2283,27 @@ Status SegmentIterator::_read_columns_by_rowids(std::vector& read_colu } for (auto cid : read_column_ids) { - if (_prune_column(cid, (*mutable_columns)[cid], true, select_size)) { + auto& colunm = (*mutable_columns)[cid]; + if (_no_need_read_key_data(cid, colunm, select_size)) { continue; } + if (_prune_column(cid, colunm, true, select_size)) { + continue; + } + + DBUG_EXECUTE_IF("segment_iterator._read_columns_by_index", { + auto debug_col_name = DebugPoints::instance()->get_debug_param_or_default( + "segment_iterator._read_columns_by_index", "column_name", ""); + if (debug_col_name.empty()) { + return Status::Error("does not need to read data"); + } + auto col_name = _opts.tablet_schema->column(cid).name(); + if (debug_col_name.find(col_name) != std::string::npos) { + return Status::Error("does not need to read data, {}", + debug_col_name); + } + }) + RETURN_IF_ERROR(_column_iterators[cid]->read_by_rowids(rowids.data(), select_size, _current_return_columns[cid])); } @@ -2378,14 +2455,23 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { } } } - RETURN_IF_ERROR(_init_current_block(block, _current_return_columns)); - _converted_column_ids.assign(_schema->columns().size(), 0); - _current_batch_rows_read = 0; uint32_t nrows_read_limit = _opts.block_row_max; if (_can_opt_topn_reads()) { nrows_read_limit = std::min(static_cast(_opts.topn_limit), nrows_read_limit); } + + DBUG_EXECUTE_IF("segment_iterator.topn_opt_1", { + if (nrows_read_limit != 1) { + return Status::Error("topn opt 1 execute failed: {}", + nrows_read_limit); + } + }) + + RETURN_IF_ERROR(_init_current_block(block, _current_return_columns, nrows_read_limit)); + _converted_column_ids.assign(_schema->columns().size(), 0); + + _current_batch_rows_read = 0; RETURN_IF_ERROR(_read_columns_by_index( nrows_read_limit, 
_current_batch_rows_read, _lazy_materialization_read || _opts.record_rowids || _is_need_expr_eval)); @@ -2808,6 +2894,11 @@ void SegmentIterator::_calculate_pred_in_remaining_conjunct_root( } else if (_is_literal_node(node_type)) { auto v_literal_expr = static_cast(expr.get()); _column_predicate_info->query_values.insert(v_literal_expr->value()); + } else if (node_type == TExprNodeType::NULL_LITERAL) { + if (!_column_predicate_info->column_name.empty()) { + auto v_literal_expr = static_cast(expr.get()); + _column_predicate_info->query_values.insert(v_literal_expr->value()); + } } else { if (node_type == TExprNodeType::MATCH_PRED) { _column_predicate_info->query_op = "match"; @@ -2829,12 +2920,73 @@ void SegmentIterator::_calculate_pred_in_remaining_conjunct_root( } } +void SegmentIterator::_calculate_func_in_remaining_conjunct_root() { + auto hash = [](const vectorized::VExprSPtr& expr) -> std::size_t { + return std::hash()(expr->expr_name()); + }; + auto equal = [](const vectorized::VExprSPtr& lhs, const vectorized::VExprSPtr& rhs) -> bool { + return lhs->equals(*rhs); + }; + + uint32_t next_id = 0; + std::unordered_map unique_map( + 0, hash, equal); + + auto gen_func_unique_id = [&unique_map, &next_id](const vectorized::VExprSPtr& expr) { + auto it = unique_map.find(expr); + if (it != unique_map.end()) { + return it->second; + } else { + unique_map[expr] = ++next_id; + return next_id; + } + }; + + for (const auto& root_expr_ctx : _common_expr_ctxs_push_down) { + const auto& root_expr = root_expr_ctx->root(); + if (root_expr == nullptr) { + continue; + } + + std::stack> stack; + stack.emplace(root_expr, false); + + while (!stack.empty()) { + const auto& [expr, has_compound_pred] = stack.top(); + stack.pop(); + + bool current_has_compound_pred = + has_compound_pred || (expr->node_type() == TExprNodeType::COMPOUND_PRED); + + if (expr->node_type() == TExprNodeType::FUNCTION_CALL && + expr->can_push_down_to_index()) { + 
expr->set_index_unique_id(gen_func_unique_id(expr)); + if (current_has_compound_pred) { + compound_func_exprs.emplace_back(expr, root_expr_ctx); + } else { + no_compound_func_exprs.emplace_back(expr, root_expr_ctx); + } + } + + const auto& children = expr->children(); + for (int32_t i = children.size() - 1; i >= 0; --i) { + if (!children[i]->children().empty()) { + stack.emplace(children[i], current_has_compound_pred); + } + } + } + } +} + bool SegmentIterator::_no_need_read_key_data(ColumnId cid, vectorized::MutableColumnPtr& column, size_t nrows_read) { if (_opts.runtime_state && !_opts.runtime_state->query_options().enable_no_need_read_data_opt) { return false; } - if (_opts.tablet_schema->keys_type() != KeysType::DUP_KEYS) { + + if (!((_opts.tablet_schema->keys_type() == KeysType::DUP_KEYS || + (_opts.tablet_schema->keys_type() == KeysType::UNIQUE_KEYS && + _opts.enable_unique_key_merge_on_write)))) { return false; } @@ -2850,19 +3002,7 @@ bool SegmentIterator::_no_need_read_key_data(ColumnId cid, vectorized::MutableCo return false; } - std::set cids; - for (auto* pred : _col_predicates) { - cids.insert(pred->column_id()); - } - for (auto* pred : _col_preds_except_leafnode_of_andnode) { - cids.insert(pred->column_id()); - } - - // If the key is present in expr, data needs to be read. 
- if (cids.contains(cid)) { - return false; - } - if (_column_pred_in_remaining_vconjunct.contains(_opts.tablet_schema->column(cid).name())) { + if (!_check_all_predicates_passed_inverted_index_for_column(cid, true)) { return false; } @@ -2883,7 +3023,7 @@ bool SegmentIterator::_has_delete_predicate(ColumnId cid) { return delete_columns_set.contains(cid); } -bool SegmentIterator::_can_opt_topn_reads() const { +bool SegmentIterator::_can_opt_topn_reads() { if (_opts.topn_limit <= 0) { return false; } @@ -2892,11 +3032,43 @@ bool SegmentIterator::_can_opt_topn_reads() const { return false; } - if (!_col_predicates.empty() || !_col_preds_except_leafnode_of_andnode.empty()) { + bool all_true = std::ranges::all_of(_schema->column_ids(), [this](auto cid) { + if (cid == _opts.tablet_schema->delete_sign_idx() || + _opts.tablet_schema->column(cid).is_key()) { + return true; + } + if (_check_all_predicates_passed_inverted_index_for_column(cid, true)) { + return true; + } return false; + }); + + DBUG_EXECUTE_IF("segment_iterator.topn_opt_2", { + if (all_true) { + return Status::Error("topn opt 2 execute failed"); + } + }) + + return all_true; +} + +Status SegmentIterator::execute_func_expr(const vectorized::VExprSPtr& expr, + const vectorized::VExprContextSPtr& expr_ctx, + std::shared_ptr& result) { + const auto& expr0 = expr->get_child(0); + if (!expr0 || expr0->node_type() != TExprNodeType::SLOT_REF) { + return Status::RuntimeError("cannot perform index filtering"); } - return true; + FuncExprParams params; + auto slot_expr = std::static_pointer_cast(expr0); + params._column_id = _schema->column_id(slot_expr->column_id()); + params._unique_id = _schema->unique_id(slot_expr->column_id()); + params._column_name = _opts.tablet_schema->column(params._column_id).name(); + params._segment_iterator = this; + params.result = result; + + return expr->eval_inverted_index(expr_ctx.get(), params); } } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h 
b/be/src/olap/rowset/segment_v2/segment_iterator.h index ae865ddc456950..f163376d95fce4 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -107,6 +107,15 @@ struct ColumnPredicateInfo { int32_t column_id; }; +class SegmentIterator; +struct FuncExprParams { + ColumnId _column_id = 0; + uint32_t _unique_id = 0; + std::string _column_name; + SegmentIterator* _segment_iterator = nullptr; + std::shared_ptr result; +}; + class SegmentIterator : public RowwiseIterator { public: SegmentIterator(std::shared_ptr segment, SchemaSPtr schema); @@ -123,6 +132,8 @@ class SegmentIterator : public RowwiseIterator { std::vector* block_row_locations) override; const Schema& schema() const override { return *_schema; } + Segment& segment() { return *_segment; } + StorageReadOptions& storage_read_options() { return _opts; } bool is_lazy_materialization_read() const override { return _lazy_materialization_read; } uint64_t data_id() const override { return _segment->id(); } RowsetId rowset_id() const { return _segment->rowset_id(); } @@ -142,6 +153,11 @@ class SegmentIterator : public RowwiseIterator { return updated; } + std::vector>& inverted_index_iterators() { + return _inverted_index_iterators; + } + [[nodiscard]] Status _init_inverted_index_iterators(ColumnId cid); + private: Status _next_batch_internal(vectorized::Block* block); @@ -221,7 +237,8 @@ class SegmentIterator : public RowwiseIterator { bool set_block_rowid); void _replace_version_col(size_t num_rows); Status _init_current_block(vectorized::Block* block, - std::vector& non_pred_vector); + std::vector& non_pred_vector, + uint32_t nrows_read_limit); uint16_t _evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); uint16_t _evaluate_short_circuit_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); void _output_non_pred_columns(vectorized::Block* block); @@ -309,6 +326,7 @@ class SegmentIterator : public RowwiseIterator { 
bool _check_column_pred_all_push_down(const std::string& column_name, bool in_compound = false, bool is_match = false); void _calculate_pred_in_remaining_conjunct_root(const vectorized::VExprSPtr& expr); + void _calculate_func_in_remaining_conjunct_root(); // todo(wb) remove this method after RowCursor is removed void _convert_rowcursor_to_short_key(const RowCursor& key, size_t num_keys) { @@ -386,10 +404,15 @@ class SegmentIterator : public RowwiseIterator { bool _has_delete_predicate(ColumnId cid); - bool _can_opt_topn_reads() const; + bool _can_opt_topn_reads(); void _initialize_predicate_results(); - bool _check_all_predicates_passed_inverted_index_for_column(ColumnId cid); + bool _check_all_predicates_passed_inverted_index_for_column(ColumnId cid, + bool default_return = false); + + Status execute_func_expr(const vectorized::VExprSPtr& expr, + const vectorized::VExprContextSPtr& expr_ctx, + std::shared_ptr& result); class BitmapRangeIterator; class BackwardBitmapRangeIterator; @@ -457,6 +480,11 @@ class SegmentIterator : public RowwiseIterator { // make a copy of `_opts.column_predicates` in order to make local changes std::vector _col_predicates; std::vector _col_preds_except_leafnode_of_andnode; + + using FuncExprPair = std::pair; + std::vector no_compound_func_exprs; + std::vector compound_func_exprs; + vectorized::VExprContextSPtrs _common_expr_ctxs_push_down; bool _enable_common_expr_pushdown = false; std::vector _remaining_conjunct_roots; @@ -503,6 +531,8 @@ class SegmentIterator : public RowwiseIterator { std::unordered_map> _column_predicate_inverted_index_status; + + DorisCallOnce _init_single_inverted_index_iterator; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index d22e1060dd370b..f21c5fcbab2c88 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -21,11 +21,6 @@ #include #include 
-#include -#include -#include -#include - // IWYU pragma: no_include #include "cloud/config.h" #include "common/compiler_util.h" // IWYU pragma: keep @@ -82,22 +77,22 @@ using namespace ErrorCode; const char* k_segment_magic = "D0R1"; const uint32_t k_segment_magic_length = 4; +inline std::string segment_mem_tracker_name(uint32_t segment_id) { + return "SegmentWriter:Segment-" + std::to_string(segment_id); +} + SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, - DataDir* data_dir, uint32_t max_row_per_segment, - const SegmentWriterOptions& opts, - std::shared_ptr mow_context, + DataDir* data_dir, const SegmentWriterOptions& opts, io::FileWriterPtr inverted_file_writer) : _segment_id(segment_id), _tablet_schema(std::move(tablet_schema)), _tablet(std::move(tablet)), _data_dir(data_dir), - _max_row_per_segment(max_row_per_segment), _opts(opts), _file_writer(file_writer), - _mem_tracker(std::make_unique("SegmentWriter:Segment-" + - std::to_string(segment_id))), - _mow_context(std::move(mow_context)) { + _mem_tracker(std::make_unique(segment_mem_tracker_name(segment_id))), + _mow_context(std::move(opts.mow_ctx)) { CHECK_NOTNULL(file_writer); _num_key_columns = _tablet_schema->num_key_columns(); _num_short_key_columns = _tablet_schema->num_short_key_columns(); @@ -258,8 +253,11 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& co if (column.is_row_store_column()) { // smaller page size for row store column - opts.data_page_size = config::row_column_page_size; + auto page_size = _tablet_schema->row_store_page_size(); + opts.data_page_size = + (page_size > 0) ? 
page_size : segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE; } + std::unique_ptr writer; RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); RETURN_IF_ERROR(writer->init()); @@ -955,11 +953,11 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po int64_t SegmentWriter::max_row_to_add(size_t row_avg_size_in_bytes) { auto segment_size = estimate_segment_size(); if (PREDICT_FALSE(segment_size >= MAX_SEGMENT_SIZE || - _num_rows_written >= _max_row_per_segment)) { + _num_rows_written >= _opts.max_rows_per_segment)) { return 0; } int64_t size_rows = ((int64_t)MAX_SEGMENT_SIZE - (int64_t)segment_size) / row_avg_size_in_bytes; - int64_t count_rows = (int64_t)_max_row_per_segment - _num_rows_written; + int64_t count_rows = (int64_t)_opts.max_rows_per_segment - _num_rows_written; return std::min(size_rows, count_rows); } diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index 9c667ee92fc3b1..41c3d5da3a7d15 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -23,18 +23,14 @@ #include #include -#include #include #include // unique_ptr #include -#include #include #include "common/status.h" // Status #include "gen_cpp/segment_v2.pb.h" -#include "gutil/macros.h" #include "gutil/strings/substitute.h" -#include "io/fs/file_system.h" #include "olap/olap_define.h" #include "olap/rowset/segment_v2/column_writer.h" #include "olap/tablet.h" @@ -71,11 +67,13 @@ extern const uint32_t k_segment_magic_length; struct SegmentWriterOptions { uint32_t num_rows_per_block = 1024; + uint32_t max_rows_per_segment = UINT32_MAX; bool enable_unique_key_merge_on_write = false; CompressionTypePB compression_type = UNKNOWN_COMPRESSION; RowsetWriterContext* rowset_ctx = nullptr; DataWriteType write_type = DataWriteType::TYPE_DEFAULT; + std::shared_ptr mow_ctx; }; using TabletSharedPtr = std::shared_ptr; @@ -84,8 +82,7 @@ class 
SegmentWriter { public: explicit SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, DataDir* data_dir, - uint32_t max_row_per_segment, const SegmentWriterOptions& opts, - std::shared_ptr mow_context, + const SegmentWriterOptions& opts, io::FileWriterPtr inverted_file_writer = nullptr); ~SegmentWriter(); @@ -120,7 +117,7 @@ class SegmentWriter { Status finalize(uint64_t* segment_file_size, uint64_t* index_size); - uint32_t get_segment_id() { return _segment_id; } + uint32_t get_segment_id() const { return _segment_id; } Status finalize_columns_data(); Status finalize_columns_index(uint64_t* index_size); @@ -192,7 +189,6 @@ class SegmentWriter { TabletSchemaSPtr _tablet_schema; BaseTabletSPtr _tablet; DataDir* _data_dir = nullptr; - uint32_t _max_row_per_segment; SegmentWriterOptions _opts; // Not owned. owned by RowsetWriter or SegmentFlusher diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 5d2d6ac0769f14..34cfed8502fded 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -80,11 +79,14 @@ using namespace ErrorCode; static const char* k_segment_magic = "D0R1"; static const uint32_t k_segment_magic_length = 4; +inline std::string vertical_segment_writer_mem_tracker_name(uint32_t segment_id) { + return "VerticalSegmentWriter:Segment-" + std::to_string(segment_id); +} + VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, - DataDir* data_dir, uint32_t max_row_per_segment, + DataDir* data_dir, const VerticalSegmentWriterOptions& opts, - std::shared_ptr mow_context, io::FileWriterPtr inverted_file_writer) : _segment_id(segment_id), 
_tablet_schema(std::move(tablet_schema)), @@ -92,9 +94,9 @@ VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32 _data_dir(data_dir), _opts(opts), _file_writer(file_writer), - _mem_tracker(std::make_unique("VerticalSegmentWriter:Segment-" + - std::to_string(segment_id))), - _mow_context(std::move(mow_context)) { + _mem_tracker(std::make_unique( + vertical_segment_writer_mem_tracker_name(segment_id))), + _mow_context(std::move(opts.mow_ctx)) { CHECK_NOTNULL(file_writer); _num_key_columns = _tablet_schema->num_key_columns(); _num_short_key_columns = _tablet_schema->num_short_key_columns(); @@ -221,8 +223,11 @@ Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo if (column.is_row_store_column()) { // smaller page size for row store column - opts.data_page_size = config::row_column_page_size; + auto page_size = _tablet_schema->row_store_page_size(); + opts.data_page_size = + (page_size > 0) ? page_size : segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE; } + std::unique_ptr writer; RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); RETURN_IF_ERROR(writer->init()); @@ -712,7 +717,9 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock& continue; } if (_flush_schema == nullptr) { - _flush_schema = std::make_shared(*_tablet_schema); + _flush_schema = std::make_shared(); + // deep copy + _flush_schema->copy_from(*_tablet_schema); } auto column_ref = data.block->get_by_position(i).column; const vectorized::ColumnObject& object_column = assert_cast( diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h index 8068b3e44be6c8..c52deea40a0359 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h @@ -68,6 +68,7 @@ struct VerticalSegmentWriterOptions { RowsetWriterContext* rowset_ctx = nullptr; DataWriteType write_type = 
DataWriteType::TYPE_DEFAULT; + std::shared_ptr mow_ctx; }; struct RowsInBlock { @@ -80,9 +81,7 @@ class VerticalSegmentWriter { public: explicit VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, - DataDir* data_dir, uint32_t max_row_per_segment, - const VerticalSegmentWriterOptions& opts, - std::shared_ptr mow_context, + DataDir* data_dir, const VerticalSegmentWriterOptions& opts, io::FileWriterPtr inverted_file_writer = nullptr); ~VerticalSegmentWriter(); diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp index 1de7d4f50dce8c..1db74843697a76 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -83,36 +83,32 @@ Status VerticalBetaRowsetWriter::add_columns(const vectorized::Block* block, RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, 0, num_rows)); } else { // value columns - uint32_t num_rows_written = _segment_writers[_cur_writer_idx]->num_rows_written(); - VLOG_NOTICE << "num_rows_written: " << num_rows_written - << ", _cur_writer_idx: " << _cur_writer_idx; - uint32_t num_rows_key_group = _segment_writers[_cur_writer_idx]->row_count(); - // init if it's first value column write in current segment - if (_cur_writer_idx == 0 && num_rows_written == 0) { - VLOG_NOTICE << "init first value column segment writer"; - RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->init(col_ids, is_key)); - } - // when splitting segment, need to make rows align between key columns and value columns - size_t start_offset = 0; - size_t limit = num_rows; - if (num_rows_written + num_rows >= num_rows_key_group && - _cur_writer_idx < _segment_writers.size() - 1) { - RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block( - block, 0, num_rows_key_group - num_rows_written)); - RETURN_IF_ERROR(_flush_columns(_segment_writers[_cur_writer_idx].get())); 
- start_offset = num_rows_key_group - num_rows_written; - limit = num_rows - start_offset; - ++_cur_writer_idx; - // switch to next writer - RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->init(col_ids, is_key)); - num_rows_written = 0; - num_rows_key_group = _segment_writers[_cur_writer_idx]->row_count(); - } - if (limit > 0) { - RETURN_IF_ERROR( - _segment_writers[_cur_writer_idx]->append_block(block, start_offset, limit)); - DCHECK(_segment_writers[_cur_writer_idx]->num_rows_written() <= - _segment_writers[_cur_writer_idx]->row_count()); + int64_t left = num_rows; + while (left > 0) { + uint32_t num_rows_written = _segment_writers[_cur_writer_idx]->num_rows_written(); + VLOG_NOTICE << "num_rows_written: " << num_rows_written + << ", _cur_writer_idx: " << _cur_writer_idx; + uint32_t num_rows_key_group = _segment_writers[_cur_writer_idx]->row_count(); + CHECK_LT(num_rows_written, num_rows_key_group); + // init if it's first value column write in current segment + if (num_rows_written == 0) { + VLOG_NOTICE << "init first value column segment writer"; + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->init(col_ids, is_key)); + } + + int64_t to_write = num_rows_written + left >= num_rows_key_group + ? 
num_rows_key_group - num_rows_written + : left; + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, num_rows - left, + to_write)); + left -= to_write; + CHECK_GE(left, 0); + + if (num_rows_key_group == num_rows_written + to_write && + _cur_writer_idx < _segment_writers.size() - 1) { + RETURN_IF_ERROR(_flush_columns(_segment_writers[_cur_writer_idx].get())); + ++_cur_writer_idx; + } } } if (is_key) { @@ -190,9 +186,10 @@ Status VerticalBetaRowsetWriter::_create_segment_writer( segment_v2::SegmentWriterOptions writer_options; writer_options.enable_unique_key_merge_on_write = context.enable_unique_key_merge_on_write; writer_options.rowset_ctx = &context; - *writer = std::make_unique( - file_writer.get(), seg_id, context.tablet_schema, context.tablet, context.data_dir, - context.max_rows_per_segment, writer_options, nullptr); + writer_options.max_rows_per_segment = context.max_rows_per_segment; + *writer = std::make_unique(file_writer.get(), seg_id, + context.tablet_schema, context.tablet, + context.data_dir, writer_options); RETURN_IF_ERROR(this->_seg_files.add(seg_id, std::move(file_writer))); auto s = (*writer)->init(column_ids, is_key); diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index 2fc3d58b49141c..85006cc183a79d 100644 --- a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -36,6 +36,7 @@ #include "io/fs/file_writer.h" // IWYU pragma: keep #include "olap/calc_delete_bitmap_executor.h" #include "olap/olap_define.h" +#include "olap/partial_update_info.h" #include "olap/rowset/beta_rowset.h" #include "olap/rowset/beta_rowset_writer.h" #include "olap/rowset/pending_rowset_helper.h" @@ -123,7 +124,7 @@ void RowsetBuilder::_garbage_collection() { Status BaseRowsetBuilder::init_mow_context(std::shared_ptr& mow_context) { std::lock_guard lck(tablet()->get_header_lock()); - int64_t cur_max_version = tablet()->max_version_unlocked(); + _max_version_in_flush_phase = 
tablet()->max_version_unlocked(); std::vector rowset_ptrs; // tablet is under alter process. The delete bitmap will be calculated after conversion. if (tablet()->tablet_state() == TABLET_NOTREADY) { @@ -135,12 +136,13 @@ Status BaseRowsetBuilder::init_mow_context(std::shared_ptr& mow_cont } _rowset_ids.clear(); } else { - RETURN_IF_ERROR(tablet()->get_all_rs_id_unlocked(cur_max_version, &_rowset_ids)); + RETURN_IF_ERROR( + tablet()->get_all_rs_id_unlocked(_max_version_in_flush_phase, &_rowset_ids)); rowset_ptrs = tablet()->get_rowset_by_ids(&_rowset_ids); } _delete_bitmap = std::make_shared(tablet()->tablet_id()); - mow_context = std::make_shared(cur_max_version, _req.txn_id, _rowset_ids, - rowset_ptrs, _delete_bitmap); + mow_context = std::make_shared(_max_version_in_flush_phase, _req.txn_id, + _rowset_ids, rowset_ptrs, _delete_bitmap); return Status::OK(); } @@ -170,8 +172,10 @@ Status RowsetBuilder::check_tablet_version_count() { Status RowsetBuilder::prepare_txn() { std::shared_lock base_migration_lock(tablet()->get_migration_lock(), std::defer_lock); - if (!base_migration_lock.try_lock_for(std::chrono::milliseconds(30))) { - return Status::Error("try migration lock failed"); + if (!base_migration_lock.try_lock_for( + std::chrono::milliseconds(config::migration_lock_timeout_ms))) { + return Status::Error("try_lock migration lock failed after {}ms", + config::migration_lock_timeout_ms); } std::lock_guard push_lock(tablet()->get_push_lock()); return _engine.txn_manager()->prepare_txn(_req.partition_id, *tablet(), _req.txn_id, @@ -406,7 +410,8 @@ void BaseRowsetBuilder::_build_current_tablet_schema(int64_t index_id, table_schema_param->partial_update_input_columns(), table_schema_param->is_strict_mode(), table_schema_param->timestamp_ms(), table_schema_param->timezone(), - table_schema_param->auto_increment_coulumn()); + table_schema_param->auto_increment_coulumn(), + _max_version_in_flush_phase); } } // namespace doris diff --git a/be/src/olap/rowset_builder.h 
b/be/src/olap/rowset_builder.h index e54faee3435c79..7fd578037363a0 100644 --- a/be/src/olap/rowset_builder.h +++ b/be/src/olap/rowset_builder.h @@ -106,6 +106,7 @@ class BaseRowsetBuilder { std::unique_ptr _calc_delete_bitmap_token; // current rowset_ids, used to do diff in publish_version RowsetIdUnorderedSet _rowset_ids; + int64_t _max_version_in_flush_phase {-1}; std::shared_ptr _partial_update_info; diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index 599d9c1d1423ca..38dbcf1c429bb4 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -291,6 +291,7 @@ Status BlockChanger::change_block(vectorized::Block* ref_block, // swap ref_block[key] and new_block[value] std::list> swap_idx_list; for (int idx = 0; idx < column_size; idx++) { + // just for MV, schema change should not run into this branch if (_schema_mapping[idx].expr != nullptr) { vectorized::VExprContextSPtr ctx; RETURN_IF_ERROR(vectorized::VExpr::create_expr_tree(*_schema_mapping[idx].expr, ctx)); @@ -367,7 +368,7 @@ Status BlockChanger::change_block(vectorized::Block* ref_block, return Status::OK(); } -// This check is to prevent schema-change from causing data loss +// This check is for MV to prevent schema-change from causing data loss Status BlockChanger::_check_cast_valid(vectorized::ColumnPtr ref_column, vectorized::ColumnPtr new_column, AlterTabletType type) { if (ref_column->size() != new_column->size()) { diff --git a/be/src/olap/schema_change.h b/be/src/olap/schema_change.h index eb0f046270db2b..64ab0c724d0345 100644 --- a/be/src/olap/schema_change.h +++ b/be/src/olap/schema_change.h @@ -117,8 +117,8 @@ class SchemaChange { _filtered_rows = 0; _merged_rows = 0; - RETURN_IF_ERROR(_inner_process(rowset_reader, rowset_writer, new_tablet, base_tablet_schema, - new_tablet_schema)); + RETURN_IF_ERROR_OR_CATCH_EXCEPTION(_inner_process(rowset_reader, rowset_writer, new_tablet, + base_tablet_schema, new_tablet_schema)); // Check row num changes if 
(!_check_row_nums(rowset_reader, *rowset_writer)) { diff --git a/be/src/olap/simple_rowid_conversion.h b/be/src/olap/simple_rowid_conversion.h new file mode 100644 index 00000000000000..1a89b01838fe8c --- /dev/null +++ b/be/src/olap/simple_rowid_conversion.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "olap/olap_common.h" +#include "olap/utils.h" + +namespace doris { + +// Simple verion of rowid conversion, for segcompaction +// convert rows from several segments to rows in 1 segment +class SimpleRowIdConversion { +public: + SimpleRowIdConversion(const RowsetId& rowset_id) : _rowst_id(rowset_id) {}; + ~SimpleRowIdConversion() = default; + + // resize segment rowid map to its rows num + void reset_segment_map(const std::map& num_rows) { + _cur_dst_segment_rowid = 0; + for (auto seg_rows : num_rows) { + _segments_rowid_map.emplace(seg_rows.first, + std::vector(seg_rows.second, UINT32_MAX)); + } + } + + // add row id to the map + void add(const std::vector& rss_row_ids) { + for (auto& item : rss_row_ids) { + if (item.row_id == -1) { + continue; + } + DCHECK(_segments_rowid_map.find(item.segment_id) != _segments_rowid_map.end() && + _segments_rowid_map[item.segment_id].size() > item.row_id); + _segments_rowid_map[item.segment_id][item.row_id] = _cur_dst_segment_rowid++; + } + } + + // get destination RowLocation + // return non-zero if the src RowLocation does not exist + int get(const RowLocation& src) const { + auto it = _segments_rowid_map.find(src.segment_id); + if (it == _segments_rowid_map.end()) { + return -1; + } + const auto& rowid_map = it->second; + if (src.row_id >= rowid_map.size() || UINT32_MAX == rowid_map[src.row_id]) { + return -1; + } + + return rowid_map[src.row_id]; + } + +private: + // key: index indicates src segment. + // value: index indicates row id of source segment, value indicates row id of destination + // segment. UINT32_MAX indicates current row not exist. 
+ std::map> _segments_rowid_map; + + // dst rowset id + RowsetId _rowst_id; + + // current rowid of dst segment + std::uint32_t _cur_dst_segment_rowid = 0; +}; + +} // namespace doris diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 43093d3183e438..ae820364b89b2a 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -18,29 +18,27 @@ #include "olap/storage_engine.h" // IWYU pragma: no_include -#include -#include // IWYU pragma: keep #include #include #include #include #include #include -#include -#include #include #include #include #include #include +#include +#include // IWYU pragma: keep #include +#include +#include #include #include #include -#include #include -#include #include #include #include @@ -52,28 +50,21 @@ #include "common/logging.h" #include "common/status.h" #include "gutil/strings/substitute.h" -#include "io/fs/file_system.h" #include "io/fs/local_file_system.h" -#include "olap/base_compaction.h" #include "olap/binlog.h" -#include "olap/cumulative_compaction.h" #include "olap/data_dir.h" -#include "olap/full_compaction.h" #include "olap/memtable_flush_executor.h" #include "olap/olap_common.h" #include "olap/olap_define.h" -#include "olap/olap_meta.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_meta_manager.h" #include "olap/rowset/unique_rowset_id_generator.h" #include "olap/schema_cache.h" -#include "olap/segment_loader.h" #include "olap/single_replica_compaction.h" #include "olap/snapshot_manager.h" #include "olap/tablet_manager.h" #include "olap/tablet_meta.h" #include "olap/tablet_meta_manager.h" -#include "olap/task/engine_task.h" #include "olap/txn_manager.h" #include "runtime/stream_load/stream_load_recorder.h" #include "util/doris_metrics.h" @@ -83,7 +74,6 @@ #include "util/stopwatch.hpp" #include "util/thread.h" #include "util/threadpool.h" -#include "util/trace.h" #include "util/uid_util.h" #include "util/work_thread_pool.hpp" @@ -134,6 +124,64 @@ int64_t 
BaseStorageEngine::memory_limitation_bytes_per_thread_for_schema_change( config::memory_limitation_per_thread_for_schema_change_bytes); } +Status BaseStorageEngine::init_stream_load_recorder(const std::string& stream_load_record_path) { + LOG(INFO) << "stream load record path: " << stream_load_record_path; + // init stream load record rocksdb + _stream_load_recorder = StreamLoadRecorder::create_shared(stream_load_record_path); + if (_stream_load_recorder == nullptr) { + RETURN_NOT_OK_STATUS_WITH_WARN( + Status::MemoryAllocFailed("allocate memory for StreamLoadRecorder failed"), + "new StreamLoadRecorder failed"); + } + auto st = _stream_load_recorder->init(); + if (!st.ok()) { + RETURN_NOT_OK_STATUS_WITH_WARN( + Status::IOError("open StreamLoadRecorder rocksdb failed, path={}", + stream_load_record_path), + "init StreamLoadRecorder failed"); + } + return Status::OK(); +} + +void CompactionSubmitRegistry::jsonfy_compaction_status(std::string* result) { + rapidjson::Document root; + root.SetObject(); + + auto add_node = [&root](const std::string& name, const Registry& registry) { + rapidjson::Value key; + key.SetString(name.c_str(), name.length(), root.GetAllocator()); + rapidjson::Document path_obj; + path_obj.SetObject(); + for (const auto& it : registry) { + const auto& dir = it.first->path(); + rapidjson::Value path_key; + path_key.SetString(dir.c_str(), dir.length(), root.GetAllocator()); + + rapidjson::Document arr; + arr.SetArray(); + + for (const auto& tablet : it.second) { + rapidjson::Value key; + auto key_str = std::to_string(tablet->tablet_id()); + key.SetString(key_str.c_str(), key_str.length(), root.GetAllocator()); + arr.PushBack(key, root.GetAllocator()); + } + path_obj.AddMember(path_key, arr, root.GetAllocator()); + } + root.AddMember(key, path_obj, root.GetAllocator()); + }; + + std::unique_lock l(_tablet_submitted_compaction_mutex); + add_node("BaseCompaction", _tablet_submitted_base_compaction); + add_node("CumulativeCompaction", 
_tablet_submitted_cumu_compaction); + add_node("FullCompaction", _tablet_submitted_full_compaction); + + rapidjson::StringBuffer str_buf; + rapidjson::PrettyWriter writer(str_buf); + root.Accept(writer); + *result = std::string(str_buf.GetString()); +} + static Status _validate_options(const EngineOptions& options) { if (options.store_paths.empty()) { return Status::InternalError("store paths is empty"); @@ -158,7 +206,6 @@ StorageEngine::StorageEngine(const EngineOptions& options) _tablet_manager(new TabletManager(*this, config::tablet_map_shard_size)), _txn_manager(new TxnManager(*this, config::txn_map_shard_size, config::txn_shard_size)), _default_rowset_type(BETA_ROWSET), - _stream_load_recorder(nullptr), _create_tablet_idx_lru_cache( new CreateTabletIdxCache(config::partition_disk_index_lru_size)), _snapshot_mgr(std::make_unique(*this)) { @@ -274,31 +321,12 @@ Status StorageEngine::_init_store_map() { return Status::InternalError("init path failed, error={}", error_msg); } - RETURN_NOT_OK_STATUS_WITH_WARN(_init_stream_load_recorder(_options.store_paths[0].path), + RETURN_NOT_OK_STATUS_WITH_WARN(init_stream_load_recorder(_options.store_paths[0].path), "init StreamLoadRecorder failed"); return Status::OK(); } -Status StorageEngine::_init_stream_load_recorder(const std::string& stream_load_record_path) { - LOG(INFO) << "stream load record path: " << stream_load_record_path; - // init stream load record rocksdb - _stream_load_recorder = StreamLoadRecorder::create_shared(stream_load_record_path); - if (_stream_load_recorder == nullptr) { - RETURN_NOT_OK_STATUS_WITH_WARN( - Status::MemoryAllocFailed("allocate memory for StreamLoadRecorder failed"), - "new StreamLoadRecorder failed"); - } - auto st = _stream_load_recorder->init(); - if (!st.ok()) { - RETURN_NOT_OK_STATUS_WITH_WARN( - Status::IOError("open StreamLoadRecorder rocksdb failed, path={}", - stream_load_record_path), - "init StreamLoadRecorder failed"); - } - return Status::OK(); -} - void 
StorageEngine::_update_storage_medium_type_count() { set available_storage_medium_types; @@ -1384,89 +1412,8 @@ bool StorageEngine::should_fetch_from_peer(int64_t tablet_id) { // "/home/disk2" : [10003] // } // } -Status StorageEngine::get_compaction_status_json(std::string* result) { - rapidjson::Document root; - root.SetObject(); - - std::unique_lock lock(_tablet_submitted_compaction_mutex); - const std::string& cumu = "CumulativeCompaction"; - rapidjson::Value cumu_key; - cumu_key.SetString(cumu.c_str(), cumu.length(), root.GetAllocator()); - - // cumu - rapidjson::Document path_obj; - path_obj.SetObject(); - for (auto& it : _tablet_submitted_cumu_compaction) { - const std::string& dir = it.first->path(); - rapidjson::Value path_key; - path_key.SetString(dir.c_str(), dir.length(), path_obj.GetAllocator()); - - rapidjson::Document arr; - arr.SetArray(); - - for (auto& tablet : it.second) { - rapidjson::Value key; - const std::string& key_str = std::to_string(tablet->tablet_id()); - key.SetString(key_str.c_str(), key_str.length(), path_obj.GetAllocator()); - arr.PushBack(key, root.GetAllocator()); - } - path_obj.AddMember(path_key, arr, path_obj.GetAllocator()); - } - root.AddMember(cumu_key, path_obj, root.GetAllocator()); - - // base - const std::string& base = "BaseCompaction"; - rapidjson::Value base_key; - base_key.SetString(base.c_str(), base.length(), root.GetAllocator()); - rapidjson::Document path_obj2; - path_obj2.SetObject(); - for (auto& it : _tablet_submitted_base_compaction) { - const std::string& dir = it.first->path(); - rapidjson::Value path_key; - path_key.SetString(dir.c_str(), dir.length(), path_obj2.GetAllocator()); - - rapidjson::Document arr; - arr.SetArray(); - - for (auto& tablet : it.second) { - rapidjson::Value key; - const std::string& key_str = std::to_string(tablet->tablet_id()); - key.SetString(key_str.c_str(), key_str.length(), path_obj2.GetAllocator()); - arr.PushBack(key, root.GetAllocator()); - } - path_obj2.AddMember(path_key, 
arr, path_obj2.GetAllocator()); - } - root.AddMember(base_key, path_obj2, root.GetAllocator()); - - // full - const std::string& full = "FullCompaction"; - rapidjson::Value full_key; - full_key.SetString(full.c_str(), full.length(), root.GetAllocator()); - rapidjson::Document path_obj3; - path_obj3.SetObject(); - for (auto& it : _tablet_submitted_full_compaction) { - const std::string& dir = it.first->path(); - rapidjson::Value path_key; - path_key.SetString(dir.c_str(), dir.length(), path_obj3.GetAllocator()); - - rapidjson::Document arr; - arr.SetArray(); - - for (auto& tablet : it.second) { - rapidjson::Value key; - const std::string& key_str = std::to_string(tablet->tablet_id()); - key.SetString(key_str.c_str(), key_str.length(), path_obj3.GetAllocator()); - arr.PushBack(key, root.GetAllocator()); - } - path_obj3.AddMember(path_key, arr, path_obj3.GetAllocator()); - } - root.AddMember(full_key, path_obj3, root.GetAllocator()); - - rapidjson::StringBuffer strbuf; - rapidjson::PrettyWriter writer(strbuf); - root.Accept(writer); - *result = std::string(strbuf.GetString()); - return Status::OK(); +void StorageEngine::get_compaction_status_json(std::string* result) { + _compaction_submit_registry.jsonfy_compaction_status(result); } void BaseStorageEngine::add_quering_rowset(RowsetSharedPtr rs) { diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index 64312a2b2b7e20..7eb94047756770 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -21,10 +21,10 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -75,6 +75,8 @@ class SnapshotManager; using SegCompactionCandidates = std::vector; using SegCompactionCandidatesSharedPtr = std::shared_ptr; +using CumuCompactionPolicyTable = + std::unordered_map>; class StorageEngine; class CloudStorageEngine; @@ -133,6 +135,12 @@ class BaseStorageEngine { int get_disk_num() { return _disk_num; } + Status init_stream_load_recorder(const 
std::string& stream_load_record_path); + + const std::shared_ptr& get_stream_load_recorder() { + return _stream_load_recorder; + } + protected: void _evict_querying_rowset(); void _evict_quring_rowset_thread_callback(); @@ -157,6 +165,46 @@ class BaseStorageEngine { int64_t _memory_limitation_bytes_for_schema_change; int _disk_num {-1}; + + std::shared_ptr _stream_load_recorder; +}; + +class CompactionSubmitRegistry { + using TabletSet = std::unordered_set; + using Registry = std::map; + +public: + CompactionSubmitRegistry() = default; + CompactionSubmitRegistry(CompactionSubmitRegistry&& r); + + // create a snapshot for current registry, operations to the snapshot can be lock-free. + CompactionSubmitRegistry create_snapshot(); + + void reset(const std::vector& stores); + + uint32_t count_executing_compaction(DataDir* dir, CompactionType compaction_type); + uint32_t count_executing_cumu_and_base(DataDir* dir); + + bool has_compaction_task(DataDir* dir, CompactionType compaction_type); + + bool insert(TabletSharedPtr tablet, CompactionType compaction_type); + + void remove(TabletSharedPtr tablet, CompactionType compaction_type, + std::function wakeup_cb); + + void jsonfy_compaction_status(std::string* result); + + std::vector pick_topn_tablets_for_compaction( + TabletManager* tablet_mgr, DataDir* data_dir, CompactionType compaction_type, + const CumuCompactionPolicyTable& cumu_compaction_policies, uint32_t* disk_max_score); + +private: + TabletSet& _get_tablet_set(DataDir* dir, CompactionType compaction_type); + + std::mutex _tablet_submitted_compaction_mutex; + Registry _tablet_submitted_cumu_compaction; + Registry _tablet_submitted_base_compaction; + Registry _tablet_submitted_full_compaction; }; class StorageEngine final : public BaseStorageEngine { @@ -250,13 +298,10 @@ class StorageEngine final : public BaseStorageEngine { return _stream_load_recorder; } - Status get_compaction_status_json(std::string* result); - - // check cumulative compaction config - void 
check_cumulative_compaction_config(); + void get_compaction_status_json(std::string* result); Status submit_compaction_task(TabletSharedPtr tablet, CompactionType compaction_type, - bool force); + bool force, bool eager = true); Status submit_seg_compaction_task(std::shared_ptr worker, SegCompactionCandidatesSharedPtr segments); @@ -316,9 +361,6 @@ class StorageEngine final : public BaseStorageEngine { // delete tablet with io error process function void _disk_stat_monitor_thread_callback(); - // clean file descriptors cache - void _cache_clean_callback(); - // path gc process function void _path_gc_thread_callback(DataDir* data_dir); @@ -344,13 +386,9 @@ class StorageEngine final : public BaseStorageEngine { bool check_score); void _update_cumulative_compaction_policy(); - bool _push_tablet_into_submitted_compaction(TabletSharedPtr tablet, - CompactionType compaction_type); void _pop_tablet_from_submitted_compaction(TabletSharedPtr tablet, CompactionType compaction_type); - Status _init_stream_load_recorder(const std::string& stream_load_record_path); - Status _submit_compaction_task(TabletSharedPtr tablet, CompactionType compaction_type, bool force); @@ -389,8 +427,6 @@ class StorageEngine final : public BaseStorageEngine { int32_t _auto_get_interval_by_disk_capacity(DataDir* data_dir); - int _get_executing_compaction_num(std::unordered_set& compaction_tasks); - private: EngineOptions _options; std::mutex _store_lock; @@ -451,11 +487,7 @@ class StorageEngine final : public BaseStorageEngine { CompactionPermitLimiter _permit_limiter; - std::mutex _tablet_submitted_compaction_mutex; - // a tablet can do base and cumulative compaction at same time - std::map> _tablet_submitted_cumu_compaction; - std::map> _tablet_submitted_base_compaction; - std::map> _tablet_submitted_full_compaction; + CompactionSubmitRegistry _compaction_submit_registry; std::mutex _low_priority_task_nums_mutex; std::unordered_map _low_priority_task_nums; @@ -470,11 +502,8 @@ class StorageEngine 
final : public BaseStorageEngine { std::mutex _compaction_producer_sleep_mutex; std::condition_variable _compaction_producer_sleep_cv; - std::shared_ptr _stream_load_recorder; - // we use unordered_map to store all cumulative compaction policy sharded ptr - std::unordered_map> - _cumulative_compaction_policies; + CumuCompactionPolicyTable _cumulative_compaction_policies; scoped_refptr _cooldown_tasks_producer_thread; scoped_refptr _remove_unused_remote_files_thread; @@ -496,9 +525,6 @@ class StorageEngine final : public BaseStorageEngine { scoped_refptr _async_publish_thread; std::shared_mutex _async_publish_lock; - bool _clear_segment_cache = false; - bool _clear_page_cache = false; - std::atomic _need_clean_trash {false}; // next index for create tablet diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 8b6ebc2c395252..1a1d3be6bc9040 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -108,7 +108,6 @@ #include "service/point_query_executor.h" #include "tablet.h" #include "util/bvar_helper.h" -#include "util/crc32c.h" #include "util/debug_points.h" #include "util/defer_op.h" #include "util/doris_metrics.h" @@ -1714,7 +1713,12 @@ Status Tablet::prepare_compaction_and_calculate_permits( } } - permits = compaction->get_compaction_permits(); + // Time series policy does not rely on permits, it uses goal size to control memory + if (tablet->tablet_meta()->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY) { + permits = 0; + } else { + permits = compaction->get_compaction_permits(); + } return Status::OK(); } @@ -2647,35 +2651,4 @@ void Tablet::clear_cache() { recycle_segment_cache(stale_rowset_map()); } -Status Tablet::calc_local_file_crc(uint32_t* crc_value, int64_t start_version, int64_t end_version, - int32_t* rowset_count, int64_t* file_count) { - Version v(start_version, end_version); - std::vector rowsets; - traverse_rowsets([&rowsets, &v](const auto& rs) { - // get local rowsets - if (rs->is_local() && 
v.contains(rs->version())) { - rowsets.emplace_back(rs); - } - }); - std::sort(rowsets.begin(), rowsets.end(), Rowset::comparator); - *rowset_count = rowsets.size(); - - *crc_value = 0; - *file_count = 0; - for (const auto& rs : rowsets) { - uint32_t rs_crc_value; - int64_t rs_file_count = 0; - auto rowset = std::static_pointer_cast(rs); - auto st = rowset->calc_local_file_crc(&rs_crc_value, &rs_file_count); - if (!st.ok()) { - return st; - } - // crc_value is calculated based on the crc_value of each rowset. - *crc_value = crc32c::Extend(*crc_value, reinterpret_cast(&rs_crc_value), - sizeof(rs_crc_value)); - *file_count += rs_file_count; - } - return Status::OK(); -} - } // namespace doris diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 759e3e65614f2e..fa11c2d868569f 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -421,18 +421,6 @@ class Tablet final : public BaseTablet { int64_t start = -1); bool should_skip_compaction(CompactionType compaction_type, int64_t now); - void traverse_rowsets(std::function visitor, - bool include_stale = false) { - std::shared_lock rlock(_meta_lock); - for (auto& [v, rs] : _rs_version_map) { - visitor(rs); - } - if (!include_stale) return; - for (auto& [v, rs] : _stale_rs_version_map) { - visitor(rs); - } - } - std::vector get_binlog_filepath(std::string_view binlog_version) const; std::pair get_binlog_info(std::string_view binlog_version) const; std::string get_rowset_binlog_meta(std::string_view binlog_version, @@ -483,8 +471,6 @@ class Tablet final : public BaseTablet { } inline bool is_full_compaction_running() const { return _is_full_compaction_running; } void clear_cache() override; - Status calc_local_file_crc(uint32_t* crc_value, int64_t start_version, int64_t end_version, - int32_t* rowset_count, int64_t* file_count); private: Status _init_once_action(); diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index a234ab93a47638..6696dcf2e68df2 100644 --- 
a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -226,7 +226,7 @@ Status TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id, // If the new tablet is fresher than the existing one, then replace // the existing tablet with the new one. // Use default replica_id to ignore whether replica_id is match when drop tablet. - Status status = _drop_tablet_unlocked(tablet_id, /* replica_id */ 0, keep_files, false); + Status status = _drop_tablet(tablet_id, /* replica_id */ 0, keep_files, false, true); COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropOldTablet", "AddTablet"), static_cast(watch.reset())); RETURN_NOT_OK_STATUS_WITH_WARN( @@ -438,7 +438,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked( } // something is wrong, we need clear environment if (is_tablet_added) { - Status status = _drop_tablet_unlocked(new_tablet_id, request.replica_id, false, false); + Status status = _drop_tablet(new_tablet_id, request.replica_id, false, false, true); COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropTablet", parent_timer_name), static_cast(watch.reset())); if (!status.ok()) { @@ -522,14 +522,12 @@ TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked( Status TabletManager::drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool is_drop_table_or_partition) { - auto& shard = _get_tablets_shard(tablet_id); - std::lock_guard wrlock(shard.lock); - return _drop_tablet_unlocked(tablet_id, replica_id, false, is_drop_table_or_partition); + return _drop_tablet(tablet_id, replica_id, false, is_drop_table_or_partition, false); } // Drop specified tablet. -Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId replica_id, - bool keep_files, bool is_drop_table_or_partition) { +Status TabletManager::_drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, + bool is_drop_table_or_partition, bool had_held_shard_lock) { LOG(INFO) << "begin drop tablet. 
tablet_id=" << tablet_id << ", replica_id=" << replica_id << ", is_drop_table_or_partition=" << is_drop_table_or_partition; DorisMetrics::instance()->drop_tablet_requests_total->increment(1); @@ -538,23 +536,31 @@ Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId repl Defer defer {[&]() { unregister_transition_tablet(tablet_id, "drop tablet"); }}; // Fetch tablet which need to be dropped - TabletSharedPtr to_drop_tablet = _get_tablet_unlocked(tablet_id); - if (to_drop_tablet == nullptr) { - LOG(WARNING) << "fail to drop tablet because it does not exist. " - << "tablet_id=" << tablet_id; - return Status::OK(); - } + TabletSharedPtr to_drop_tablet; + { + std::unique_lock wlock(_get_tablets_shard_lock(tablet_id), + std::defer_lock); + if (!had_held_shard_lock) { + wlock.lock(); + } + to_drop_tablet = _get_tablet_unlocked(tablet_id); + if (to_drop_tablet == nullptr) { + LOG(WARNING) << "fail to drop tablet because it does not exist. " + << "tablet_id=" << tablet_id; + return Status::OK(); + } - // We should compare replica id to avoid dropping new cloned tablet. - // Iff request replica id is 0, FE may be an older release, then we drop this tablet as before. - if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) { - return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(), - replica_id); - } + // We should compare replica id to avoid dropping new cloned tablet. + // Iff request replica id is 0, FE may be an older release, then we drop this tablet as before. 
+ if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) { + return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(), + replica_id); + } - _remove_tablet_from_partition(to_drop_tablet); - tablet_map_t& tablet_map = _get_tablet_map(tablet_id); - tablet_map.erase(tablet_id); + _remove_tablet_from_partition(to_drop_tablet); + tablet_map_t& tablet_map = _get_tablet_map(tablet_id); + tablet_map.erase(tablet_id); + } to_drop_tablet->clear_cache(); diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h index 809a2237356dd7..42623cf05f2aea 100644 --- a/be/src/olap/tablet_manager.h +++ b/be/src/olap/tablet_manager.h @@ -194,8 +194,8 @@ class TabletManager { bool _check_tablet_id_exist_unlocked(TTabletId tablet_id); - Status _drop_tablet_unlocked(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, - bool is_drop_table_or_partition); + Status _drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, + bool is_drop_table_or_partition, bool had_held_shard_lock); TabletSharedPtr _get_tablet_unlocked(TTabletId tablet_id); TabletSharedPtr _get_tablet_unlocked(TTabletId tablet_id, bool include_deleted, diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index cced41a86eee8b..a3526781dddd87 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -317,6 +317,9 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id if (tablet_schema.__isset.store_row_column) { schema->set_store_row_column(tablet_schema.store_row_column); } + if (tablet_schema.__isset.row_store_page_size) { + schema->set_row_store_page_size(tablet_schema.row_store_page_size); + } if (tablet_schema.__isset.skip_write_index_on_load) { schema->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load); } diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp index e65a10ac73eccb..631a041379a9af 100644 --- a/be/src/olap/tablet_reader.cpp +++ 
b/be/src/olap/tablet_reader.cpp @@ -640,14 +640,19 @@ Status TabletReader::_init_delete_condition(const ReaderParams& read_params) { !config::enable_delete_when_cumu_compaction)) { return Status::OK(); } - // Only BASE_COMPACTION and COLD_DATA_COMPACTION and CUMULATIVE_COMPACTION need set filter_delete = true - // other reader type: - // QUERY will filter the row in query layer to keep right result use where clause. - _filter_delete = (read_params.reader_type == ReaderType::READER_BASE_COMPACTION || - read_params.reader_type == ReaderType::READER_COLD_DATA_COMPACTION || - ((read_params.reader_type == ReaderType::READER_CUMULATIVE_COMPACTION && - config::enable_delete_when_cumu_compaction)) || - read_params.reader_type == ReaderType::READER_CHECKSUM); + bool cumu_delete = read_params.reader_type == ReaderType::READER_CUMULATIVE_COMPACTION && + config::enable_delete_when_cumu_compaction; + // Delete sign could not be applied when delete on cumu compaction is enabled, bucause it is meant for delete with predicates. + // If delete design is applied on cumu compaction, it will lose effect when doing base compaction. + // `_delete_sign_available` indicates the condition where we could apply delete signs to data. + _delete_sign_available = (read_params.reader_type == ReaderType::READER_BASE_COMPACTION || + read_params.reader_type == ReaderType::READER_COLD_DATA_COMPACTION || + read_params.reader_type == ReaderType::READER_CHECKSUM); + + // `_filter_delete` indicates the condition where we should execlude deleted tuples when reading data. + // However, queries will not use this condition but generate special where predicates to filter data. + // (Though a lille bit confused, it is how the current logic working...) + _filter_delete = _delete_sign_available || cumu_delete; auto* runtime_state = read_params.runtime_state; bool enable_sub_pred_v2 = runtime_state == nullptr ? 
true : runtime_state->enable_delete_sub_pred_v2(); diff --git a/be/src/olap/tablet_reader.h b/be/src/olap/tablet_reader.h index c257ba007f531a..06c3daa653a33b 100644 --- a/be/src/olap/tablet_reader.h +++ b/be/src/olap/tablet_reader.h @@ -295,6 +295,7 @@ class TabletReader { // for agg query, we don't need to finalize when scan agg object data ReaderType _reader_type = ReaderType::READER_QUERY; bool _next_delete_flag = false; + bool _delete_sign_available = false; bool _filter_delete = false; int32_t _sequence_col_idx = -1; bool _direct_mode = false; diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index ec887f14a91377..b3dde488674bea 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -550,11 +550,18 @@ void TabletColumn::init_from_pb(const ColumnPB& column) { _visible = column.visible(); } if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) { - CHECK(column.children_columns_size() == 1) << "ARRAY type has more than 1 children types."; + CHECK(column.children_columns_size() == 1) + << "ARRAY type should has 1 children types, but got " + << column.children_columns_size(); } if (_type == FieldType::OLAP_FIELD_TYPE_MAP) { - DCHECK(column.children_columns_size() == 2) << "MAP type has more than 2 children types."; - LOG(WARNING) << "MAP type has more than 2 children types."; + DCHECK(column.children_columns_size() == 2) + << "MAP type should has 2 children types, but got " + << column.children_columns_size(); + if (UNLIKELY(column.children_columns_size() != 2)) { + LOG(WARNING) << "MAP type should has 2 children types, but got " + << column.children_columns_size(); + } } for (size_t i = 0; i < column.children_columns_size(); i++) { TabletColumn child_column; @@ -621,11 +628,15 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const { column->set_visible(_visible); if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) { - CHECK(_sub_columns.size() == 1) << "ARRAY type has more than 1 children types."; + 
CHECK(_sub_columns.size() == 1) + << "ARRAY type should has 1 children types, but got " << _sub_columns.size(); } if (_type == FieldType::OLAP_FIELD_TYPE_MAP) { - DCHECK(_sub_columns.size() == 2) << "MAP type has more than 2 children types."; - LOG(WARNING) << "MAP type has more than 2 children types."; + DCHECK(_sub_columns.size() == 2) + << "MAP type should has 2 children types, but got " << _sub_columns.size(); + if (UNLIKELY(_sub_columns.size() != 2)) { + LOG(WARNING) << "MAP type should has 2 children types, but got " << _sub_columns.size(); + } } for (size_t i = 0; i < _sub_columns.size(); i++) { @@ -990,6 +1001,7 @@ void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extrac _sort_type = schema.sort_type(); _sort_col_num = schema.sort_col_num(); _compression_type = schema.compression_type(); + _row_store_page_size = schema.row_store_page_size(); _schema_version = schema.schema_version(); // Default to V1 inverted index storage format for backward compatibility if not specified in schema. 
if (!schema.has_inverted_index_storage_format()) { @@ -1050,6 +1062,7 @@ void TabletSchema::build_current_tablet_schema(int64_t index_id, int32_t version _skip_write_index_on_load = ori_tablet_schema.skip_write_index_on_load(); _sort_type = ori_tablet_schema.sort_type(); _sort_col_num = ori_tablet_schema.sort_col_num(); + _row_store_page_size = ori_tablet_schema.row_store_page_size(); // copy from table_schema_param _schema_version = version; @@ -1203,6 +1216,7 @@ void TabletSchema::to_schema_pb(TabletSchemaPB* tablet_schema_pb) const { tablet_schema_pb->set_sort_col_num(_sort_col_num); tablet_schema_pb->set_schema_version(_schema_version); tablet_schema_pb->set_compression_type(_compression_type); + tablet_schema_pb->set_row_store_page_size(_row_store_page_size); tablet_schema_pb->set_version_col_idx(_version_col_idx); tablet_schema_pb->set_inverted_index_storage_format(_inverted_index_storage_format); tablet_schema_pb->mutable_row_store_column_unique_ids()->Assign( @@ -1522,6 +1536,7 @@ bool operator==(const TabletSchema& a, const TabletSchema& b) { if (a._disable_auto_compaction != b._disable_auto_compaction) return false; if (a._enable_single_replica_compaction != b._enable_single_replica_compaction) return false; if (a._store_row_column != b._store_row_column) return false; + if (a._row_store_page_size != b._row_store_page_size) return false; if (a._skip_write_index_on_load != b._skip_write_index_on_load) return false; return true; } diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 3a78f2e4748a97..8cf6e20208c90f 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -36,6 +36,7 @@ #include "common/status.h" #include "gutil/stringprintf.h" #include "olap/olap_common.h" +#include "olap/rowset/segment_v2/options.h" #include "runtime/define_primitive_type.h" #include "runtime/descriptors.h" #include "util/string_util.h" @@ -359,6 +360,8 @@ class TabletSchema { void set_version_col_idx(int32_t version_col_idx) { 
_version_col_idx = version_col_idx; } int32_t version_col_idx() const { return _version_col_idx; } segment_v2::CompressionTypePB compression_type() const { return _compression_type; } + void set_row_store_page_size(long page_size) { _row_store_page_size = page_size; } + long row_store_page_size() const { return _row_store_page_size; } const std::vector& indexes() const { return _indexes; } bool has_inverted_index() const { @@ -508,6 +511,7 @@ class TabletSchema { size_t _num_rows_per_row_block = 0; CompressKind _compress_kind = COMPRESS_NONE; segment_v2::CompressionTypePB _compression_type = segment_v2::CompressionTypePB::LZ4F; + long _row_store_page_size = segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE; size_t _next_column_unique_id = 0; std::string _auto_increment_column; diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index 40b789cf873fcb..2b7388aa7c7aeb 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -190,7 +190,7 @@ Status EngineCloneTask::_do_clone() { tablet->replica_id(), false)); tablet.reset(); } - bool is_new_tablet = tablet == nullptr; + _is_new_tablet = tablet == nullptr; // try to incremental clone Versions missed_versions; // try to repair a tablet with missing version @@ -228,7 +228,7 @@ Status EngineCloneTask::_do_clone() { if (missed_versions.empty()) { LOG(INFO) << "missed version size = 0, skip clone and return success. 
tablet_id=" << _clone_req.tablet_id << " replica_id=" << _clone_req.replica_id; - RETURN_IF_ERROR(_set_tablet_info(is_new_tablet)); + RETURN_IF_ERROR(_set_tablet_info()); return Status::OK(); } @@ -307,10 +307,11 @@ Status EngineCloneTask::_do_clone() { TabletMeta::construct_header_file_path(tablet_dir, _clone_req.tablet_id); RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(header_path)); } - return _set_tablet_info(is_new_tablet); + + return _set_tablet_info(); } -Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) { +Status EngineCloneTask::_set_tablet_info() { // Get clone tablet info TTabletInfo tablet_info; tablet_info.__set_tablet_id(_clone_req.tablet_id); @@ -320,7 +321,7 @@ Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) { if (_clone_req.__isset.version && tablet_info.version < _clone_req.version) { // if it is a new tablet and clone failed, then remove the tablet // if it is incremental clone, then must not drop the tablet - if (is_new_tablet) { + if (_is_new_tablet) { // we need to check if this cloned table's version is what we expect. // if not, maybe this is a stale remaining table which is waiting for drop. // we drop it. 
diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index 71dc3a817b8a13..3161b803c82db1 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -55,6 +55,8 @@ class EngineCloneTask final : public EngineTask { std::vector* tablet_infos); ~EngineCloneTask() override = default; + bool is_new_tablet() const { return _is_new_tablet; } + private: Status _do_clone(); @@ -71,7 +73,7 @@ class EngineCloneTask final : public EngineTask { const std::vector& missing_versions, bool* allow_incremental_clone); - Status _set_tablet_info(bool is_new_tablet); + Status _set_tablet_info(); // Download tablet files from Status _download_files(DataDir* data_dir, const std::string& remote_url_prefix, @@ -95,6 +97,7 @@ class EngineCloneTask final : public EngineTask { int64_t _copy_size; int64_t _copy_time_ms; std::vector _pending_rs_guards; + bool _is_new_tablet = false; }; // EngineTask -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/olap/task/engine_publish_version_task.cpp b/be/src/olap/task/engine_publish_version_task.cpp index acdcebae165c6f..45150881423cf1 100644 --- a/be/src/olap/task/engine_publish_version_task.cpp +++ b/be/src/olap/task/engine_publish_version_task.cpp @@ -111,6 +111,20 @@ Status EnginePublishVersionTask::execute() { std::this_thread::sleep_for(std::chrono::milliseconds(wait)); } }); + DBUG_EXECUTE_IF("EnginePublishVersionTask::execute.enable_spin_wait", { + auto token = dp->param("token", "invalid_token"); + while (DebugPoints::instance()->is_enable("EnginePublishVersionTask::execute.block")) { + auto block_dp = DebugPoints::instance()->get_debug_point( + "EnginePublishVersionTask::execute.block"); + if (block_dp) { + auto pass_token = block_dp->param("pass_token", ""); + if (pass_token == token) { + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + }); std::unique_ptr token = 
_engine.tablet_publish_txn_thread_pool()->new_token( ThreadPool::ExecutionMode::CONCURRENT); std::unordered_map tablet_id_to_num_delta_rows; @@ -342,6 +356,8 @@ void EnginePublishVersionTask::_calculate_tbl_num_delta_rows( auto table_id = tablet->get_table_id(); if (kv.second > 0) { (*_table_id_to_tablet_id_to_num_delta_rows)[table_id][kv.first] += kv.second; + LOG(INFO) << "report delta rows to fe, table_id=" << table_id << ", tablet=" << kv.first + << ", num_rows=" << kv.second; } } } diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index 2ba75c04a3555f..e4a3332ad173cd 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -18,6 +18,7 @@ #include "olap/task/index_builder.h" #include "common/status.h" +#include "gutil/integral_types.h" #include "olap/olap_define.h" #include "olap/rowset/beta_rowset.h" #include "olap/rowset/rowset_writer_context.h" @@ -124,7 +125,28 @@ Status IndexBuilder::update_inverted_index_info() { } } _dropped_inverted_indexes.push_back(*index_meta); + // ATTN: DO NOT REMOVE INDEX AFTER OUTPUT_ROWSET_WRITER CREATED. 
+ // remove dropped index_meta from output rowset tablet schema + output_rs_tablet_schema->remove_index(index_meta->index_id()); } + DBUG_EXECUTE_IF("index_builder.update_inverted_index_info.drop_index", { + auto indexes_count = DebugPoints::instance()->get_debug_param_or_default( + "index_builder.update_inverted_index_info.drop_index", "indexes_count", 0); + if (indexes_count < 0) { + return Status::Error( + "indexes count cannot be negative"); + } + int32_t indexes_size = 0; + for (auto index : output_rs_tablet_schema->indexes()) { + if (index.index_type() == IndexType::INVERTED) { + indexes_size++; + } + } + if (indexes_count != indexes_size) { + return Status::Error( + "indexes count not equal to expected"); + } + }) } else { // base on input rowset's tablet_schema to build // output rowset's tablet_schema which only add diff --git a/be/src/olap/utils.cpp b/be/src/olap/utils.cpp index 019a2f606ce647..aa641207b2329a 100644 --- a/be/src/olap/utils.cpp +++ b/be/src/olap/utils.cpp @@ -537,7 +537,7 @@ bool valid_signed_number(const std::string& value_str) { } bool valid_decimal(const std::string& value_str, const uint32_t precision, const uint32_t frac) { - const char* decimal_pattern = "-?\\d+(.\\d+)?"; + const char* decimal_pattern = "-?(\\d+)(.\\d+)?"; std::regex e(decimal_pattern); std::smatch what; if (!std::regex_match(value_str, what, e) || what[0].str().size() != value_str.size()) { @@ -562,11 +562,14 @@ bool valid_decimal(const std::string& value_str, const uint32_t precision, const fractional_len = number_length - point_pos - 1; } - if (integer_len <= (precision - frac) && fractional_len <= frac) { - return true; - } else { - return false; + /// For value likes "0.xxxxxx", the integer_len should actually be 0. 
+ if (integer_len == 1 && precision - frac == 0) { + if (what[1].str() == "0") { + integer_len = 0; + } } + + return (integer_len <= (precision - frac) && fractional_len <= frac); } bool valid_datetime(const std::string& value_str, const uint32_t scale) { diff --git a/be/src/olap/wal/wal_table.cpp b/be/src/olap/wal/wal_table.cpp index ef98bb58ae48a4..5f1ade097d2a8d 100644 --- a/be/src/olap/wal/wal_table.cpp +++ b/be/src/olap/wal/wal_table.cpp @@ -85,7 +85,6 @@ void WalTable::_pick_relay_wals() { Status WalTable::_relay_wal_one_by_one() { std::vector> need_retry_wals; - std::vector> need_delete_wals; for (auto wal_info : _replaying_queue) { wal_info->add_retry_num(); auto st = _replay_wal_internal(wal_info->get_wal_path()); @@ -95,7 +94,12 @@ Status WalTable::_relay_wal_one_by_one() { msg.find("LabelAlreadyUsedException") != msg.npos) { LOG(INFO) << "succeed to replay wal=" << wal_info->get_wal_path() << ", st=" << st.to_string(); - need_delete_wals.push_back(wal_info); + // delete wal + WARN_IF_ERROR(_exec_env->wal_mgr()->delete_wal(_table_id, wal_info->get_wal_id()), + "failed to delete wal=" + wal_info->get_wal_path()); + if (config::group_commit_wait_replay_wal_finish) { + RETURN_IF_ERROR(_exec_env->wal_mgr()->notify_relay_wal(wal_info->get_wal_id())); + } } else { doris::wal_fail << 1; LOG(WARNING) << "failed to replay wal=" << wal_info->get_wal_path() @@ -110,13 +114,6 @@ Status WalTable::_relay_wal_one_by_one() { _replay_wal_map.emplace(retry_wal_info->get_wal_path(), retry_wal_info); } } - for (auto delete_wal_info : need_delete_wals) { - [[maybe_unused]] auto st = - _exec_env->wal_mgr()->delete_wal(_table_id, delete_wal_info->get_wal_id()); - if (config::group_commit_wait_replay_wal_finish) { - RETURN_IF_ERROR(_exec_env->wal_mgr()->notify_relay_wal(delete_wal_info->get_wal_id())); - } - } return Status::OK(); } diff --git a/be/src/pipeline/common/runtime_filter_consumer.cpp b/be/src/pipeline/common/runtime_filter_consumer.cpp index 
0e9c2d0f304c79..817c76a79af47c 100644 --- a/be/src/pipeline/common/runtime_filter_consumer.cpp +++ b/be/src/pipeline/common/runtime_filter_consumer.cpp @@ -52,7 +52,7 @@ Status RuntimeFilterConsumer::_register_runtime_filter(bool need_local_merge) { _runtime_filter_ctxs.reserve(filter_size); _runtime_filter_ready_flag.reserve(filter_size); for (int i = 0; i < filter_size; ++i) { - IRuntimeFilter* runtime_filter = nullptr; + std::shared_ptr runtime_filter; const auto& filter_desc = _runtime_filter_descs[i]; RETURN_IF_ERROR(_state->register_consumer_runtime_filter(filter_desc, need_local_merge, _filter_id, &runtime_filter)); @@ -73,9 +73,9 @@ void RuntimeFilterConsumer::init_runtime_filter_dependency( local_runtime_filter_dependencies; for (size_t i = 0; i < _runtime_filter_descs.size(); ++i) { - IRuntimeFilter* runtime_filter = _runtime_filter_ctxs[i].runtime_filter; + auto runtime_filter = _runtime_filter_ctxs[i].runtime_filter; runtime_filter_dependencies[i] = std::make_shared( - id, node_id, name, runtime_filter); + id, node_id, name, runtime_filter.get()); _runtime_filter_ctxs[i].runtime_filter_dependency = runtime_filter_dependencies[i].get(); runtime_filter_timers[i] = std::make_shared( runtime_filter->registration_time(), runtime_filter->wait_time_ms(), @@ -89,7 +89,7 @@ void RuntimeFilterConsumer::init_runtime_filter_dependency( // The gloabl runtime filter timer need set local runtime filter dependencies. 
// start to wait before the local runtime filter ready for (size_t i = 0; i < _runtime_filter_descs.size(); ++i) { - IRuntimeFilter* runtime_filter = _runtime_filter_ctxs[i].runtime_filter; + auto runtime_filter = _runtime_filter_ctxs[i].runtime_filter; if (!runtime_filter->has_local_target()) { runtime_filter_timers[i]->set_local_runtime_filter_dependencies( local_runtime_filter_dependencies); @@ -101,43 +101,22 @@ void RuntimeFilterConsumer::init_runtime_filter_dependency( } } -Status RuntimeFilterConsumer::_acquire_runtime_filter(bool pipeline_x) { +Status RuntimeFilterConsumer::_acquire_runtime_filter() { SCOPED_TIMER(_acquire_runtime_filter_timer); std::vector vexprs; for (size_t i = 0; i < _runtime_filter_descs.size(); ++i) { - IRuntimeFilter* runtime_filter = _runtime_filter_ctxs[i].runtime_filter; - if (pipeline_x) { - runtime_filter->update_state(); - if (runtime_filter->is_ready() && !_runtime_filter_ctxs[i].apply_mark) { - // Runtime filter has been applied in open phase. - RETURN_IF_ERROR(runtime_filter->get_push_expr_ctxs(_probe_ctxs, vexprs, false)); - _runtime_filter_ctxs[i].apply_mark = true; - } else if (!_runtime_filter_ctxs[i].apply_mark) { - // Runtime filter is timeout. 
- _is_all_rf_applied = false; - } - } else { - bool ready = runtime_filter->is_ready(); - if (!ready) { - ready = runtime_filter->await(); - } - if (ready && !_runtime_filter_ctxs[i].apply_mark) { - RETURN_IF_ERROR(runtime_filter->get_push_expr_ctxs(_probe_ctxs, vexprs, false)); - _runtime_filter_ctxs[i].apply_mark = true; - } else if (runtime_filter->current_state() == RuntimeFilterState::NOT_READY && - !_runtime_filter_ctxs[i].apply_mark) { - *_blocked_by_rf = true; - } else if (!_runtime_filter_ctxs[i].apply_mark) { - DCHECK(runtime_filter->current_state() != RuntimeFilterState::NOT_READY); - _is_all_rf_applied = false; - } + auto runtime_filter = _runtime_filter_ctxs[i].runtime_filter; + runtime_filter->update_state(); + if (runtime_filter->is_ready() && !_runtime_filter_ctxs[i].apply_mark) { + // Runtime filter has been applied in open phase. + RETURN_IF_ERROR(runtime_filter->get_push_expr_ctxs(_probe_ctxs, vexprs, false)); + _runtime_filter_ctxs[i].apply_mark = true; + } else if (!_runtime_filter_ctxs[i].apply_mark) { + // Runtime filter is timeout. + _is_all_rf_applied = false; } } RETURN_IF_ERROR(_append_rf_into_conjuncts(vexprs)); - if (!pipeline_x && *_blocked_by_rf) { - return Status::WaitForRf("Runtime filters are neither not ready nor timeout"); - } - return Status::OK(); } diff --git a/be/src/pipeline/common/runtime_filter_consumer.h b/be/src/pipeline/common/runtime_filter_consumer.h index 9bee6053f6f7d5..03868355875454 100644 --- a/be/src/pipeline/common/runtime_filter_consumer.h +++ b/be/src/pipeline/common/runtime_filter_consumer.h @@ -17,6 +17,8 @@ #pragma once +#include + #include "exprs/runtime_filter.h" #include "pipeline/dependency.h" @@ -45,7 +47,7 @@ class RuntimeFilterConsumer { // Register and get all runtime filters at Init phase. Status _register_runtime_filter(bool need_local_merge); // Get all arrived runtime filters at Open phase. 
- Status _acquire_runtime_filter(bool pipeline_x); + Status _acquire_runtime_filter(); // Append late-arrival runtime filters to the vconjunct_ctx. Status _append_rf_into_conjuncts(const std::vector& vexprs); @@ -55,10 +57,10 @@ class RuntimeFilterConsumer { // For runtime filters struct RuntimeFilterContext { - RuntimeFilterContext(IRuntimeFilter* rf) : runtime_filter(rf) {} + RuntimeFilterContext(std::shared_ptr rf) : runtime_filter(std::move(rf)) {} // set to true if this runtime filter is already applied to vconjunct_ctx_ptr bool apply_mark = false; - IRuntimeFilter* runtime_filter = nullptr; + std::shared_ptr runtime_filter; pipeline::RuntimeFilterDependency* runtime_filter_dependency = nullptr; }; diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index 8adc24d3b4ed7c..1e29cf904c759c 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -88,20 +88,11 @@ class Dependency : public std::enable_shared_from_this { public: ENABLE_FACTORY_CREATOR(Dependency); Dependency(int id, int node_id, std::string name) - : _id(id), - _node_id(node_id), - _name(std::move(name)), - _is_write_dependency(false), - _ready(false) {} + : _id(id), _node_id(node_id), _name(std::move(name)), _ready(false) {} Dependency(int id, int node_id, std::string name, bool ready) - : _id(id), - _node_id(node_id), - _name(std::move(name)), - _is_write_dependency(true), - _ready(ready) {} + : _id(id), _node_id(node_id), _name(std::move(name)), _ready(ready) {} virtual ~Dependency() = default; - bool is_write_dependency() const { return _is_write_dependency; } [[nodiscard]] int id() const { return _id; } [[nodiscard]] virtual std::string name() const { return _name; } BasicSharedState* shared_state() { return _shared_state; } @@ -118,12 +109,10 @@ class Dependency : public std::enable_shared_from_this { // Notify downstream pipeline tasks this dependency is ready. 
void set_ready(); void set_ready_to_read() { - DCHECK(_is_write_dependency) << debug_string(); DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); _shared_state->source_deps.front()->set_ready(); } void set_block_to_read() { - DCHECK(_is_write_dependency) << debug_string(); DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); _shared_state->source_deps.front()->block(); } @@ -166,7 +155,6 @@ class Dependency : public std::enable_shared_from_this { const int _id; const int _node_id; const std::string _name; - const bool _is_write_dependency; std::atomic _ready; BasicSharedState* _shared_state = nullptr; @@ -761,8 +749,9 @@ struct SetSharedState : public BasicSharedState { // (select 0) intersect (select null) the build side hash table should not // ignore null value. std::vector data_types; - for (const auto& ctx : child_exprs_lists[0]) { - data_types.emplace_back(build_not_ignore_null[0] + for (int i = 0; i < child_exprs_lists[0].size(); i++) { + const auto& ctx = child_exprs_lists[0][i]; + data_types.emplace_back(build_not_ignore_null[i] ? make_nullable(ctx->root()->data_type()) : ctx->root()->data_type()); } diff --git a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index 79ca07281d9859..ba93602cb81ef4 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -503,7 +503,8 @@ Status AggSinkLocalState::_execute_with_serialized_key_helper(vectorized::Block* _shared_state->reach_limit = hash_table_size >= (_shared_state->do_sort_limit - ? Base::_parent->template cast()._limit * 5 + ? 
Base::_parent->template cast()._limit * + config::topn_agg_limit_multiplier : Base::_parent->template cast()._limit); if (_shared_state->reach_limit && _shared_state->do_sort_limit) { _shared_state->build_limit_heap(hash_table_size); @@ -747,7 +748,6 @@ Status AggSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { } const auto& agg_functions = tnode.agg_node.aggregate_functions; - _external_agg_bytes_threshold = state->external_agg_bytes_threshold(); _is_merge = std::any_of(agg_functions.cbegin(), agg_functions.cend(), [](const auto& e) { return e.nodes[0].agg_expr.is_merge_agg; }); diff --git a/be/src/pipeline/exec/aggregation_sink_operator.h b/be/src/pipeline/exec/aggregation_sink_operator.h index 10a8119914045c..96f068b6dca02c 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.h +++ b/be/src/pipeline/exec/aggregation_sink_operator.h @@ -189,7 +189,6 @@ class AggSinkOperatorX final : public DataSinkOperatorX { /// The total size of the row from the aggregate functions. 
size_t _total_size_of_aggregate_states = 0; - size_t _external_agg_bytes_threshold; // group by k1,k2 vectorized::VExprContextSPtrs _probe_expr_ctxs; ObjectPool* _pool = nullptr; diff --git a/be/src/pipeline/exec/aggregation_source_operator.cpp b/be/src/pipeline/exec/aggregation_source_operator.cpp index 1b7a151e2af7ea..0c05c965f1f884 100644 --- a/be/src/pipeline/exec/aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/aggregation_source_operator.cpp @@ -460,6 +460,12 @@ void AggLocalState::do_agg_limit(vectorized::Block* block, bool* eos) { } else { reached_limit(block, eos); } + } else { + if (auto rows = block->rows()) { + _num_rows_returned += rows; + COUNTER_UPDATE(_blocks_returned_counter, 1); + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + } } } diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index a036481d727789..406108fbc4f529 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -279,8 +279,6 @@ void AnalyticLocalState::_destroy_agg_status() { } } -//now is execute for lead/lag row_number/rank/dense_rank/ntile functions -//sum min max count avg first_value last_value functions void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end) { for (size_t i = 0; i < _agg_functions_size; ++i) { @@ -292,7 +290,7 @@ void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t partition_start, partition_end, frame_start, frame_end, _fn_place_ptr + _parent->cast()._offsets_of_aggregate_states[i], - agg_columns.data(), nullptr); + agg_columns.data(), _agg_arena_pool.get()); // If the end is not greater than the start, the current window should be empty. 
_current_window_empty = diff --git a/be/src/pipeline/exec/datagen_operator.cpp b/be/src/pipeline/exec/datagen_operator.cpp index 48e428ceef42cf..dae39f179a68f2 100644 --- a/be/src/pipeline/exec/datagen_operator.cpp +++ b/be/src/pipeline/exec/datagen_operator.cpp @@ -86,7 +86,7 @@ Status DataGenLocalState::init(RuntimeState* state, LocalStateInfo& info) { // TODO: use runtime filter to filte result block, maybe this node need derive from vscan_node. for (const auto& filter_desc : p._runtime_filter_descs) { - IRuntimeFilter* runtime_filter = nullptr; + std::shared_ptr runtime_filter; RETURN_IF_ERROR(state->register_consumer_runtime_filter( filter_desc, p.ignore_data_distribution(), p.node_id(), &runtime_filter)); runtime_filter->init_profile(_runtime_profile.get()); diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 7887628b7fa476..3a55fdd9b8698e 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -567,7 +567,11 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* } else if (!local_state._should_build_hash_table) { DCHECK(_shared_hashtable_controller != nullptr); DCHECK(_shared_hash_table_context != nullptr); - CHECK(_shared_hash_table_context->signaled); + // the instance which is not build hash table, it's should wait the signal of hash table build finished. 
+ // but if it's running and signaled == false, maybe the source operator have closed caused by some short circuit, + if (!_shared_hash_table_context->signaled) { + return Status::Error("source have closed"); + } if (!_shared_hash_table_context->status.ok()) { return _shared_hash_table_context->status; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 374cf506861431..30943b56ff79ca 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -21,6 +21,9 @@ #include "common/logging.h" #include "pipeline/exec/operator.h" +#include "runtime/descriptors.h" +#include "vec/common/assert_cast.h" +#include "vec/data_types/data_type_nullable.h" namespace doris::pipeline { @@ -615,6 +618,54 @@ Status HashJoinProbeOperatorX::prepare(RuntimeState* state) { _left_table_data_types = vectorized::VectorizedUtils::get_data_types(_child_x->row_desc()); _right_table_column_names = vectorized::VectorizedUtils::get_column_names(_build_side_child->row_desc()); + + std::vector slots_to_check; + for (const auto& tuple_descriptor : _intermediate_row_desc->tuple_descriptors()) { + for (const auto& slot : tuple_descriptor->slots()) { + slots_to_check.emplace_back(slot); + } + } + + if (_is_mark_join) { + const auto* last_one = slots_to_check.back(); + slots_to_check.pop_back(); + auto data_type = last_one->get_data_type_ptr(); + if (!data_type->is_nullable()) { + return Status::InternalError( + "The last column for mark join should be Nullable(UInt8), not {}", + data_type->get_name()); + } + + const auto& null_data_type = assert_cast(*data_type); + if (null_data_type.get_nested_type()->get_type_id() != vectorized::TypeIndex::UInt8) { + return Status::InternalError( + "The last column for mark join should be Nullable(UInt8), not {}", + data_type->get_name()); + } + } + + const int right_col_idx = + (_is_right_semi_anti && !_have_other_join_conjunct) ? 
0 : _left_table_data_types.size(); + size_t idx = 0; + for (const auto* slot : slots_to_check) { + auto data_type = slot->get_data_type_ptr(); + auto target_data_type = idx < right_col_idx ? _left_table_data_types[idx] + : _right_table_data_types[idx - right_col_idx]; + ++idx; + if (data_type->equals(*target_data_type)) { + continue; + } + + auto data_type_non_nullable = vectorized::remove_nullable(data_type); + if (data_type_non_nullable->equals(*target_data_type)) { + continue; + } + + return Status::InternalError("intermediate slot({}) data type not match: '{}' vs '{}'", + slot->id(), data_type->get_name(), + _left_table_data_types[idx]->get_name()); + } + _build_side_child.reset(); return Status::OK(); } diff --git a/be/src/pipeline/exec/join_build_sink_operator.h b/be/src/pipeline/exec/join_build_sink_operator.h index d43a6d1bf9d6ef..714e0c34190678 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.h +++ b/be/src/pipeline/exec/join_build_sink_operator.h @@ -28,7 +28,9 @@ class JoinBuildSinkLocalState : public PipelineXSinkLocalState public: Status init(RuntimeState* state, LocalSinkStateInfo& info) override; - const std::vector& runtime_filters() const { return _runtime_filters; } + const std::vector>& runtime_filters() const { + return _runtime_filters; + } protected: JoinBuildSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) @@ -41,7 +43,7 @@ class JoinBuildSinkLocalState : public PipelineXSinkLocalState RuntimeProfile::Counter* _publish_runtime_filter_timer = nullptr; RuntimeProfile::Counter* _runtime_filter_compute_timer = nullptr; RuntimeProfile::Counter* _runtime_filter_init_timer = nullptr; - std::vector _runtime_filters; + std::vector> _runtime_filters; }; template diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp index 25bc28b5d432f0..1028bca7ce2ca4 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp +++ 
b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp @@ -51,7 +51,7 @@ Status MultiCastDataStreamSourceLocalState::open(RuntimeState* state) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(Base::open(state)); - RETURN_IF_ERROR(_acquire_runtime_filter(true)); + RETURN_IF_ERROR(_acquire_runtime_filter()); auto& p = _parent->cast(); _output_expr_contexts.resize(p._output_expr_contexts.size()); for (size_t i = 0; i < p._output_expr_contexts.size(); i++) { diff --git a/be/src/pipeline/exec/multi_cast_data_streamer.cpp b/be/src/pipeline/exec/multi_cast_data_streamer.cpp index deebf7d11bb2e2..d44cf3974a6275 100644 --- a/be/src/pipeline/exec/multi_cast_data_streamer.cpp +++ b/be/src/pipeline/exec/multi_cast_data_streamer.cpp @@ -23,63 +23,97 @@ namespace doris::pipeline { -MultiCastBlock::MultiCastBlock(vectorized::Block* block, int used_count, size_t mem_size) - : _used_count(used_count), _mem_size(mem_size) { +MultiCastBlock::MultiCastBlock(vectorized::Block* block, int used_count, int un_finish_copy, + size_t mem_size) + : _used_count(used_count), _un_finish_copy(un_finish_copy), _mem_size(mem_size) { _block = vectorized::Block::create_unique(block->get_columns_with_type_and_name()); block->clear(); } Status MultiCastDataStreamer::pull(int sender_idx, doris::vectorized::Block* block, bool* eos) { - std::lock_guard l(_mutex); - auto& pos_to_pull = _sender_pos_to_read[sender_idx]; - if (pos_to_pull != _multi_cast_blocks.end()) { - if (pos_to_pull->_used_count == 1) { - DCHECK(pos_to_pull == _multi_cast_blocks.begin()); - pos_to_pull->_block->swap(*block); - - _cumulative_mem_size -= pos_to_pull->_mem_size; - pos_to_pull++; - _multi_cast_blocks.pop_front(); - } else { - pos_to_pull->_block->create_same_struct_block(0)->swap(*block); - RETURN_IF_ERROR(vectorized::MutableBlock(block).merge(*pos_to_pull->_block)); - pos_to_pull->_used_count--; - pos_to_pull++; + int* un_finish_copy = nullptr; + int use_count = 0; + { + std::lock_guard 
l(_mutex); + auto& pos_to_pull = _sender_pos_to_read[sender_idx]; + const auto end = _multi_cast_blocks.end(); + DCHECK(pos_to_pull != end); + + *block = *pos_to_pull->_block; + + _cumulative_mem_size -= pos_to_pull->_mem_size; + + pos_to_pull->_used_count--; + use_count = pos_to_pull->_used_count; + un_finish_copy = &pos_to_pull->_un_finish_copy; + + pos_to_pull++; + + if (pos_to_pull == end) { + _block_reading(sender_idx); } + + *eos = _eos and pos_to_pull == end; } - *eos = _eos and pos_to_pull == _multi_cast_blocks.end(); - if (pos_to_pull == _multi_cast_blocks.end()) { - _block_reading(sender_idx); + + if (use_count == 0) { + // will clear _multi_cast_blocks + _wait_copy_block(block, *un_finish_copy); + } else { + _copy_block(block, *un_finish_copy); } + return Status::OK(); } +void MultiCastDataStreamer::_copy_block(vectorized::Block* block, int& un_finish_copy) { + const auto rows = block->rows(); + for (int i = 0; i < block->columns(); ++i) { + block->get_by_position(i).column = block->get_by_position(i).column->clone_resized(rows); + } + + std::unique_lock l(_mutex); + un_finish_copy--; + if (un_finish_copy == 0) { + l.unlock(); + _cv.notify_one(); + } +} + +void MultiCastDataStreamer::_wait_copy_block(vectorized::Block* block, int& un_finish_copy) { + std::unique_lock l(_mutex); + _cv.wait(l, [&]() { return un_finish_copy == 0; }); + _multi_cast_blocks.pop_front(); +} + Status MultiCastDataStreamer::push(RuntimeState* state, doris::vectorized::Block* block, bool eos) { auto rows = block->rows(); COUNTER_UPDATE(_process_rows, rows); - auto block_mem_size = block->allocated_bytes(); - std::lock_guard l(_mutex); - int need_process_count = _cast_sender_count - _closed_sender_count; - if (need_process_count == 0) { - return Status::EndOfFile("All data streamer is EOF"); - } - // TODO: if the [queue back block rows + block->rows()] < batch_size, better - // do merge block. 
but need check the need_process_count and used_count whether - // equal - _multi_cast_blocks.emplace_back(block, need_process_count, block_mem_size); + const auto block_mem_size = block->allocated_bytes(); _cumulative_mem_size += block_mem_size; COUNTER_SET(_peak_mem_usage, std::max(_cumulative_mem_size, _peak_mem_usage->value())); - auto end = _multi_cast_blocks.end(); - end--; - for (int i = 0; i < _sender_pos_to_read.size(); ++i) { - if (_sender_pos_to_read[i] == _multi_cast_blocks.end()) { - _sender_pos_to_read[i] = end; - _set_ready_for_read(i); + { + std::lock_guard l(_mutex); + _multi_cast_blocks.emplace_back(block, _cast_sender_count, _cast_sender_count - 1, + block_mem_size); + // last elem + auto end = std::prev(_multi_cast_blocks.end()); + for (int i = 0; i < _sender_pos_to_read.size(); ++i) { + if (_sender_pos_to_read[i] == _multi_cast_blocks.end()) { + _sender_pos_to_read[i] = end; + _set_ready_for_read(i); + } + } + _eos = eos; + } + + if (_eos) { + for (auto* read_dep : _dependencies) { + read_dep->set_always_ready(); } } - _eos = eos; return Status::OK(); } @@ -92,13 +126,6 @@ void MultiCastDataStreamer::_set_ready_for_read(int sender_idx) { dep->set_ready(); } -void MultiCastDataStreamer::_set_ready_for_read() { - for (auto* dep : _dependencies) { - DCHECK(dep); - dep->set_ready(); - } -} - void MultiCastDataStreamer::_block_reading(int sender_idx) { if (_dependencies.empty()) { return; diff --git a/be/src/pipeline/exec/multi_cast_data_streamer.h b/be/src/pipeline/exec/multi_cast_data_streamer.h index 2112ebaaf205b1..07e64016363f65 100644 --- a/be/src/pipeline/exec/multi_cast_data_streamer.h +++ b/be/src/pipeline/exec/multi_cast_data_streamer.h @@ -23,10 +23,11 @@ namespace doris::pipeline { class Dependency; struct MultiCastBlock { - MultiCastBlock(vectorized::Block* block, int used_count, size_t mem_size); + MultiCastBlock(vectorized::Block* block, int used_count, int need_copy, size_t mem_size); std::unique_ptr _block; int _used_count; + int 
_un_finish_copy; size_t _mem_size; }; @@ -58,12 +59,6 @@ class MultiCastDataStreamer { RuntimeProfile* profile() { return _profile; } - void set_eos() { - std::lock_guard l(_mutex); - _eos = true; - _set_ready_for_read(); - } - void set_dep_by_sender_idx(int sender_idx, Dependency* dep) { _dependencies[sender_idx] = dep; _block_reading(sender_idx); @@ -71,17 +66,20 @@ class MultiCastDataStreamer { private: void _set_ready_for_read(int sender_idx); - void _set_ready_for_read(); void _block_reading(int sender_idx); + void _copy_block(vectorized::Block* block, int& un_finish_copy); + + void _wait_copy_block(vectorized::Block* block, int& un_finish_copy); + const RowDescriptor& _row_desc; RuntimeProfile* _profile = nullptr; std::list _multi_cast_blocks; std::vector::iterator> _sender_pos_to_read; + std::condition_variable _cv; std::mutex _mutex; bool _eos = false; int _cast_sender_count = 0; - int _closed_sender_count = 0; int64_t _cumulative_mem_size = 0; RuntimeProfile::Counter* _process_rows = nullptr; diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index bee550f1db5291..95bfd3980417b9 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -129,6 +129,8 @@ Status OlapScanLocalState::_init_profile() { _inverted_index_query_cache_miss_counter = ADD_COUNTER(_segment_profile, "InvertedIndexQueryCacheMiss", TUnit::UNIT); _inverted_index_query_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryTime"); + _inverted_index_query_null_bitmap_timer = + ADD_TIMER(_segment_profile, "InvertedIndexQueryNullBitmapTime"); _inverted_index_query_bitmap_copy_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapCopyTime"); _inverted_index_query_bitmap_op_timer = @@ -137,6 +139,10 @@ Status OlapScanLocalState::_init_profile() { ADD_TIMER(_segment_profile, "InvertedIndexSearcherOpenTime"); _inverted_index_searcher_search_timer = ADD_TIMER(_segment_profile, 
"InvertedIndexSearcherSearchTime"); + _inverted_index_searcher_cache_hit_counter = + ADD_COUNTER(_segment_profile, "InvertedIndexSearcherCacheHit", TUnit::UNIT); + _inverted_index_searcher_cache_miss_counter = + ADD_COUNTER(_segment_profile, "InvertedIndexSearcherCacheMiss", TUnit::UNIT); _output_index_result_column_timer = ADD_TIMER(_segment_profile, "OutputIndexResultColumnTimer"); diff --git a/be/src/pipeline/exec/olap_scan_operator.h b/be/src/pipeline/exec/olap_scan_operator.h index ca98b17118999f..83f838dd0fc47c 100644 --- a/be/src/pipeline/exec/olap_scan_operator.h +++ b/be/src/pipeline/exec/olap_scan_operator.h @@ -174,6 +174,7 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _inverted_index_filter_counter = nullptr; RuntimeProfile::Counter* _inverted_index_filter_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_query_null_bitmap_timer = nullptr; RuntimeProfile::Counter* _inverted_index_query_cache_hit_counter = nullptr; RuntimeProfile::Counter* _inverted_index_query_cache_miss_counter = nullptr; RuntimeProfile::Counter* _inverted_index_query_timer = nullptr; @@ -181,6 +182,8 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _inverted_index_query_bitmap_op_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_open_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_search_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_cache_hit_counter = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_cache_miss_counter = nullptr; RuntimeProfile::Counter* _output_index_result_column_timer = nullptr; diff --git a/be/src/pipeline/exec/operator.cpp b/be/src/pipeline/exec/operator.cpp index ba3602a91cb1ef..0928b32f41d8aa 100644 --- a/be/src/pipeline/exec/operator.cpp +++ b/be/src/pipeline/exec/operator.cpp @@ -75,6 +75,7 @@ #include "pipeline/local_exchange/local_exchange_source_operator.h" #include "util/debug_util.h" #include 
"util/runtime_profile.h" +#include "util/string_util.h" #include "vec/exprs/vexpr.h" #include "vec/exprs/vexpr_context.h" #include "vec/utils/util.hpp" @@ -260,7 +261,7 @@ Status OperatorXBase::do_projections(RuntimeState* state, vectorized::Block* ori vectorized::Block input_block = *origin_block; std::vector result_column_ids; - for (const auto& projections : _intermediate_projections) { + for (const auto& projections : local_state->_intermediate_projections) { result_column_ids.resize(projections.size()); for (int i = 0; i < projections.size(); i++) { RETURN_IF_ERROR(projections[i]->execute(&input_block, &result_column_ids[i])); @@ -334,16 +335,22 @@ Status OperatorXBase::get_block_after_projects(RuntimeState* state, vectorized:: return get_block(state, block, eos); } -bool PipelineXLocalStateBase::reached_limit() const { - return _parent->_limit != -1 && _num_rows_returned >= _parent->_limit; -} - void PipelineXLocalStateBase::reached_limit(vectorized::Block* block, bool* eos) { if (_parent->_limit != -1 and _num_rows_returned + block->rows() >= _parent->_limit) { block->set_num_rows(_parent->_limit - _num_rows_returned); *eos = true; } + DBUG_EXECUTE_IF("Pipeline::reached_limit_early", { + auto op_name = to_lower(_parent->_op_name); + auto arg_op_name = dp->param("op_name"); + arg_op_name = to_lower(arg_op_name); + + if (op_name == arg_op_name) { + *eos = true; + } + }); + if (auto rows = block->rows()) { _num_rows_returned += rows; COUNTER_UPDATE(_blocks_returned_counter, 1); diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h index 8abf7ab31da9eb..2db981ba88e80b 100644 --- a/be/src/pipeline/exec/operator.h +++ b/be/src/pipeline/exec/operator.h @@ -152,7 +152,6 @@ class PipelineXLocalStateBase { // If use projection, we should clear `_origin_block`. 
void clear_origin_block(); - [[nodiscard]] bool reached_limit() const; void reached_limit(vectorized::Block* block, bool* eos); RuntimeProfile* profile() { return _runtime_profile.get(); } diff --git a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp index 4399f3c7045c00..a70718e7763275 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp @@ -210,7 +210,7 @@ Status PartitionedAggSinkLocalState::setup_in_memory_agg_op(RuntimeState* state) _runtime_state->set_be_number(state->be_number()); _runtime_state->set_desc_tbl(&state->desc_tbl()); - _runtime_state->set_pipeline_x_runtime_filter_mgr(state->local_runtime_filter_mgr()); + _runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); _runtime_state->set_task_id(state->task_id()); auto& parent = Base::_parent->template cast(); diff --git a/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp b/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp index a8c4e7b0bcc53e..153676851ac92c 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp @@ -172,7 +172,7 @@ Status PartitionedAggLocalState::setup_in_memory_agg_op(RuntimeState* state) { _runtime_state->set_desc_tbl(&state->desc_tbl()); _runtime_state->resize_op_id_to_local_state(state->max_operator_id()); - _runtime_state->set_pipeline_x_runtime_filter_mgr(state->local_runtime_filter_mgr()); + _runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); auto& parent = Base::_parent->template cast(); diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp index 8118b669ef84bc..6dc1616e0eb689 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp +++ 
b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp @@ -625,8 +625,7 @@ Status PartitionedHashJoinProbeOperatorX::_setup_internal_operators( local_state._runtime_state->set_desc_tbl(&state->desc_tbl()); local_state._runtime_state->resize_op_id_to_local_state(-1); - local_state._runtime_state->set_pipeline_x_runtime_filter_mgr( - state->local_runtime_filter_mgr()); + local_state._runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); local_state._in_mem_shared_state_sptr = _inner_sink_operator->create_shared_state(); diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp index 1aeb9213d83ee7..fc17ef41be62c8 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp @@ -447,7 +447,7 @@ Status PartitionedHashJoinSinkOperatorX::_setup_internal_operator(RuntimeState* local_state._shared_state->inner_runtime_state->set_desc_tbl(&state->desc_tbl()); local_state._shared_state->inner_runtime_state->resize_op_id_to_local_state(-1); - local_state._shared_state->inner_runtime_state->set_pipeline_x_runtime_filter_mgr( + local_state._shared_state->inner_runtime_state->set_runtime_filter_mgr( state->local_runtime_filter_mgr()); local_state._shared_state->inner_shared_state = std::dynamic_pointer_cast( diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index 0495e48b7dc926..73d0bea8f99d26 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -20,6 +20,7 @@ #include #include +#include "common/config.h" #include "common/object_pool.h" #include "exec/rowid_fetcher.h" #include "pipeline/exec/operator.h" @@ -48,9 +49,10 @@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) if (state->query_options().enable_parallel_result_sink) { _sender = 
_parent->cast()._sender; } else { + auto& p = _parent->cast(); RETURN_IF_ERROR(state->exec_env()->result_mgr()->create_sender( - fragment_instance_id, RESULT_SINK_BUFFER_SIZE, &_sender, state->execution_timeout(), - state->batch_size())); + fragment_instance_id, p._result_sink_buffer_size_rows, &_sender, + state->execution_timeout(), state->batch_size())); } _sender->set_dependency(fragment_instance_id, _dependency->shared_from_this()); return Status::OK(); @@ -80,7 +82,12 @@ Status ResultSinkLocalState::open(RuntimeState* state) { case TResultSinkType::ARROW_FLIGHT_PROTOCAL: { std::shared_ptr arrow_schema; RETURN_IF_ERROR(convert_expr_ctxs_arrow_schema(_output_vexpr_ctxs, &arrow_schema)); - state->exec_env()->result_mgr()->register_arrow_schema(state->query_id(), arrow_schema); + if (state->query_options().enable_parallel_result_sink) { + state->exec_env()->result_mgr()->register_arrow_schema(state->query_id(), arrow_schema); + } else { + state->exec_env()->result_mgr()->register_arrow_schema(state->fragment_instance_id(), + arrow_schema); + } _writer.reset(new (std::nothrow) vectorized::VArrowFlightResultWriter( _sender.get(), _output_vexpr_ctxs, _profile, arrow_schema)); break; @@ -102,6 +109,11 @@ ResultSinkOperatorX::ResultSinkOperatorX(int operator_id, const RowDescriptor& r } else { _sink_type = sink.type; } + if (_sink_type == TResultSinkType::ARROW_FLIGHT_PROTOCAL) { + _result_sink_buffer_size_rows = config::arrow_flight_result_sink_buffer_size_rows; + } else { + _result_sink_buffer_size_rows = RESULT_SINK_BUFFER_SIZE; + } _fetch_option = sink.fetch_option; _name = "ResultSink"; } @@ -121,8 +133,8 @@ Status ResultSinkOperatorX::prepare(RuntimeState* state) { if (state->query_options().enable_parallel_result_sink) { RETURN_IF_ERROR(state->exec_env()->result_mgr()->create_sender( - state->query_id(), RESULT_SINK_BUFFER_SIZE, &_sender, state->execution_timeout(), - state->batch_size())); + state->query_id(), _result_sink_buffer_size_rows, &_sender, + 
state->execution_timeout(), state->batch_size())); } return Status::OK(); } diff --git a/be/src/pipeline/exec/result_sink_operator.h b/be/src/pipeline/exec/result_sink_operator.h index 7ec7d43ec2b03a..06b961b2a31694 100644 --- a/be/src/pipeline/exec/result_sink_operator.h +++ b/be/src/pipeline/exec/result_sink_operator.h @@ -152,6 +152,7 @@ class ResultSinkOperatorX final : public DataSinkOperatorX Status _second_phase_fetch_data(RuntimeState* state, vectorized::Block* final_block); TResultSinkType::type _sink_type; + int _result_sink_buffer_size_rows; // set file options when sink type is FILE std::unique_ptr _file_opts = nullptr; diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index a72514fd0b11f4..e7515a30a07487 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -102,7 +102,7 @@ Status ScanLocalState::open(RuntimeState* state) { RETURN_IF_ERROR( p._common_expr_ctxs_push_down[i]->clone(state, _common_expr_ctxs_push_down[i])); } - RETURN_IF_ERROR(_acquire_runtime_filter(true)); + RETURN_IF_ERROR(_acquire_runtime_filter()); _stale_expr_ctxs.resize(p._stale_expr_ctxs.size()); for (size_t i = 0; i < _stale_expr_ctxs.size(); i++) { RETURN_IF_ERROR(p._stale_expr_ctxs[i]->clone(state, _stale_expr_ctxs[i])); @@ -1100,18 +1100,12 @@ Status ScanLocalState::_normalize_in_and_not_in_compound_predicate( auto hybrid_set = expr->get_set_func(); if (hybrid_set != nullptr) { - if (hybrid_set->size() <= - _parent->cast()._max_pushdown_conditions_per_column) { - iter = hybrid_set->begin(); - } else { - _filter_predicates.in_filters.emplace_back(slot->col_name(), expr->get_set_func()); - *pdt = PushDownType::ACCEPTABLE; - return Status::OK(); - } + *pdt = PushDownType::UNACCEPTABLE; + return Status::OK(); } else { - vectorized::VInPredicate* pred = static_cast(expr); + auto* pred = static_cast(expr); - vectorized::InState* state = reinterpret_cast( + auto* state = reinterpret_cast( 
expr_ctx->fn_context(pred->fn_context_index()) ->get_function_state(FunctionContext::FRAGMENT_LOCAL)); @@ -1120,6 +1114,11 @@ Status ScanLocalState::_normalize_in_and_not_in_compound_predicate( } iter = state->hybrid_set->begin(); + + if (state->hybrid_set->contain_null()) { + *pdt = PushDownType::UNACCEPTABLE; + return Status::OK(); + } } while (iter->has_next()) { @@ -1224,9 +1223,8 @@ Status ScanLocalState::_start_scanners( state(), this, p._output_tuple_desc, p.output_row_descriptor(), scanners, p.limit(), state()->scan_queue_mem_limit(), _scan_dependency, // 1. If data distribution is ignored , we use 1 instance to scan. - // 2. Else if this operator is not file scan operator, we use config::doris_scanner_thread_pool_thread_num scanners to scan. - // 3. Else, file scanner will consume much memory so we use config::doris_scanner_thread_pool_thread_num / query_parallel_instance_num scanners to scan. - p.ignore_data_distribution() || !p.is_file_scan_operator() + // 2. Else, file scanner will consume much memory so we use config::doris_scanner_thread_pool_thread_num / query_parallel_instance_num scanners to scan. + p.ignore_data_distribution() && !p.is_file_scan_operator() ? 
1 : state()->query_parallel_instance_num()); return Status::OK(); diff --git a/be/src/pipeline/exec/schema_scan_operator.cpp b/be/src/pipeline/exec/schema_scan_operator.cpp index f46589880958ee..8ff0a570e3393e 100644 --- a/be/src/pipeline/exec/schema_scan_operator.cpp +++ b/be/src/pipeline/exec/schema_scan_operator.cpp @@ -48,6 +48,7 @@ Status SchemaScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { // new one scanner _schema_scanner = SchemaScanner::create(schema_table->schema_table_type()); + _schema_scanner->set_dependency(_data_dependency, _finish_dependency); if (nullptr == _schema_scanner) { return Status::InternalError("schema scanner get nullptr pointer."); } @@ -59,7 +60,7 @@ Status SchemaScanLocalState::open(RuntimeState* state) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(PipelineXLocalState<>::open(state)); - return _schema_scanner->start(state); + return _schema_scanner->get_next_block_async(state); } SchemaScanOperatorX::SchemaScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, @@ -119,6 +120,17 @@ Status SchemaScanOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { _common_scanner_param->catalog = state->obj_pool()->add(new std::string(tnode.schema_scan_node.catalog)); } + + if (tnode.schema_scan_node.__isset.fe_addr_list) { + for (const auto& fe_addr : tnode.schema_scan_node.fe_addr_list) { + _common_scanner_param->fe_addr_list.insert(fe_addr); + } + } else if (tnode.schema_scan_node.__isset.ip && tnode.schema_scan_node.__isset.port) { + TNetworkAddress fe_addr; + fe_addr.hostname = tnode.schema_scan_node.ip; + fe_addr.port = tnode.schema_scan_node.port; + _common_scanner_param->fe_addr_list.insert(fe_addr); + } return Status::OK(); } @@ -226,8 +238,12 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, vectorized::Block* bl while (true) { RETURN_IF_CANCELLED(state); + if (local_state._data_dependency->is_blocked_by() != nullptr) { + break; + } // get all 
slots from schema table. - RETURN_IF_ERROR(local_state._schema_scanner->get_next_block(&src_block, &schema_eos)); + RETURN_IF_ERROR( + local_state._schema_scanner->get_next_block(state, &src_block, &schema_eos)); if (schema_eos) { *eos = true; diff --git a/be/src/pipeline/exec/schema_scan_operator.h b/be/src/pipeline/exec/schema_scan_operator.h index 8f2b73f5123f0d..aa2bff7e6440a2 100644 --- a/be/src/pipeline/exec/schema_scan_operator.h +++ b/be/src/pipeline/exec/schema_scan_operator.h @@ -35,18 +35,30 @@ class SchemaScanLocalState final : public PipelineXLocalState<> { ENABLE_FACTORY_CREATOR(SchemaScanLocalState); SchemaScanLocalState(RuntimeState* state, OperatorXBase* parent) - : PipelineXLocalState<>(state, parent) {} + : PipelineXLocalState<>(state, parent) { + _finish_dependency = + std::make_shared(parent->operator_id(), parent->node_id(), + parent->get_name() + "_FINISH_DEPENDENCY", true); + _data_dependency = std::make_shared(parent->operator_id(), parent->node_id(), + parent->get_name() + "_DEPENDENCY", true); + } ~SchemaScanLocalState() override = default; Status init(RuntimeState* state, LocalStateInfo& info) override; Status open(RuntimeState* state) override; + Dependency* finishdependency() override { return _finish_dependency.get(); } + std::vector dependencies() const override { return {_data_dependency.get()}; } + private: friend class SchemaScanOperatorX; SchemaScannerParam _scanner_param; std::unique_ptr _schema_scanner; + + std::shared_ptr _finish_dependency; + std::shared_ptr _data_dependency; }; class SchemaScanOperatorX final : public OperatorX { diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 5fc38f3ca706ac..6c76f9a57a3ee2 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -130,9 +130,13 @@ Status SetSinkOperatorX::_extract_build_column( block.get_by_position(result_col_id).column = 
block.get_by_position(result_col_id).column->convert_to_full_column_if_const(); } + // Do make nullable should not change the origin column and type in origin block + // which may cause coredump problem if (local_state._shared_state->build_not_ignore_null[i]) { - block.get_by_position(result_col_id).column = - make_nullable(block.get_by_position(result_col_id).column); + auto column_ptr = make_nullable(block.get_by_position(result_col_id).column, false); + block.insert( + {column_ptr, make_nullable(block.get_by_position(result_col_id).type), ""}); + result_col_id = block.columns() - 1; } raw_ptrs[i] = block.get_by_position(result_col_id).column.get(); diff --git a/be/src/pipeline/exec/spill_sort_sink_operator.cpp b/be/src/pipeline/exec/spill_sort_sink_operator.cpp index 4c6eb290ef11f3..94196a0354e5cf 100644 --- a/be/src/pipeline/exec/spill_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/spill_sort_sink_operator.cpp @@ -84,7 +84,7 @@ Status SpillSortSinkLocalState::setup_in_memory_sort_op(RuntimeState* state) { _runtime_state->set_be_number(state->be_number()); _runtime_state->set_desc_tbl(&state->desc_tbl()); - _runtime_state->set_pipeline_x_runtime_filter_mgr(state->local_runtime_filter_mgr()); + _runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); auto& parent = Base::_parent->template cast(); Base::_shared_state->in_mem_shared_state_sptr = diff --git a/be/src/pipeline/exec/spill_sort_source_operator.cpp b/be/src/pipeline/exec/spill_sort_source_operator.cpp index 72304291f6dcfe..967e13d1fa527b 100644 --- a/be/src/pipeline/exec/spill_sort_source_operator.cpp +++ b/be/src/pipeline/exec/spill_sort_source_operator.cpp @@ -219,7 +219,7 @@ Status SpillSortLocalState::setup_in_memory_sort_op(RuntimeState* state) { _runtime_state->set_desc_tbl(&state->desc_tbl()); _runtime_state->resize_op_id_to_local_state(state->max_operator_id()); - _runtime_state->set_pipeline_x_runtime_filter_mgr(state->local_runtime_filter_mgr()); + 
_runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); DCHECK(_shared_state->in_mem_shared_state); LocalStateInfo state_info { diff --git a/be/src/pipeline/exec/spill_utils.h b/be/src/pipeline/exec/spill_utils.h index f2f19512cbd2fc..635a6a6bbbcf8a 100644 --- a/be/src/pipeline/exec/spill_utils.h +++ b/be/src/pipeline/exec/spill_utils.h @@ -33,8 +33,6 @@ class SpillRunnable : public Runnable { SpillRunnable(RuntimeState* state, const std::shared_ptr& shared_state, std::function func) : _state(state), - _mem_tracker(state->get_query_ctx()->query_mem_tracker), - _task_id(state->query_id()), _task_context_holder(state->get_task_execution_context()), _shared_state_holder(shared_state), _func(std::move(func)) {} @@ -42,7 +40,7 @@ class SpillRunnable : public Runnable { ~SpillRunnable() override = default; void run() override { - SCOPED_ATTACH_TASK_WITH_ID(_mem_tracker, _task_id); + SCOPED_ATTACH_TASK(_state); Defer defer([&] { std::function tmp; std::swap(tmp, _func); @@ -66,8 +64,6 @@ class SpillRunnable : public Runnable { private: RuntimeState* _state; - std::shared_ptr _mem_tracker; - TUniqueId _task_id; std::weak_ptr _task_context_holder; std::weak_ptr _shared_state_holder; std::function _func; diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h index 0ff1df260012b7..faa48d209f4b1e 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h @@ -56,6 +56,8 @@ class LocalExchangeSinkLocalState final : public PipelineXSinkLocalState + friend class Exchanger; ExchangerBase* _exchanger = nullptr; diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp index 56f0a157cdee8d..6b0cca2d71a969 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp +++ 
b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp @@ -75,11 +75,13 @@ std::string LocalExchangeSourceLocalState::debug_string(int indentation_level) c fmt::memory_buffer debug_string_buffer; fmt::format_to(debug_string_buffer, "{}, _channel_id: {}, _num_partitions: {}, _num_senders: {}, _num_sources: {}, " - "_running_sink_operators: {}, _running_source_operators: {}, mem_usage: {}", + "_running_sink_operators: {}, _running_source_operators: {}, mem_usage: {}, " + "data queue info: {}", Base::debug_string(indentation_level), _channel_id, _exchanger->_num_partitions, _exchanger->_num_senders, _exchanger->_num_sources, _exchanger->_running_sink_operators, _exchanger->_running_source_operators, - _shared_state->mem_usage.load()); + _shared_state->mem_usage.load(), + _exchanger->data_queue_debug_string(_channel_id)); size_t i = 0; fmt::format_to(debug_string_buffer, ", MemTrackers: "); for (auto* mem_tracker : _shared_state->mem_trackers) { diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.h b/be/src/pipeline/local_exchange/local_exchange_source_operator.h index f9fa4cfa4edfe3..d2f68d4ebaca31 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.h @@ -51,6 +51,8 @@ class LocalExchangeSourceLocalState final : public PipelineXLocalState + friend class Exchanger; ExchangerBase* _exchanger = nullptr; int _channel_id; diff --git a/be/src/pipeline/local_exchange/local_exchanger.cpp b/be/src/pipeline/local_exchange/local_exchanger.cpp index 27b7fc7e7fd3f7..647ddcfba2d87e 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.cpp +++ b/be/src/pipeline/local_exchange/local_exchanger.cpp @@ -26,6 +26,37 @@ namespace doris::pipeline { +template +bool Exchanger::_enqueue_data_and_set_ready(int channel_id, + LocalExchangeSinkLocalState& local_state, + BlockType&& block) { + std::unique_lock l(_m); + if 
(_data_queue[channel_id].enqueue(std::move(block))) { + local_state._shared_state->set_ready_to_read(channel_id); + return true; + } + return false; +} + +template +bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_state, + BlockType& block, bool* eos) { + bool all_finished = _running_sink_operators == 0; + if (_data_queue[local_state._channel_id].try_dequeue(block)) { + return true; + } else if (all_finished) { + *eos = true; + } else { + std::unique_lock l(_m); + if (_data_queue[local_state._channel_id].try_dequeue(block)) { + return true; + } + COUNTER_UPDATE(local_state._get_block_failed_counter, 1); + local_state._dependency->block(); + } + return false; +} + Status ShuffleExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, LocalExchangeSinkLocalState& local_state) { { @@ -74,17 +105,11 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block return Status::OK(); }; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(partitioned_block)) { + if (_dequeue_data(local_state, partitioned_block, eos)) { SCOPED_TIMER(local_state._copy_data_timer); mutable_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->data_block); RETURN_IF_ERROR(get_data(block)); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } @@ -92,7 +117,6 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, bool eos, LocalExchangeSinkLocalState& local_state) { - auto& data_queue = _data_queue; const auto rows = block->rows(); auto row_idx = std::make_shared>(rows); { @@ -135,9 +159,9 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* 
__rest if (size > 0) { local_state._shared_state->add_mem_usage( it.second, new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[it.second].enqueue({new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(it.second); - } else { + + if (!_enqueue_data_and_set_ready(it.second, local_state, + {new_block_wrapper, {row_idx, start, size}})) { local_state._shared_state->sub_mem_usage( it.second, new_block_wrapper->data_block.allocated_bytes(), false); new_block_wrapper->unref(local_state._shared_state); @@ -154,10 +178,8 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest if (size > 0) { local_state._shared_state->add_mem_usage( i % _num_sources, new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[i % _num_sources].enqueue( - {new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(i % _num_sources); - } else { + if (!_enqueue_data_and_set_ready(i % _num_sources, local_state, + {new_block_wrapper, {row_idx, start, size}})) { local_state._shared_state->sub_mem_usage( i % _num_sources, new_block_wrapper->data_block.allocated_bytes(), false); @@ -177,9 +199,8 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest if (size > 0) { local_state._shared_state->add_mem_usage( map[i], new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[map[i]].enqueue({new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(map[i]); - } else { + if (!_enqueue_data_and_set_ready(map[i], local_state, + {new_block_wrapper, {row_idx, start, size}})) { local_state._shared_state->sub_mem_usage( map[i], new_block_wrapper->data_block.allocated_bytes(), false); new_block_wrapper->unref(local_state._shared_state); @@ -203,9 +224,7 @@ Status PassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_blo auto channel_id = (local_state._channel_id++) % 
_num_partitions; size_t memory_usage = new_block.allocated_bytes(); local_state._shared_state->add_mem_usage(channel_id, memory_usage); - if (_data_queue[channel_id].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(channel_id); - } else { + if (!_enqueue_data_and_set_ready(channel_id, local_state, std::move(new_block))) { local_state._shared_state->sub_mem_usage(channel_id, memory_usage); } @@ -224,19 +243,13 @@ void PassthroughExchanger::close(LocalExchangeSourceLocalState& local_state) { Status PassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, LocalExchangeSourceLocalState& local_state) { vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { + if (_dequeue_data(local_state, next_block, eos)) { block->swap(next_block); local_state._shared_state->sub_mem_usage(local_state._channel_id, block->allocated_bytes()); if (_free_block_limit == 0 || _free_blocks.size_approx() < _free_block_limit * _num_sources) { _free_blocks.enqueue(std::move(next_block)); } - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } @@ -245,9 +258,7 @@ Status PassToOneExchanger::sink(RuntimeState* state, vectorized::Block* in_block LocalExchangeSinkLocalState& local_state) { vectorized::Block new_block(in_block->clone_empty()); new_block.swap(*in_block); - if (_data_queue[0].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(0); - } + _enqueue_data_and_set_ready(0, local_state, std::move(new_block)); return Status::OK(); } @@ -259,14 +270,8 @@ Status PassToOneExchanger::get_block(RuntimeState* state, vectorized::Block* blo return Status::OK(); } vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[0].try_dequeue(next_block)) { + if 
(_dequeue_data(local_state, next_block, eos)) { *block = std::move(next_block); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } @@ -283,9 +288,7 @@ Status LocalMergeSortExchanger::sink(RuntimeState* state, vectorized::Block* in_ size_t memory_usage = new_block.allocated_bytes(); add_mem_usage(local_state, memory_usage); - if (_data_queue[local_state._channel_id].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(0); - } else { + if (!_enqueue_data_and_set_ready(local_state._channel_id, local_state, std::move(new_block))) { sub_mem_usage(local_state, memory_usage); } if (eos) { @@ -402,9 +405,7 @@ Status BroadcastExchanger::sink(RuntimeState* state, vectorized::Block* in_block for (size_t i = 0; i < _num_partitions; i++) { auto mutable_block = vectorized::MutableBlock::create_unique(in_block->clone_empty()); RETURN_IF_ERROR(mutable_block->add_rows(in_block, 0, in_block->rows())); - if (_data_queue[i].enqueue(mutable_block->to_block())) { - local_state._shared_state->set_ready_to_read(i); - } + _enqueue_data_and_set_ready(i, local_state, mutable_block->to_block()); } return Status::OK(); @@ -421,14 +422,8 @@ void BroadcastExchanger::close(LocalExchangeSourceLocalState& local_state) { Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, LocalExchangeSourceLocalState& local_state) { vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { + if (_dequeue_data(local_state, next_block, eos)) { *block = std::move(next_block); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } @@ -444,9 +439,8 @@ Status 
AdaptivePassthroughExchanger::_passthrough_sink(RuntimeState* state, auto channel_id = (local_state._channel_id++) % _num_partitions; size_t memory_usage = new_block.allocated_bytes(); local_state._shared_state->add_mem_usage(channel_id, memory_usage); - if (_data_queue[channel_id].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(channel_id); - } else { + + if (!_enqueue_data_and_set_ready(channel_id, local_state, std::move(new_block))) { local_state._shared_state->sub_mem_usage(channel_id, memory_usage); } @@ -477,7 +471,6 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, bool eos, LocalExchangeSinkLocalState& local_state) { - auto& data_queue = _data_queue; const auto rows = block->rows(); auto row_idx = std::make_shared>(rows); { @@ -506,9 +499,7 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, size_t memory_usage = new_block.allocated_bytes(); local_state._shared_state->add_mem_usage(i, memory_usage); - if (data_queue[i].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(i); - } else { + if (!_enqueue_data_and_set_ready(i, local_state, std::move(new_block))) { local_state._shared_state->sub_mem_usage(i, memory_usage); } } @@ -532,19 +523,13 @@ Status AdaptivePassthroughExchanger::get_block(RuntimeState* state, vectorized:: bool* eos, LocalExchangeSourceLocalState& local_state) { vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { + if (_dequeue_data(local_state, next_block, eos)) { block->swap(next_block); if (_free_block_limit == 0 || _free_blocks.size_approx() < _free_block_limit * _num_sources) { _free_blocks.enqueue(std::move(next_block)); } local_state._shared_state->sub_mem_usage(local_state._channel_id, block->allocated_bytes()); - } else if (all_finished) { - *eos = true; - } else { - 
COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } diff --git a/be/src/pipeline/local_exchange/local_exchanger.h b/be/src/pipeline/local_exchange/local_exchanger.h index 2c4f8f5b78509e..afdebd21101f9a 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.h +++ b/be/src/pipeline/local_exchange/local_exchanger.h @@ -55,6 +55,8 @@ class ExchangerBase { virtual std::vector local_sink_state_dependency(int channel_id) { return {}; } virtual std::vector local_state_dependency(int channel_id) { return {}; } + virtual std::string data_queue_debug_string(int i) = 0; + protected: friend struct LocalExchangeSharedState; friend struct ShuffleBlockWrapper; @@ -115,9 +117,19 @@ class Exchanger : public ExchangerBase { : ExchangerBase(running_sink_operators, num_sources, num_partitions, free_block_limit) { } ~Exchanger() override = default; + std::string data_queue_debug_string(int i) override { + return fmt::format("Data Queue {}: [size approx = {}, eos = {}]", i, + _data_queue[i].data_queue.size_approx(), _data_queue[i].eos); + } protected: + bool _enqueue_data_and_set_ready(int channel_id, LocalExchangeSinkLocalState& local_state, + BlockType&& block); + bool _dequeue_data(LocalExchangeSourceLocalState& local_state, BlockType& block, bool* eos); std::vector> _data_queue; + +private: + std::mutex _m; }; class LocalExchangeSourceLocalState; diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index 8138c7594b8cc5..c4a2073c911097 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -124,12 +124,11 @@ PipelineFragmentContext::PipelineFragmentContext( _is_report_on_cancel(true), _report_status_cb(report_status_cb) { _fragment_watcher.start(); - _query_thread_context = {query_id, _query_ctx->query_mem_tracker}; } PipelineFragmentContext::~PipelineFragmentContext() { // The memory released by the 
query end is recorded in the query mem tracker. - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_query_thread_context.query_mem_tracker); + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_query_ctx->query_mem_tracker); auto st = _query_ctx->exec_status(); _query_ctx.reset(); for (size_t i = 0; i < _tasks.size(); i++) { @@ -393,7 +392,7 @@ Status PipelineFragmentContext::_build_pipeline_tasks( runtime_state->set_total_load_streams(request.total_load_streams); runtime_state->set_num_local_sink(request.num_local_sink); DCHECK(runtime_filter_mgr); - runtime_state->set_pipeline_x_runtime_filter_mgr(runtime_filter_mgr.get()); + runtime_state->set_runtime_filter_mgr(runtime_filter_mgr.get()); }; auto filterparams = std::make_unique(); @@ -884,8 +883,12 @@ Status PipelineFragmentContext::_plan_local_exchange( } } + // if 'num_buckets == 0' means the fragment is colocated by exchange node not the + // scan node. so here use `_num_instance` to replace the `num_buckets` to prevent dividing 0 + // still keep colocate plan after local shuffle RETURN_IF_ERROR(_plan_local_exchange( - _pipelines[pip_idx]->operator_xs().front()->ignore_data_hash_distribution() + _pipelines[pip_idx]->operator_xs().front()->ignore_data_hash_distribution() || + num_buckets == 0 ? _num_instances : num_buckets, pip_idx, _pipelines[pip_idx], bucket_seq_to_instance_idx, @@ -1178,11 +1181,19 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo return Status::InternalError("Illegal aggregate node " + std::to_string(tnode.node_id) + ": group by and output is empty"); } - if (tnode.agg_node.aggregate_functions.empty() && !_runtime_state->enable_agg_spill() && + + const bool group_by_limit_opt = + tnode.agg_node.__isset.agg_sort_info_by_group_key && tnode.limit > 0; + + /// PartitionedAggSourceOperatorX does not support "group by limit opt(#29641)" yet. + /// If `group_by_limit_opt` is true, then it might not need to spill at all. 
+ const bool enable_spill = _runtime_state->enable_agg_spill() && + !tnode.agg_node.grouping_exprs.empty() && !group_by_limit_opt; + + if (tnode.agg_node.aggregate_functions.empty() && !enable_spill && request.query_options.__isset.enable_distinct_streaming_aggregation && request.query_options.enable_distinct_streaming_aggregation && - !tnode.agg_node.grouping_exprs.empty() && - !tnode.agg_node.__isset.agg_sort_info_by_group_key) { + !tnode.agg_node.grouping_exprs.empty() && !group_by_limit_opt) { op.reset(new DistinctStreamingAggOperatorX(pool, next_operator_id(), tnode, descs, _require_bucket_distribution)); _require_bucket_distribution = @@ -1194,7 +1205,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo op.reset(new StreamingAggOperatorX(pool, next_operator_id(), tnode, descs)); RETURN_IF_ERROR(cur_pipe->add_operator(op)); } else { - if (_runtime_state->enable_agg_spill() && !tnode.agg_node.grouping_exprs.empty()) { + if (enable_spill) { op.reset(new PartitionedAggSourceOperatorX(pool, tnode, next_operator_id(), descs)); } else { op.reset(new AggSourceOperatorX(pool, tnode, next_operator_id(), descs)); @@ -1209,7 +1220,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _dag[downstream_pipeline_id].push_back(cur_pipe->id()); DataSinkOperatorXPtr sink; - if (_runtime_state->enable_agg_spill() && !tnode.agg_node.grouping_exprs.empty()) { + if (enable_spill) { sink.reset(new PartitionedAggSinkOperatorX(pool, next_sink_operator_id(), tnode, descs, _require_bucket_distribution)); } else { diff --git a/be/src/pipeline/pipeline_fragment_context.h b/be/src/pipeline/pipeline_fragment_context.h index 3b6c73dbef4413..7597c3ce9b55d0 100644 --- a/be/src/pipeline/pipeline_fragment_context.h +++ b/be/src/pipeline/pipeline_fragment_context.h @@ -214,8 +214,6 @@ class PipelineFragmentContext : public TaskExecutionContext { std::shared_ptr _query_ctx; - QueryThreadContext _query_thread_context; - 
MonotonicStopWatch _fragment_watcher; RuntimeProfile::Counter* _prepare_timer = nullptr; RuntimeProfile::Counter* _init_context_timer = nullptr; diff --git a/be/src/pipeline/pipeline_task.cpp b/be/src/pipeline/pipeline_task.cpp index 80b23d9401138c..da581629e21940 100644 --- a/be/src/pipeline/pipeline_task.cpp +++ b/be/src/pipeline/pipeline_task.cpp @@ -120,6 +120,9 @@ Status PipelineTask::prepare(const TPipelineInstanceParams& local_params, const std::unique_lock lc(_dependency_lock); filter_dependencies.swap(_filter_dependencies); } + if (query_context()->is_cancelled()) { + clear_blocking_state(); + } return Status::OK(); } @@ -150,8 +153,6 @@ Status PipelineTask::_extract_dependencies() { { auto* local_state = _state->get_sink_local_state(); write_dependencies = local_state->dependencies(); - DCHECK(std::all_of(write_dependencies.begin(), write_dependencies.end(), - [](auto* dep) { return dep->is_write_dependency(); })); auto* fin_dep = local_state->finishdependency(); if (fin_dep) { finish_dependencies.push_back(fin_dep); @@ -404,10 +405,9 @@ bool PipelineTask::should_revoke_memory(RuntimeState* state, int64_t revocable_m } return false; } else if (is_wg_mem_low_water_mark) { - int64_t query_weighted_limit = 0; - int64_t query_weighted_consumption = 0; - query_ctx->get_weighted_mem_info(query_weighted_limit, query_weighted_consumption); - if (query_weighted_limit == 0 || query_weighted_consumption < query_weighted_limit) { + int64_t spill_threshold = query_ctx->spill_threshold(); + int64_t memory_usage = query_ctx->query_mem_tracker->consumption(); + if (spill_threshold == 0 || memory_usage < spill_threshold) { return false; } auto big_memory_operator_num = query_ctx->get_running_big_mem_op_num(); @@ -416,7 +416,7 @@ bool PipelineTask::should_revoke_memory(RuntimeState* state, int64_t revocable_m if (0 == big_memory_operator_num) { return false; } else { - mem_limit_of_op = query_weighted_limit / big_memory_operator_num; + mem_limit_of_op = spill_threshold / 
big_memory_operator_num; } LOG_EVERY_T(INFO, 1) << "query " << print_id(state->query_id()) @@ -425,10 +425,8 @@ bool PipelineTask::should_revoke_memory(RuntimeState* state, int64_t revocable_m << ", mem_limit_of_op: " << PrettyPrinter::print_bytes(mem_limit_of_op) << ", min_revocable_mem_bytes: " << PrettyPrinter::print_bytes(min_revocable_mem_bytes) - << ", query_weighted_consumption: " - << PrettyPrinter::print_bytes(query_weighted_consumption) - << ", query_weighted_limit: " - << PrettyPrinter::print_bytes(query_weighted_limit) + << ", memory_usage: " << PrettyPrinter::print_bytes(memory_usage) + << ", spill_threshold: " << PrettyPrinter::print_bytes(spill_threshold) << ", big_memory_operator_num: " << big_memory_operator_num; return (revocable_mem_bytes > mem_limit_of_op || revocable_mem_bytes > min_revocable_mem_bytes); diff --git a/be/src/pipeline/pipeline_task.h b/be/src/pipeline/pipeline_task.h index 63f464c03ad36c..8fb4b4eb7992f5 100644 --- a/be/src/pipeline/pipeline_task.h +++ b/be/src/pipeline/pipeline_task.h @@ -136,6 +136,7 @@ class PipelineTask { bool is_finalized() const { return _finalized; } void clear_blocking_state() { + _state->get_query_ctx()->get_execution_dependency()->set_always_ready(); // We use a lock to assure all dependencies are not deconstructed here. 
std::unique_lock lc(_dependency_lock); if (!_finalized) { diff --git a/be/src/runtime/buffer_control_block.cpp b/be/src/runtime/buffer_control_block.cpp index 845afb9a84b85c..6f8022a00342a3 100644 --- a/be/src/runtime/buffer_control_block.cpp +++ b/be/src/runtime/buffer_control_block.cpp @@ -24,6 +24,7 @@ #include // IWYU pragma: no_include #include // IWYU pragma: keep +#include #include #include #include @@ -80,6 +81,13 @@ void GetResultBatchCtx::on_data(const std::unique_ptr& t_resul result->set_packet_seq(packet_seq); result->set_eos(eos); } + + /// The size limit of proto buffer message is 2G + if (result->ByteSizeLong() > std::numeric_limits::max()) { + st = Status::InternalError("Message size exceeds 2GB: {}", result->ByteSizeLong()); + result->clear_row_batch(); + result->set_empty_batch(true); + } st.to_protobuf(result->mutable_status()); { done->Run(); } delete this; diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index 3434d01a59e80c..65cf70bf568044 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -36,6 +36,13 @@ #include "runtime/frontend_info.h" // TODO(zhiqiang): find a way to remove this include header #include "util/threadpool.h" +namespace orc { +class MemoryPool; +} +namespace arrow { +class MemoryPool; +} + namespace doris { namespace vectorized { class VDataStreamMgr; @@ -197,6 +204,7 @@ class ExecEnv { ThreadPool* join_node_thread_pool() { return _join_node_thread_pool.get(); } ThreadPool* lazy_release_obj_pool() { return _lazy_release_obj_pool.get(); } ThreadPool* non_block_close_thread_pool(); + ThreadPool* s3_file_system_thread_pool() { return _s3_file_system_thread_pool.get(); } Status init_pipeline_task_scheduler(); void init_file_cache_factory(); @@ -305,6 +313,9 @@ class ExecEnv { segment_v2::TmpFileDirs* get_tmp_file_dirs() { return _tmp_file_dirs.get(); } io::FDCache* file_cache_open_fd_cache() const { return _file_cache_open_fd_cache.get(); } + orc::MemoryPool* orc_memory_pool() { return 
_orc_memory_pool; } + arrow::MemoryPool* arrow_memory_pool() { return _arrow_memory_pool; } + private: ExecEnv(); @@ -371,6 +382,7 @@ class ExecEnv { // Pool to use a new thread to release object std::unique_ptr _lazy_release_obj_pool; std::unique_ptr _non_block_close_thread_pool; + std::unique_ptr _s3_file_system_thread_pool; FragmentMgr* _fragment_mgr = nullptr; pipeline::TaskScheduler* _without_group_task_scheduler = nullptr; @@ -435,6 +447,9 @@ class ExecEnv { std::unique_ptr _pipeline_tracer_ctx; std::unique_ptr _tmp_file_dirs; doris::vectorized::SpillStreamManager* _spill_stream_mgr = nullptr; + + orc::MemoryPool* _orc_memory_pool = nullptr; + arrow::MemoryPool* _arrow_memory_pool = nullptr; }; template <> diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 32fbc4e0af415c..7ee8ee47f5ca4c 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -100,6 +100,8 @@ #include "util/threadpool.h" #include "util/thrift_rpc_helper.h" #include "util/timezone_utils.h" +#include "vec/exec/format/orc/orc_memory_pool.h" +#include "vec/exec/format/parquet/arrow_memory_pool.h" #include "vec/exec/scan/scanner_scheduler.h" #include "vec/runtime/vdata_stream_mgr.h" #include "vec/sink/delta_writer_v2_pool.h" @@ -263,6 +265,10 @@ Status ExecEnv::_init(const std::vector& store_paths, .set_min_threads(config::min_nonblock_close_thread_num) .set_max_threads(config::max_nonblock_close_thread_num) .build(&_non_block_close_thread_pool)); + static_cast(ThreadPoolBuilder("S3FileSystemThreadPool") + .set_min_threads(config::min_s3_file_system_thread_num) + .set_max_threads(config::max_s3_file_system_thread_num) + .build(&_s3_file_system_thread_pool)); // NOTE: runtime query statistics mgr could be visited by query and daemon thread // so it should be created before all query begin and deleted after all query and daemon thread stoppped @@ -339,7 +345,8 @@ Status ExecEnv::_init(const std::vector& store_paths, 
options.broken_paths = broken_paths; options.backend_uid = doris::UniqueId::gen_uid(); if (config::is_cloud_mode()) { - std::cout << "start BE in cloud mode" << std::endl; + std::cout << "start BE in cloud mode, cloud_unique_id: " << config::cloud_unique_id + << ", meta_service_endpoint: " << config::meta_service_endpoint << std::endl; _storage_engine = std::make_unique(options.backend_uid); } else { std::cout << "start BE in local mode" << std::endl; @@ -573,6 +580,10 @@ Status ExecEnv::_init_mem_env() { << PrettyPrinter::print(inverted_index_cache_limit, TUnit::BYTES) << ", origin config value: " << config::inverted_index_query_cache_limit; + // init orc memory pool + _orc_memory_pool = new doris::vectorized::ORCMemoryPool(); + _arrow_memory_pool = new doris::vectorized::ArrowMemoryPool(); + return Status::OK(); } @@ -669,6 +680,7 @@ void ExecEnv::destroy() { SAFE_SHUTDOWN(_join_node_thread_pool); SAFE_SHUTDOWN(_lazy_release_obj_pool); SAFE_SHUTDOWN(_non_block_close_thread_pool); + SAFE_SHUTDOWN(_s3_file_system_thread_pool); SAFE_SHUTDOWN(_send_report_thread_pool); SAFE_SHUTDOWN(_send_batch_thread_pool); @@ -714,6 +726,7 @@ void ExecEnv::destroy() { _join_node_thread_pool.reset(nullptr); _lazy_release_obj_pool.reset(nullptr); _non_block_close_thread_pool.reset(nullptr); + _s3_file_system_thread_pool.reset(nullptr); _send_report_thread_pool.reset(nullptr); _send_table_stats_thread_pool.reset(nullptr); _buffered_reader_prefetch_thread_pool.reset(nullptr); @@ -751,6 +764,9 @@ void ExecEnv::destroy() { // We should free task scheduler finally because task queue / scheduler maybe used by pipelineX. 
SAFE_DELETE(_without_group_task_scheduler); + SAFE_DELETE(_arrow_memory_pool); + SAFE_DELETE(_orc_memory_pool); + // dns cache is a global instance and need to be released at last SAFE_DELETE(_dns_cache); diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 5389bf2b7ec862..16305a8e91519e 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -710,7 +710,7 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, std::shared_ptr query_ctx; RETURN_IF_ERROR(_get_query_ctx(params, params.query_id, true, query_ctx)); - SCOPED_ATTACH_TASK_WITH_ID(query_ctx->query_mem_tracker, params.query_id); + SCOPED_ATTACH_TASK(query_ctx.get()); int64_t duration_ns = 0; std::shared_ptr context = std::make_shared( @@ -1054,7 +1054,6 @@ Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, QueryThreadContext query_thread_context; RuntimeFilterMgr* runtime_filter_mgr = nullptr; - ObjectPool* pool = nullptr; const auto& fragment_instance_ids = request->fragment_instance_ids(); { @@ -1071,9 +1070,9 @@ Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, DCHECK(pip_context != nullptr); runtime_filter_mgr = pip_context->get_query_ctx()->runtime_filter_mgr(); - pool = &pip_context->get_query_ctx()->obj_pool; query_thread_context = {pip_context->get_query_ctx()->query_id(), - pip_context->get_query_ctx()->query_mem_tracker}; + pip_context->get_query_ctx()->query_mem_tracker, + pip_context->get_query_ctx()->workload_group()}; } else { return Status::InternalError("Non-pipeline is disabled!"); } @@ -1088,13 +1087,13 @@ Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, SCOPED_ATTACH_TASK(query_thread_context); // 1. get the target filters - std::vector filters; + std::vector> filters; RETURN_IF_ERROR(runtime_filter_mgr->get_consume_filters(request->filter_id(), filters)); // 2. 
create the filter wrapper to replace or ignore the target filters if (!filters.empty()) { - UpdateRuntimeFilterParamsV2 params {request, attach_data, pool, filters[0]->column_type()}; - RuntimePredicateWrapper* filter_wrapper = nullptr; + UpdateRuntimeFilterParamsV2 params {request, attach_data, filters[0]->column_type()}; + std::shared_ptr filter_wrapper; RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(¶ms, &filter_wrapper)); std::ranges::for_each(filters, [&](auto& filter) { @@ -1149,8 +1148,6 @@ Status FragmentMgr::sync_filter_size(const PSyncFilterSizeRequest* request) { Status FragmentMgr::merge_filter(const PMergeFilterRequest* request, butil::IOBufAsZeroCopyInputStream* attach_data) { UniqueId queryid = request->query_id(); - std::shared_ptr filter_controller; - RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); std::shared_ptr query_ctx; { @@ -1165,7 +1162,9 @@ Status FragmentMgr::merge_filter(const PMergeFilterRequest* request, queryid.to_string()); } } - SCOPED_ATTACH_TASK_WITH_ID(query_ctx->query_mem_tracker, query_ctx->query_id()); + SCOPED_ATTACH_TASK(query_ctx.get()); + std::shared_ptr filter_controller; + RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); auto merge_status = filter_controller->merge(request, attach_data); return merge_status; } diff --git a/be/src/runtime/group_commit_mgr.cpp b/be/src/runtime/group_commit_mgr.cpp index 5f989da023b36a..6a6061d42cf9e2 100644 --- a/be/src/runtime/group_commit_mgr.cpp +++ b/be/src/runtime/group_commit_mgr.cpp @@ -436,12 +436,21 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.err_status", { status = Status::InternalError(""); }); if (status.ok()) { + DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.commit_error", + { status = Status::InternalError(""); }); // commit txn TLoadTxnCommitRequest request; request.__set_auth_code(0); // this is a fake, 
fe not check it now request.__set_db_id(db_id); request.__set_table_id(table_id); request.__set_txnId(txn_id); + request.__set_groupCommit(true); + request.__set_receiveBytes(state->num_bytes_load_total()); + if (_exec_env->master_info()->__isset.backend_id) { + request.__set_backendId(_exec_env->master_info()->backend_id); + } else { + LOG(WARNING) << "_exec_env->master_info not set backend_id"; + } if (state) { request.__set_commitInfos(state->tablet_commit_infos()); } diff --git a/be/src/runtime/load_channel.cpp b/be/src/runtime/load_channel.cpp index b7df43b65b165c..99f0a0b3d5bb95 100644 --- a/be/src/runtime/load_channel.cpp +++ b/be/src/runtime/load_channel.cpp @@ -29,6 +29,7 @@ #include "runtime/memory/mem_tracker.h" #include "runtime/tablets_channel.h" #include "runtime/thread_context.h" +#include "runtime/workload_group/workload_group_manager.h" namespace doris { @@ -46,7 +47,8 @@ LoadChannel::LoadChannel(const UniqueId& load_id, int64_t timeout_s, bool is_hig ExecEnv::GetInstance()->fragment_mgr()->get_or_erase_query_ctx_with_lock( _load_id.to_thrift()); if (query_context != nullptr) { - _query_thread_context = {_load_id.to_thrift(), query_context->query_mem_tracker}; + _query_thread_context = {_load_id.to_thrift(), query_context->query_mem_tracker, + query_context->workload_group()}; } else { _query_thread_context = { _load_id.to_thrift(), diff --git a/be/src/runtime/load_path_mgr.cpp b/be/src/runtime/load_path_mgr.cpp index f961fa9b7ecff3..f1899aadb28281 100644 --- a/be/src/runtime/load_path_mgr.cpp +++ b/be/src/runtime/load_path_mgr.cpp @@ -174,7 +174,7 @@ void LoadPathMgr::process_path(time_t now, const std::string& path, int64_t rese if (status.ok()) { LOG(INFO) << "Remove path success. path=" << path; } else { - LOG(WARNING) << "Remove path failed. path=" << path; + LOG(WARNING) << "Remove path failed. 
path=" << path << ", error=" << status; } } diff --git a/be/src/runtime/load_stream.cpp b/be/src/runtime/load_stream.cpp index 07d488e578fe12..c818c4664a0689 100644 --- a/be/src/runtime/load_stream.cpp +++ b/be/src/runtime/load_stream.cpp @@ -40,6 +40,7 @@ #include "runtime/load_channel.h" #include "runtime/load_stream_mgr.h" #include "runtime/load_stream_writer.h" +#include "runtime/workload_group/workload_group_manager.h" #include "util/debug_points.h" #include "util/runtime_profile.h" #include "util/thrift_util.h" @@ -84,6 +85,7 @@ Status TabletStream::init(std::shared_ptr schema, int64_t .load_id = _load_id, .table_schema_param = schema, // TODO(plat1ko): write_file_cache + .storage_vault_id {}, }; _load_stream_writer = std::make_shared(&req, _profile); @@ -360,7 +362,8 @@ LoadStream::LoadStream(PUniqueId load_id, LoadStreamMgr* load_stream_mgr, bool e std::shared_ptr query_context = ExecEnv::GetInstance()->fragment_mgr()->get_or_erase_query_ctx_with_lock(load_tid); if (query_context != nullptr) { - _query_thread_context = {load_tid, query_context->query_mem_tracker}; + _query_thread_context = {load_tid, query_context->query_mem_tracker, + query_context->workload_group()}; } else { _query_thread_context = {load_tid, MemTrackerLimiter::create_shared( MemTrackerLimiter::Type::LOAD, diff --git a/be/src/runtime/memory/cache_manager.cpp b/be/src/runtime/memory/cache_manager.cpp index 9bf3d1e12d0c8c..a6516c40a35770 100644 --- a/be/src/runtime/memory/cache_manager.cpp +++ b/be/src/runtime/memory/cache_manager.cpp @@ -48,24 +48,22 @@ int64_t CacheManager::for_each_cache_prune_stale(RuntimeProfile* profile) { return 0; } -int64_t CacheManager::for_each_cache_prune_all(RuntimeProfile* profile) { - if (need_prune(&_last_prune_all_timestamp, "all")) { +int64_t CacheManager::for_each_cache_prune_all(RuntimeProfile* profile, bool force) { + if (force || need_prune(&_last_prune_all_timestamp, "all")) { return for_each_cache_prune_stale_wrap( - [](CachePolicy* cache_policy) { 
cache_policy->prune_all(false); }, profile); + [force](CachePolicy* cache_policy) { cache_policy->prune_all(force); }, profile); } return 0; } -void CacheManager::clear_once() { +int64_t CacheManager::cache_prune_all(CachePolicy::CacheType type, bool force) { std::lock_guard l(_caches_lock); - for (const auto& pair : _caches) { - pair.second->prune_all(true); + auto* cache_policy = _caches[type]; + if (!cache_policy->enable_prune()) { + return -1; } -} - -void CacheManager::clear_once(CachePolicy::CacheType type) { - std::lock_guard l(_caches_lock); - _caches[type]->prune_all(true); // will print log + cache_policy->prune_all(force); + return cache_policy->profile()->get_counter("FreedMemory")->value(); } } // namespace doris diff --git a/be/src/runtime/memory/cache_manager.h b/be/src/runtime/memory/cache_manager.h index 20372366aa1a7d..d94dca501670bf 100644 --- a/be/src/runtime/memory/cache_manager.h +++ b/be/src/runtime/memory/cache_manager.h @@ -64,10 +64,9 @@ class CacheManager { int64_t for_each_cache_prune_stale(RuntimeProfile* profile = nullptr); - int64_t for_each_cache_prune_all(RuntimeProfile* profile = nullptr); - - void clear_once(); - void clear_once(CachePolicy::CacheType type); + // if force is true, regardless of the two prune interval and cache size, cache will be pruned this time. 
+ int64_t for_each_cache_prune_all(RuntimeProfile* profile = nullptr, bool force = false); + int64_t cache_prune_all(CachePolicy::CacheType type, bool force = false); bool need_prune(int64_t* last_timestamp, const std::string& type) { int64_t now = UnixSeconds(); diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index e59c5c7ac3e978..c457afd86898f2 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -47,6 +47,7 @@ class CachePolicy { CREATE_TABLET_RR_IDX_CACHE = 15, CLOUD_TABLET_CACHE = 16, CLOUD_TXN_DELETE_BITMAP_CACHE = 17, + NONE = 18, // not be used }; static std::string type_string(CacheType type) { @@ -94,6 +95,34 @@ class CachePolicy { __builtin_unreachable(); } + inline static std::unordered_map StringToType = { + {"DataPageCache", CacheType::DATA_PAGE_CACHE}, + {"IndexPageCache", CacheType::INDEXPAGE_CACHE}, + {"PKIndexPageCache", CacheType::PK_INDEX_PAGE_CACHE}, + {"SchemaCache", CacheType::SCHEMA_CACHE}, + {"SegmentCache", CacheType::SEGMENT_CACHE}, + {"InvertedIndexSearcherCache", CacheType::INVERTEDINDEX_SEARCHER_CACHE}, + {"InvertedIndexQueryCache", CacheType::INVERTEDINDEX_QUERY_CACHE}, + {"PointQueryLookupConnectionCache", CacheType::LOOKUP_CONNECTION_CACHE}, + {"PointQueryRowCache", CacheType::POINT_QUERY_ROW_CACHE}, + {"MowDeleteBitmapAggCache", CacheType::DELETE_BITMAP_AGG_CACHE}, + {"MowTabletVersionCache", CacheType::TABLET_VERSION_CACHE}, + {"LastSuccessChannelCache", CacheType::LAST_SUCCESS_CHANNEL_CACHE}, + {"CommonObjLRUCache", CacheType::COMMON_OBJ_LRU_CACHE}, + {"ForUT", CacheType::FOR_UT}, + {"TabletSchemaCache", CacheType::TABLET_SCHEMA_CACHE}, + {"CreateTabletRRIdxCache", CacheType::CREATE_TABLET_RR_IDX_CACHE}, + {"CloudTabletCache", CacheType::CLOUD_TABLET_CACHE}, + {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}}; + + static CacheType string_to_type(std::string type) { + if (StringToType.contains(type)) { + return 
StringToType[type]; + } else { + return CacheType::NONE; + } + } + CachePolicy(CacheType type, uint32_t stale_sweep_time_s, bool enable_prune); virtual ~CachePolicy(); diff --git a/be/src/runtime/memory/global_memory_arbitrator.cpp b/be/src/runtime/memory/global_memory_arbitrator.cpp index 35fa350987f34f..344bcbc59846d9 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.cpp +++ b/be/src/runtime/memory/global_memory_arbitrator.cpp @@ -40,7 +40,7 @@ std::atomic GlobalMemoryArbitrator::_s_process_reserved_memory = 0; std::atomic GlobalMemoryArbitrator::refresh_interval_memory_growth = 0; bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { - if (sys_mem_available() - bytes < MemInfo::sys_mem_available_low_water_mark()) { + if (sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark()) { return false; } int64_t old_reserved_mem = _s_process_reserved_memory.load(std::memory_order_relaxed); @@ -50,7 +50,7 @@ bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { if (UNLIKELY(vm_rss_sub_allocator_cache() + refresh_interval_memory_growth.load(std::memory_order_relaxed) + new_reserved_mem >= - MemInfo::mem_limit())) { + MemInfo::soft_mem_limit())) { return false; } } while (!_s_process_reserved_memory.compare_exchange_weak(old_reserved_mem, new_reserved_mem, diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index 46eddfa9810704..cc695a6fdd51e1 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -43,7 +43,17 @@ namespace doris { -bvar::Adder g_memtrackerlimiter_cnt("memtrackerlimiter_cnt"); +static bvar::Adder memory_memtrackerlimiter_cnt("memory_memtrackerlimiter_cnt"); +static bvar::Adder memory_all_trackers_sum_bytes("memory_all_trackers_sum_bytes"); +static bvar::Adder memory_global_trackers_sum_bytes("memory_global_trackers_sum_bytes"); +static bvar::Adder 
memory_query_trackers_sum_bytes("memory_query_trackers_sum_bytes"); +static bvar::Adder memory_load_trackers_sum_bytes("memory_load_trackers_sum_bytes"); +static bvar::Adder memory_compaction_trackers_sum_bytes( + "memory_compaction_trackers_sum_bytes"); +static bvar::Adder memory_schema_change_trackers_sum_bytes( + "memory_schema_change_trackers_sum_bytes"); +static bvar::Adder memory_other_trackers_sum_bytes("memory_other_trackers_sum_bytes"); + constexpr auto GC_MAX_SEEK_TRACKER = 1000; std::atomic MemTrackerLimiter::_enable_print_log_process_usage {true}; @@ -80,7 +90,7 @@ MemTrackerLimiter::MemTrackerLimiter(Type type, const std::string& label, int64_ if (_type == Type::LOAD || _type == Type::QUERY) { _query_statistics = std::make_shared(); } - g_memtrackerlimiter_cnt << 1; + memory_memtrackerlimiter_cnt << 1; } std::shared_ptr MemTrackerLimiter::create_shared(MemTrackerLimiter::Type type, @@ -137,7 +147,7 @@ MemTrackerLimiter::~MemTrackerLimiter() { << print_address_sanitizers(); #endif } - g_memtrackerlimiter_cnt << -1; + memory_memtrackerlimiter_cnt << -1; } #ifndef NDEBUG @@ -223,9 +233,40 @@ void MemTrackerLimiter::refresh_global_counter() { } } } + int64_t all_trackers_mem_sum = 0; for (auto it : type_mem_sum) { MemTrackerLimiter::TypeMemSum[it.first]->set(it.second); + all_trackers_mem_sum += it.second; + switch (it.first) { + case Type::GLOBAL: + memory_global_trackers_sum_bytes + << it.second - memory_global_trackers_sum_bytes.get_value(); + break; + case Type::QUERY: + memory_query_trackers_sum_bytes + << it.second - memory_query_trackers_sum_bytes.get_value(); + break; + case Type::LOAD: + memory_load_trackers_sum_bytes + << it.second - memory_load_trackers_sum_bytes.get_value(); + break; + case Type::COMPACTION: + memory_compaction_trackers_sum_bytes + << it.second - memory_compaction_trackers_sum_bytes.get_value(); + break; + case Type::SCHEMA_CHANGE: + memory_schema_change_trackers_sum_bytes + << it.second - 
memory_schema_change_trackers_sum_bytes.get_value(); + break; + case Type::OTHER: + memory_other_trackers_sum_bytes + << it.second - memory_other_trackers_sum_bytes.get_value(); + } } + all_trackers_mem_sum += MemInfo::allocator_cache_mem(); + all_trackers_mem_sum += MemInfo::allocator_metadata_mem(); + memory_all_trackers_sum_bytes << all_trackers_mem_sum - + memory_all_trackers_sum_bytes.get_value(); } void MemTrackerLimiter::clean_tracker_limiter_group() { @@ -248,7 +289,7 @@ void MemTrackerLimiter::clean_tracker_limiter_group() { void MemTrackerLimiter::make_process_snapshots(std::vector* snapshots) { MemTrackerLimiter::refresh_global_counter(); - int64_t all_tracker_mem_sum = 0; + int64_t all_trackers_mem_sum = 0; Snapshot snapshot; for (auto it : MemTrackerLimiter::TypeMemSum) { snapshot.type = "overview"; @@ -257,7 +298,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector snapshot.cur_consumption = it.second->current_value(); snapshot.peak_consumption = it.second->peak_value(); (*snapshots).emplace_back(snapshot); - all_tracker_mem_sum += it.second->current_value(); + all_trackers_mem_sum += it.second->current_value(); } snapshot.type = "overview"; @@ -266,7 +307,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector snapshot.cur_consumption = MemInfo::allocator_cache_mem(); snapshot.peak_consumption = -1; (*snapshots).emplace_back(snapshot); - all_tracker_mem_sum += MemInfo::allocator_cache_mem(); + all_trackers_mem_sum += MemInfo::allocator_cache_mem(); snapshot.type = "overview"; snapshot.label = "tc/jemalloc_metadata"; @@ -274,20 +315,28 @@ void MemTrackerLimiter::make_process_snapshots(std::vector snapshot.cur_consumption = MemInfo::allocator_metadata_mem(); snapshot.peak_consumption = -1; (*snapshots).emplace_back(snapshot); - all_tracker_mem_sum += MemInfo::allocator_metadata_mem(); + all_trackers_mem_sum += MemInfo::allocator_metadata_mem(); + + snapshot.type = "overview"; + snapshot.label = "reserved_memory"; + snapshot.limit = 
-1; + snapshot.cur_consumption = GlobalMemoryArbitrator::process_reserved_memory(); + snapshot.peak_consumption = -1; + (*snapshots).emplace_back(snapshot); + all_trackers_mem_sum += GlobalMemoryArbitrator::process_reserved_memory(); snapshot.type = "overview"; - snapshot.label = "sum of all trackers"; // is virtual memory + snapshot.label = "sum_of_all_trackers"; // is virtual memory snapshot.limit = -1; - snapshot.cur_consumption = all_tracker_mem_sum; + snapshot.cur_consumption = all_trackers_mem_sum; snapshot.peak_consumption = -1; (*snapshots).emplace_back(snapshot); snapshot.type = "overview"; #ifdef ADDRESS_SANITIZER - snapshot.label = "[ASAN]process resident memory"; // from /proc VmRSS VmHWM + snapshot.label = "[ASAN]VmRSS(process resident memory)"; // from /proc VmRSS VmHWM #else - snapshot.label = "process resident memory"; // from /proc VmRSS VmHWM + snapshot.label = "VmRSS(process resident memory)"; // from /proc VmRSS VmHWM #endif snapshot.limit = -1; snapshot.cur_consumption = PerfCounters::get_vm_rss(); @@ -295,14 +344,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector (*snapshots).emplace_back(snapshot); snapshot.type = "overview"; - snapshot.label = "reserve_memory"; - snapshot.limit = -1; - snapshot.cur_consumption = GlobalMemoryArbitrator::process_reserved_memory(); - snapshot.peak_consumption = -1; - (*snapshots).emplace_back(snapshot); - - snapshot.type = "overview"; - snapshot.label = "process virtual memory"; // from /proc VmSize VmPeak + snapshot.label = "VmSize(process virtual memory)"; // from /proc VmSize VmPeak snapshot.limit = -1; snapshot.cur_consumption = PerfCounters::get_vm_size(); snapshot.peak_consumption = PerfCounters::get_vm_peak(); diff --git a/be/src/runtime/memory/mem_tracker_limiter.h b/be/src/runtime/memory/mem_tracker_limiter.h index dd2b89029cb7d3..e5c5cb1bc0369c 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.h +++ b/be/src/runtime/memory/mem_tracker_limiter.h @@ -221,7 +221,7 @@ class 
MemTrackerLimiter final : public MemTracker { } // Iterator into mem_tracker_limiter_pool for this object. Stored to have O(1) remove. - std::list>::iterator tg_tracker_limiter_group_it; + std::list>::iterator wg_tracker_limiter_group_it; private: friend class ThreadMemTrackerMgr; diff --git a/be/src/runtime/memory/memory_reclamation.cpp b/be/src/runtime/memory/memory_reclamation.cpp index 536c4658c8c515..3adf1d1ac75718 100644 --- a/be/src/runtime/memory/memory_reclamation.cpp +++ b/be/src/runtime/memory/memory_reclamation.cpp @@ -47,7 +47,6 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { }}; freed_mem += CacheManager::instance()->for_each_cache_prune_stale(profile.get()); - MemInfo::notify_je_purge_dirty_pages(); if (freed_mem > MemInfo::process_minor_gc_size()) { return true; } @@ -98,7 +97,6 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { }}; freed_mem += CacheManager::instance()->for_each_cache_prune_all(profile.get()); - MemInfo::notify_je_purge_dirty_pages(); if (freed_mem > MemInfo::process_full_gc_size()) { return true; } diff --git a/be/src/runtime/memory/thread_mem_tracker_mgr.h b/be/src/runtime/memory/thread_mem_tracker_mgr.h index 9d36cd2d807813..d9c4e093a4acfb 100644 --- a/be/src/runtime/memory/thread_mem_tracker_mgr.h +++ b/be/src/runtime/memory/thread_mem_tracker_mgr.h @@ -33,6 +33,7 @@ #include "runtime/memory/global_memory_arbitrator.h" #include "runtime/memory/mem_tracker.h" #include "runtime/memory/mem_tracker_limiter.h" +#include "runtime/workload_group/workload_group.h" #include "util/stack_util.h" #include "util/uid_util.h" @@ -71,6 +72,10 @@ class ThreadMemTrackerMgr { TUniqueId query_id() { return _query_id; } + void set_wg_wptr(const std::weak_ptr& wg_wptr) { _wg_wptr = wg_wptr; } + + void reset_wg_wptr() { _wg_wptr.reset(); } + void start_count_scope_mem() { CHECK(init()); _scope_mem = _reserved_mem; // consume in advance @@ -151,6 +156,7 @@ class ThreadMemTrackerMgr { std::shared_ptr 
_limiter_tracker; MemTrackerLimiter* _limiter_tracker_raw = nullptr; std::vector _consumer_tracker_stack; + std::weak_ptr _wg_wptr; // If there is a memory new/delete operation in the consume method, it may enter infinite recursion. bool _stop_consume = false; @@ -236,8 +242,8 @@ inline void ThreadMemTrackerMgr::consume(int64_t size, int skip_large_memory_che flush_untracked_mem(); } - if (skip_large_memory_check == 0 && doris::config::large_memory_check_bytes > 0 && - size > doris::config::large_memory_check_bytes) { + if (skip_large_memory_check == 0 && doris::config::stacktrace_in_alloc_large_memory_bytes > 0 && + size > doris::config::stacktrace_in_alloc_large_memory_bytes) { _stop_consume = true; LOG(WARNING) << fmt::format( "malloc or new large memory: {}, {}, this is just a warning, not prevent memory " @@ -287,8 +293,16 @@ inline bool ThreadMemTrackerMgr::try_reserve(int64_t size) { if (!_limiter_tracker_raw->try_consume(size)) { return false; } + auto wg_ptr = _wg_wptr.lock(); + if (wg_ptr) { + if (!wg_ptr->add_wg_refresh_interval_memory_growth(size)) { + _limiter_tracker_raw->release(size); // rollback + return false; + } + } if (!doris::GlobalMemoryArbitrator::try_reserve_process_memory(size)) { - _limiter_tracker_raw->release(size); // rollback + _limiter_tracker_raw->release(size); // rollback + wg_ptr->sub_wg_refresh_interval_memory_growth(size); // rollback return false; } if (_count_scope_mem) { @@ -306,6 +320,10 @@ inline void ThreadMemTrackerMgr::release_reserved() { doris::GlobalMemoryArbitrator::release_process_reserved_memory(_reserved_mem + _untracked_mem); _limiter_tracker_raw->release(_reserved_mem); + auto wg_ptr = _wg_wptr.lock(); + if (!wg_ptr) { + wg_ptr->sub_wg_refresh_interval_memory_growth(_reserved_mem); + } if (_count_scope_mem) { _scope_mem -= _reserved_mem; } diff --git a/be/src/runtime/query_context.h b/be/src/runtime/query_context.h index b565214ef22082..1bbccfd33b7403 100644 --- a/be/src/runtime/query_context.h +++ 
b/be/src/runtime/query_context.h @@ -230,17 +230,8 @@ class QueryContext { return _running_big_mem_op_num.load(std::memory_order_relaxed); } - void set_weighted_mem(int64_t weighted_limit, int64_t weighted_consumption) { - std::lock_guard l(_weighted_mem_lock); - _weighted_consumption = weighted_consumption; - _weighted_limit = weighted_limit; - } - void get_weighted_mem_info(int64_t& weighted_limit, int64_t& weighted_consumption) { - std::lock_guard l(_weighted_mem_lock); - weighted_limit = _weighted_limit; - weighted_consumption = _weighted_consumption; - } - + void set_spill_threshold(int64_t spill_threshold) { _spill_threshold = spill_threshold; } + int64_t spill_threshold() { return _spill_threshold; } DescriptorTbl* desc_tbl = nullptr; bool set_rsc_info = false; std::string user; @@ -310,9 +301,7 @@ class QueryContext { std::map> _fragment_id_to_pipeline_ctx; std::mutex _pipeline_map_write_lock; - std::mutex _weighted_mem_lock; - int64_t _weighted_consumption = 0; - int64_t _weighted_limit = 0; + std::atomic _spill_threshold {0}; std::mutex _profile_mutex; diff --git a/be/src/runtime/query_statistics.cpp b/be/src/runtime/query_statistics.cpp index 126fb10af5bf0f..110efef5ab920f 100644 --- a/be/src/runtime/query_statistics.cpp +++ b/be/src/runtime/query_statistics.cpp @@ -27,22 +27,20 @@ namespace doris { void QueryStatistics::merge(const QueryStatistics& other) { - scan_rows += other.scan_rows.load(std::memory_order_relaxed); - scan_bytes += other.scan_bytes.load(std::memory_order_relaxed); - cpu_nanos += other.cpu_nanos.load(std::memory_order_relaxed); - shuffle_send_bytes += other.shuffle_send_bytes.load(std::memory_order_relaxed); - shuffle_send_rows += other.shuffle_send_rows.load(std::memory_order_relaxed); - _scan_bytes_from_local_storage += - other._scan_bytes_from_local_storage.load(std::memory_order_relaxed); - _scan_bytes_from_remote_storage += - other._scan_bytes_from_remote_storage.load(std::memory_order_relaxed); - - int64_t other_peak_mem = 
other.max_peak_memory_bytes.load(std::memory_order_relaxed); + scan_rows += other.scan_rows; + scan_bytes += other.scan_bytes; + cpu_nanos += other.cpu_nanos; + shuffle_send_bytes += other.shuffle_send_bytes; + shuffle_send_rows += other.shuffle_send_rows; + _scan_bytes_from_local_storage += other._scan_bytes_from_local_storage; + _scan_bytes_from_remote_storage += other._scan_bytes_from_remote_storage; + + int64_t other_peak_mem = other.max_peak_memory_bytes; if (other_peak_mem > this->max_peak_memory_bytes) { this->max_peak_memory_bytes = other_peak_mem; } - int64_t other_memory_used = other.current_used_memory_bytes.load(std::memory_order_relaxed); + int64_t other_memory_used = other.current_used_memory_bytes; if (other_memory_used > 0) { this->current_used_memory_bytes = other_memory_used; } @@ -61,15 +59,14 @@ void QueryStatistics::to_pb(PQueryStatistics* statistics) { void QueryStatistics::to_thrift(TQueryStatistics* statistics) const { DCHECK(statistics != nullptr); - statistics->__set_scan_bytes(scan_bytes.load(std::memory_order_relaxed)); - statistics->__set_scan_rows(scan_rows.load(std::memory_order_relaxed)); - statistics->__set_cpu_ms(cpu_nanos.load(std::memory_order_relaxed) / NANOS_PER_MILLIS); - statistics->__set_returned_rows(returned_rows.load(std::memory_order_relaxed)); - statistics->__set_max_peak_memory_bytes(max_peak_memory_bytes.load(std::memory_order_relaxed)); - statistics->__set_current_used_memory_bytes( - current_used_memory_bytes.load(std::memory_order_relaxed)); - statistics->__set_shuffle_send_bytes(shuffle_send_bytes.load(std::memory_order_relaxed)); - statistics->__set_shuffle_send_rows(shuffle_send_rows.load(std::memory_order_relaxed)); + statistics->__set_scan_bytes(scan_bytes); + statistics->__set_scan_rows(scan_rows); + statistics->__set_cpu_ms(cpu_nanos / NANOS_PER_MILLIS); + statistics->__set_returned_rows(returned_rows); + statistics->__set_max_peak_memory_bytes(max_peak_memory_bytes); + 
statistics->__set_current_used_memory_bytes(current_used_memory_bytes); + statistics->__set_shuffle_send_bytes(shuffle_send_bytes); + statistics->__set_shuffle_send_rows(shuffle_send_rows); statistics->__set_scan_bytes_from_remote_storage(_scan_bytes_from_remote_storage); statistics->__set_scan_bytes_from_local_storage(_scan_bytes_from_local_storage); } @@ -82,42 +79,6 @@ void QueryStatistics::from_pb(const PQueryStatistics& statistics) { _scan_bytes_from_remote_storage = statistics.scan_bytes_from_remote_storage(); } -void QueryStatistics::merge(QueryStatisticsRecvr* recvr) { - recvr->merge(this); -} - -void QueryStatistics::merge(QueryStatisticsRecvr* recvr, int sender_id) { - DCHECK(recvr != nullptr); - auto QueryStatisticsptr = recvr->find(sender_id); - if (QueryStatisticsptr) { - merge(*QueryStatisticsptr); - } -} - QueryStatistics::~QueryStatistics() {} -void QueryStatisticsRecvr::insert(const PQueryStatistics& statistics, int sender_id) { - std::lock_guard l(_lock); - if (!_query_statistics.contains(sender_id)) { - _query_statistics[sender_id] = std::make_shared(); - } - _query_statistics[sender_id]->from_pb(statistics); -} - -void QueryStatisticsRecvr::insert(QueryStatisticsPtr statistics, int sender_id) { - if (!statistics->collected()) return; - if (_query_statistics.contains(sender_id)) return; - std::lock_guard l(_lock); - _query_statistics[sender_id] = statistics; -} - -QueryStatisticsPtr QueryStatisticsRecvr::find(int sender_id) { - std::lock_guard l(_lock); - auto it = _query_statistics.find(sender_id); - if (it != _query_statistics.end()) { - return it->second; - } - return nullptr; -} - } // namespace doris diff --git a/be/src/runtime/query_statistics.h b/be/src/runtime/query_statistics.h index e71a136789a2c3..0a19dfd46f0a08 100644 --- a/be/src/runtime/query_statistics.h +++ b/be/src/runtime/query_statistics.h @@ -31,7 +31,6 @@ namespace doris { -class QueryStatisticsRecvr; class PNodeStatistics; class PQueryStatistics; @@ -53,82 +52,44 @@ class 
QueryStatistics { void merge(const QueryStatistics& other); - void add_scan_rows(int64_t delta_scan_rows) { - this->scan_rows.fetch_add(delta_scan_rows, std::memory_order_relaxed); - } + void add_scan_rows(int64_t delta_scan_rows) { scan_rows += delta_scan_rows; } - void add_scan_bytes(int64_t delta_scan_bytes) { - this->scan_bytes.fetch_add(delta_scan_bytes, std::memory_order_relaxed); - } + void add_scan_bytes(int64_t delta_scan_bytes) { scan_bytes += delta_scan_bytes; } - void add_cpu_nanos(int64_t delta_cpu_time) { - this->cpu_nanos.fetch_add(delta_cpu_time, std::memory_order_relaxed); - } + void add_cpu_nanos(int64_t delta_cpu_time) { cpu_nanos += delta_cpu_time; } - void add_shuffle_send_bytes(int64_t delta_bytes) { - this->shuffle_send_bytes.fetch_add(delta_bytes, std::memory_order_relaxed); - } + void add_shuffle_send_bytes(int64_t delta_bytes) { shuffle_send_bytes += delta_bytes; } - void add_shuffle_send_rows(int64_t delta_rows) { - this->shuffle_send_rows.fetch_add(delta_rows, std::memory_order_relaxed); - } + void add_shuffle_send_rows(int64_t delta_rows) { shuffle_send_rows += delta_rows; } void add_scan_bytes_from_local_storage(int64_t scan_bytes_from_local_storage) { - this->_scan_bytes_from_local_storage += scan_bytes_from_local_storage; + _scan_bytes_from_local_storage += scan_bytes_from_local_storage; } void add_scan_bytes_from_remote_storage(int64_t scan_bytes_from_remote_storage) { - this->_scan_bytes_from_remote_storage += scan_bytes_from_remote_storage; + _scan_bytes_from_remote_storage += scan_bytes_from_remote_storage; } - void add_returned_rows(int64_t num_rows) { - this->returned_rows.fetch_add(num_rows, std::memory_order_relaxed); - } + void add_returned_rows(int64_t num_rows) { returned_rows += num_rows; } void set_max_peak_memory_bytes(int64_t max_peak_memory_bytes) { - this->max_peak_memory_bytes.store(max_peak_memory_bytes, std::memory_order_relaxed); + this->max_peak_memory_bytes = max_peak_memory_bytes; } void 
set_current_used_memory_bytes(int64_t current_used_memory) { - this->current_used_memory_bytes.store(current_used_memory, std::memory_order_relaxed); - } - - void merge(QueryStatisticsRecvr* recvr); - - void merge(QueryStatisticsRecvr* recvr, int sender_id); - - void clearNodeStatistics(); - - void clear() { - scan_rows.store(0, std::memory_order_relaxed); - scan_bytes.store(0, std::memory_order_relaxed); - cpu_nanos.store(0, std::memory_order_relaxed); - shuffle_send_bytes.store(0, std::memory_order_relaxed); - shuffle_send_rows.store(0, std::memory_order_relaxed); - _scan_bytes_from_local_storage.store(0); - _scan_bytes_from_remote_storage.store(0); - - returned_rows.store(0, std::memory_order_relaxed); - max_peak_memory_bytes.store(0, std::memory_order_relaxed); - clearNodeStatistics(); - //clear() is used before collection, so calling "clear" is equivalent to being collected. - set_collected(); + current_used_memory_bytes = current_used_memory; } void to_pb(PQueryStatistics* statistics); void to_thrift(TQueryStatistics* statistics) const; void from_pb(const PQueryStatistics& statistics); bool collected() const { return _collected; } - void set_collected() { _collected = true; } - int64_t get_scan_rows() { return scan_rows.load(std::memory_order_relaxed); } - int64_t get_scan_bytes() { return scan_bytes.load(std::memory_order_relaxed); } - int64_t get_current_used_memory_bytes() { - return current_used_memory_bytes.load(std::memory_order_relaxed); - } + int64_t get_scan_rows() { return scan_rows; } + int64_t get_scan_bytes() { return scan_bytes; } + int64_t get_current_used_memory_bytes() { return current_used_memory_bytes; } private: - friend class QueryStatisticsRecvr; std::atomic scan_rows; std::atomic scan_bytes; std::atomic cpu_nanos; @@ -148,30 +109,5 @@ class QueryStatistics { }; using QueryStatisticsPtr = std::shared_ptr; // It is used for collecting sub plan query statistics in DataStreamRecvr. 
-class QueryStatisticsRecvr { -public: - ~QueryStatisticsRecvr() = default; - - // Transmitted via RPC, incurring serialization overhead. - void insert(const PQueryStatistics& statistics, int sender_id); - - // using local_exchange for transmission, only need to hold a shared pointer. - void insert(QueryStatisticsPtr statistics, int sender_id); - - QueryStatisticsPtr find(int sender_id); - -private: - friend class QueryStatistics; - - void merge(QueryStatistics* statistics) { - std::lock_guard l(_lock); - for (auto& pair : _query_statistics) { - statistics->merge(*(pair.second)); - } - } - - std::map _query_statistics; - std::mutex _lock; -}; } // namespace doris diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index 0e5b37c8ffa220..625b487d0ee1f3 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -58,8 +58,8 @@ RuntimeFilterMgr::~RuntimeFilterMgr() { _pool.clear(); } -Status RuntimeFilterMgr::get_consume_filters(const int filter_id, - std::vector& consumer_filters) { +Status RuntimeFilterMgr::get_consume_filters( + const int filter_id, std::vector>& consumer_filters) { std::lock_guard l(_lock); auto iter = _consumer_map.find(filter_id); if (iter == _consumer_map.end()) { @@ -74,7 +74,7 @@ Status RuntimeFilterMgr::get_consume_filters(const int filter_id, Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, int node_id, - IRuntimeFilter** consumer_filter, + std::shared_ptr* consumer_filter, bool build_bf_exactly, bool need_local_merge) { SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); int32_t key = desc.filter_id; @@ -91,10 +91,10 @@ Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc } if (!has_exist) { - IRuntimeFilter* filter; - RETURN_IF_ERROR(IRuntimeFilter::create(_state, &_pool, &desc, &options, - RuntimeFilterRole::CONSUMER, node_id, &filter, - build_bf_exactly, need_local_merge)); + 
std::shared_ptr filter; + RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::CONSUMER, + node_id, &filter, build_bf_exactly, + need_local_merge)); _consumer_map[key].emplace_back(node_id, filter); *consumer_filter = filter; } else if (!need_local_merge) { @@ -106,7 +106,7 @@ Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc Status RuntimeFilterMgr::register_local_merge_producer_filter( const doris::TRuntimeFilterDesc& desc, const doris::TQueryOptions& options, - doris::IRuntimeFilter** producer_filter, bool build_bf_exactly) { + std::shared_ptr* producer_filter, bool build_bf_exactly) { SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); int32_t key = desc.filter_id; @@ -121,14 +121,13 @@ Status RuntimeFilterMgr::register_local_merge_producer_filter( } DCHECK(_state != nullptr); - RETURN_IF_ERROR(IRuntimeFilter::create(_state, &_pool, &desc, &options, - RuntimeFilterRole::PRODUCER, -1, producer_filter, - build_bf_exactly, true)); + RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::PRODUCER, -1, + producer_filter, build_bf_exactly, true)); { std::lock_guard l(*iter->second.lock); if (iter->second.filters.empty()) { - IRuntimeFilter* merge_filter = nullptr; - RETURN_IF_ERROR(IRuntimeFilter::create(_state, &_pool, &desc, &options, + std::shared_ptr merge_filter; + RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::PRODUCER, -1, &merge_filter, build_bf_exactly, true)); iter->second.filters.emplace_back(merge_filter); @@ -158,7 +157,7 @@ Status RuntimeFilterMgr::get_local_merge_producer_filters( Status RuntimeFilterMgr::register_producer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, - IRuntimeFilter** producer_filter, + std::shared_ptr* producer_filter, bool build_bf_exactly) { SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); int32_t key = desc.filter_id; @@ -169,9 +168,8 @@ Status RuntimeFilterMgr::register_producer_filter(const 
TRuntimeFilterDesc& desc if (iter != _producer_map.end()) { return Status::InvalidArgument("filter has registed"); } - RETURN_IF_ERROR(IRuntimeFilter::create(_state, &_pool, &desc, &options, - RuntimeFilterRole::PRODUCER, -1, producer_filter, - build_bf_exactly)); + RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::PRODUCER, -1, + producer_filter, build_bf_exactly)); _producer_map.emplace(key, *producer_filter); return Status::OK(); } @@ -179,9 +177,9 @@ Status RuntimeFilterMgr::register_producer_filter(const TRuntimeFilterDesc& desc Status RuntimeFilterMgr::update_filter(const PPublishFilterRequest* request, butil::IOBufAsZeroCopyInputStream* data) { SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); - UpdateRuntimeFilterParams params(request, data, &_pool); + UpdateRuntimeFilterParams params(request, data); int filter_id = request->filter_id(); - std::vector filters; + std::vector> filters; // The code is organized for upgrade compatibility to prevent infinite waiting // old way update filter the code should be deleted after the upgrade is complete. 
{ @@ -196,7 +194,7 @@ Status RuntimeFilterMgr::update_filter(const PPublishFilterRequest* request, } iter->second.clear(); } - for (auto* filter : filters) { + for (auto filter : filters) { RETURN_IF_ERROR(filter->update_filter(¶ms)); } @@ -233,8 +231,7 @@ Status RuntimeFilterMergeControllerEntity::_init_with_desc( cnt_val->runtime_filter_desc = *runtime_filter_desc; cnt_val->target_info = *target_info; cnt_val->pool.reset(new ObjectPool()); - cnt_val->filter = cnt_val->pool->add( - new IRuntimeFilter(_state, &_state->get_query_ctx()->obj_pool, runtime_filter_desc)); + cnt_val->filter = cnt_val->pool->add(new IRuntimeFilter(_state, runtime_filter_desc)); auto filter_id = runtime_filter_desc->filter_id; RETURN_IF_ERROR(cnt_val->filter->init_with_desc(&cnt_val->runtime_filter_desc, query_options, @@ -254,8 +251,7 @@ Status RuntimeFilterMergeControllerEntity::_init_with_desc( cnt_val->runtime_filter_desc = *runtime_filter_desc; cnt_val->targetv2_info = *targetv2_info; cnt_val->pool.reset(new ObjectPool()); - cnt_val->filter = cnt_val->pool->add( - new IRuntimeFilter(_state, &_state->get_query_ctx()->obj_pool, runtime_filter_desc)); + cnt_val->filter = cnt_val->pool->add(new IRuntimeFilter(_state, runtime_filter_desc)); auto filter_id = runtime_filter_desc->filter_id; RETURN_IF_ERROR(cnt_val->filter->init_with_desc(&cnt_val->runtime_filter_desc, query_options)); @@ -355,7 +351,7 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz } Status RuntimeFilterMgr::sync_filter_size(const PSyncFilterSizeRequest* request) { - auto* filter = try_get_product_filter(request->filter_id()); + auto filter = try_get_product_filter(request->filter_id()); if (filter) { filter->set_synced_size(request->filter_size()); return Status::OK(); @@ -397,9 +393,8 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ return Status::OK(); } MergeRuntimeFilterParams params(request, attach_data); - ObjectPool* pool = cnt_val->pool.get(); 
RuntimeFilterWrapperHolder holder; - RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(¶ms, pool, holder.getHandle())); + RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(¶ms, holder.getHandle())); auto st = cnt_val->filter->merge_from(holder.getHandle()->get()); if (!st) { diff --git a/be/src/runtime/runtime_filter_mgr.h b/be/src/runtime/runtime_filter_mgr.h index 9b0216e07786d6..d89a3b9f1b1768 100644 --- a/be/src/runtime/runtime_filter_mgr.h +++ b/be/src/runtime/runtime_filter_mgr.h @@ -59,7 +59,7 @@ struct LocalMergeFilters { int merge_time = 0; int merge_size_times = 0; uint64_t local_merged_size = 0; - std::vector filters; + std::vector> filters; }; /// producer: @@ -81,9 +81,10 @@ class RuntimeFilterMgr { ~RuntimeFilterMgr(); - Status get_consume_filters(const int filter_id, std::vector& consumer_filters); + Status get_consume_filters(const int filter_id, + std::vector>& consumer_filters); - IRuntimeFilter* try_get_product_filter(const int filter_id) { + std::shared_ptr try_get_product_filter(const int filter_id) { std::lock_guard l(_lock); auto iter = _producer_map.find(filter_id); if (iter == _producer_map.end()) { @@ -94,18 +95,18 @@ class RuntimeFilterMgr { // register filter Status register_consumer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, - int node_id, IRuntimeFilter** consumer_filter, + int node_id, std::shared_ptr* consumer_filter, bool build_bf_exactly = false, bool need_local_merge = false); Status register_local_merge_producer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, - IRuntimeFilter** producer_filter, + std::shared_ptr* producer_filter, bool build_bf_exactly = false); Status get_local_merge_producer_filters(int filter_id, LocalMergeFilters** local_merge_filters); Status register_producer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, - IRuntimeFilter** producer_filter, + std::shared_ptr* producer_filter, bool build_bf_exactly = false); // update filter by remote @@ -121,13 
+122,13 @@ class RuntimeFilterMgr { private: struct ConsumerFilterHolder { int node_id; - IRuntimeFilter* filter = nullptr; + std::shared_ptr filter; }; // RuntimeFilterMgr is owned by RuntimeState, so we only // use filter_id as key // key: "filter-id" std::map> _consumer_map; - std::map _producer_map; + std::map> _producer_map; std::map _local_merge_producer_map; RuntimeFilterParamsContext* _state = nullptr; diff --git a/be/src/runtime/runtime_predicate.cpp b/be/src/runtime/runtime_predicate.cpp index b746ffabb3b23b..23130811a8ccfb 100644 --- a/be/src/runtime/runtime_predicate.cpp +++ b/be/src/runtime/runtime_predicate.cpp @@ -92,6 +92,12 @@ std::string get_datetime_value(const Field& field) { return cast_to_string(value, 0); } +std::string get_time_value(const Field& field) { + using ValueType = typename PrimitiveTypeTraits::CppType; + ValueType value = field.get(); + return cast_to_string(value, 0); +} + std::string get_decimalv2_value(const Field& field) { // can NOT use PrimitiveTypeTraits::CppType since // it is DecimalV2Value and Decimal128V2 can not convert to it implicitly @@ -158,6 +164,10 @@ bool RuntimePredicate::_init(PrimitiveType type) { _get_value_fn = get_datetime_value; break; } + case PrimitiveType::TYPE_TIMEV2: { + _get_value_fn = get_time_value; + break; + } case PrimitiveType::TYPE_DECIMAL32: { _get_value_fn = get_decimal_value; break; diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 48d71a64eb2557..7ccf73e2f0d506 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -70,7 +71,6 @@ RuntimeState::RuntimeState(const TPlanFragmentExecParams& fragment_exec_params, _num_finished_scan_range(0), _normal_row_number(0), _error_row_number(0), - _error_log_file(nullptr), _query_ctx(ctx) { Status status = init(fragment_exec_params.fragment_instance_id, query_options, query_globals, exec_env); @@ -87,11 +87,6 @@ 
RuntimeState::RuntimeState(const TPlanFragmentExecParams& fragment_exec_params, } #endif DCHECK(_query_mem_tracker != nullptr && _query_mem_tracker->label() != "Orphan"); - if (ctx) { - _runtime_filter_mgr = std::make_unique( - fragment_exec_params.query_id, RuntimeFilterParamsContext::create(this), - _query_mem_tracker); - } if (fragment_exec_params.__isset.runtime_filter_params) { _query_ctx->runtime_filter_mgr()->set_runtime_filter_params( fragment_exec_params.runtime_filter_params); @@ -117,7 +112,6 @@ RuntimeState::RuntimeState(const TUniqueId& instance_id, const TUniqueId& query_ _num_finished_scan_range(0), _normal_row_number(0), _error_row_number(0), - _error_log_file(nullptr), _query_ctx(ctx) { [[maybe_unused]] auto status = init(instance_id, query_options, query_globals, exec_env); DCHECK(status.ok()); @@ -128,8 +122,6 @@ RuntimeState::RuntimeState(const TUniqueId& instance_id, const TUniqueId& query_ } #endif DCHECK(_query_mem_tracker != nullptr && _query_mem_tracker->label() != "Orphan"); - _runtime_filter_mgr.reset(new RuntimeFilterMgr( - query_id, RuntimeFilterParamsContext::create(this), _query_mem_tracker)); } RuntimeState::RuntimeState(pipeline::PipelineFragmentContext*, const TUniqueId& instance_id, @@ -153,7 +145,6 @@ RuntimeState::RuntimeState(pipeline::PipelineFragmentContext*, const TUniqueId& _num_finished_scan_range(0), _normal_row_number(0), _error_row_number(0), - _error_log_file(nullptr), _query_ctx(ctx) { [[maybe_unused]] auto status = init(instance_id, query_options, query_globals, exec_env); _query_mem_tracker = ctx->query_mem_tracker; @@ -185,7 +176,6 @@ RuntimeState::RuntimeState(const TUniqueId& query_id, int32_t fragment_id, _num_finished_scan_range(0), _normal_row_number(0), _error_row_number(0), - _error_log_file(nullptr), _query_ctx(ctx) { // TODO: do we really need instance id? 
Status status = init(TUniqueId(), query_options, query_globals, exec_env); @@ -197,8 +187,6 @@ RuntimeState::RuntimeState(const TUniqueId& query_id, int32_t fragment_id, } #endif DCHECK(_query_mem_tracker != nullptr && _query_mem_tracker->label() != "Orphan"); - _runtime_filter_mgr.reset(new RuntimeFilterMgr( - query_id, RuntimeFilterParamsContext::create(this), _query_mem_tracker)); } RuntimeState::RuntimeState(const TQueryGlobals& query_globals) @@ -255,26 +243,9 @@ RuntimeState::~RuntimeState() { // close error log file if (_error_log_file != nullptr && _error_log_file->is_open()) { _error_log_file->close(); - delete _error_log_file; - _error_log_file = nullptr; - if (_s3_error_fs) { - std::string error_log_absolute_path = - _exec_env->load_path_mgr()->get_load_error_absolute_path(_error_log_file_path); - // upload error log file to s3 - Status st = _s3_error_fs->upload(error_log_absolute_path, _s3_error_log_file_path); - if (st.ok()) { - // remove local error log file - std::filesystem::remove(error_log_absolute_path); - } else { - // remove local error log file later by clean_expired_temp_path thread - LOG(WARNING) << "Fail to upload error file to s3, error_log_file_path=" - << _error_log_file_path << ", error=" << st; - } - } } _obj_pool->clear(); - _runtime_filter_mgr.reset(); } Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOptions& query_options, @@ -394,7 +365,7 @@ Status RuntimeState::create_error_log_file() { _db_name, _import_label, _fragment_instance_id, &_error_log_file_path)); std::string error_log_absolute_path = _exec_env->load_path_mgr()->get_load_error_absolute_path(_error_log_file_path); - _error_log_file = new std::ofstream(error_log_absolute_path, std::ifstream::out); + _error_log_file = std::make_unique(error_log_absolute_path, std::ifstream::out); if (!_error_log_file->is_open()) { std::stringstream error_msg; error_msg << "Fail to open error file: [" << _error_log_file_path << "]."; @@ -420,8 +391,6 @@ Status 
RuntimeState::append_error_msg_to_file(std::function line, LOG(WARNING) << "Create error file log failed. because: " << status; if (_error_log_file != nullptr) { _error_log_file->close(); - delete _error_log_file; - _error_log_file = nullptr; } return status; } @@ -464,13 +433,28 @@ Status RuntimeState::append_error_msg_to_file(std::function line, return Status::OK(); } -std::string RuntimeState::get_error_log_file_path() const { - if (_s3_error_fs) { +std::string RuntimeState::get_error_log_file_path() { + if (_s3_error_fs && _error_log_file && _error_log_file->is_open()) { + // close error log file + _error_log_file->close(); + std::string error_log_absolute_path = + _exec_env->load_path_mgr()->get_load_error_absolute_path(_error_log_file_path); + // upload error log file to s3 + Status st = _s3_error_fs->upload(error_log_absolute_path, _s3_error_log_file_path); + if (st.ok()) { + // remove local error log file + std::filesystem::remove(error_log_absolute_path); + } else { + // upload failed and return local error log file path + LOG(WARNING) << "Fail to upload error file to s3, error_log_file_path=" + << _error_log_file_path << ", error=" << st; + return _error_log_file_path; + } // expiration must be less than a week (in seconds) for presigned url static const unsigned EXPIRATION_SECONDS = 7 * 24 * 60 * 60 - 1; // We should return a public endpoint to user. 
- return _s3_error_fs->generate_presigned_url(_s3_error_log_file_path, EXPIRATION_SECONDS, - true); + _error_log_file_path = _s3_error_fs->generate_presigned_url(_s3_error_log_file_path, + EXPIRATION_SECONDS, true); } return _error_log_file_path; } @@ -539,10 +523,9 @@ RuntimeFilterMgr* RuntimeState::global_runtime_filter_mgr() { return _query_ctx->runtime_filter_mgr(); } -Status RuntimeState::register_producer_runtime_filter(const doris::TRuntimeFilterDesc& desc, - bool need_local_merge, - doris::IRuntimeFilter** producer_filter, - bool build_bf_exactly) { +Status RuntimeState::register_producer_runtime_filter( + const TRuntimeFilterDesc& desc, bool need_local_merge, + std::shared_ptr* producer_filter, bool build_bf_exactly) { if (desc.has_remote_targets || need_local_merge) { return global_runtime_filter_mgr()->register_local_merge_producer_filter( desc, query_options(), producer_filter, build_bf_exactly); @@ -552,9 +535,9 @@ Status RuntimeState::register_producer_runtime_filter(const doris::TRuntimeFilte } } -Status RuntimeState::register_consumer_runtime_filter(const doris::TRuntimeFilterDesc& desc, - bool need_local_merge, int node_id, - doris::IRuntimeFilter** consumer_filter) { +Status RuntimeState::register_consumer_runtime_filter( + const doris::TRuntimeFilterDesc& desc, bool need_local_merge, int node_id, + std::shared_ptr* consumer_filter) { if (desc.has_remote_targets || need_local_merge) { return global_runtime_filter_mgr()->register_consumer_filter(desc, query_options(), node_id, consumer_filter, false, true); diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index e89e7be66f5744..d0c2f93103e8b5 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -263,7 +263,7 @@ class RuntimeState { int64_t load_job_id() const { return _load_job_id; } - std::string get_error_log_file_path() const; + std::string get_error_log_file_path(); // append error msg and error line to file when loading data. 
// is_summary is true, means we are going to write the summary line @@ -344,8 +344,6 @@ class RuntimeState { return _query_options.disable_stream_preaggregations; } - bool enable_spill() const { return _query_options.enable_spilling; } - int32_t runtime_filter_wait_time_ms() const { return _query_options.runtime_filter_wait_time_ms; } @@ -446,18 +444,12 @@ class RuntimeState { // local runtime filter mgr, the runtime filter do not have remote target or // not need local merge should regist here. the instance exec finish, the local // runtime filter mgr can release the memory of local runtime filter - RuntimeFilterMgr* local_runtime_filter_mgr() { - if (_pipeline_x_runtime_filter_mgr) { - return _pipeline_x_runtime_filter_mgr; - } else { - return _runtime_filter_mgr.get(); - } - } + RuntimeFilterMgr* local_runtime_filter_mgr() { return _runtime_filter_mgr; } RuntimeFilterMgr* global_runtime_filter_mgr(); - void set_pipeline_x_runtime_filter_mgr(RuntimeFilterMgr* pipeline_x_runtime_filter_mgr) { - _pipeline_x_runtime_filter_mgr = pipeline_x_runtime_filter_mgr; + void set_runtime_filter_mgr(RuntimeFilterMgr* runtime_filter_mgr) { + _runtime_filter_mgr = runtime_filter_mgr; } QueryContext* get_query_ctx() { return _query_ctx; } @@ -508,17 +500,6 @@ class RuntimeState { : 0; } - int repeat_max_num() const { -#ifndef BE_TEST - if (!_query_options.__isset.repeat_max_num) { - return 10000; - } - return _query_options.repeat_max_num; -#else - return 10; -#endif - } - int64_t external_sort_bytes_threshold() const { if (_query_options.__isset.external_sort_bytes_threshold) { return _query_options.external_sort_bytes_threshold; @@ -528,12 +509,6 @@ class RuntimeState { void set_be_exec_version(int32_t version) noexcept { _query_options.be_exec_version = version; } - int64_t external_agg_bytes_threshold() const { - return _query_options.__isset.external_agg_bytes_threshold - ? 
_query_options.external_agg_bytes_threshold - : 0; - } - inline bool enable_delete_sub_pred_v2() const { return _query_options.__isset.enable_delete_sub_predicate_v2 && _query_options.enable_delete_sub_predicate_v2; @@ -572,12 +547,12 @@ class RuntimeState { Status register_producer_runtime_filter(const doris::TRuntimeFilterDesc& desc, bool need_local_merge, - doris::IRuntimeFilter** producer_filter, + std::shared_ptr* producer_filter, bool build_bf_exactly); Status register_consumer_runtime_filter(const doris::TRuntimeFilterDesc& desc, bool need_local_merge, int node_id, - doris::IRuntimeFilter** producer_filter); + std::shared_ptr* producer_filter); bool is_nereids() const; bool enable_join_spill() const { @@ -606,9 +581,9 @@ class RuntimeState { int64_t min_revocable_mem() const { if (_query_options.__isset.min_revocable_mem) { - return _query_options.min_revocable_mem; + return std::max(_query_options.min_revocable_mem, (int64_t)1); } - return 0; + return 1; } void set_max_operator_id(int max_operator_id) { _max_operator_id = max_operator_id; } @@ -653,11 +628,8 @@ class RuntimeState { const DescriptorTbl* _desc_tbl = nullptr; std::shared_ptr _obj_pool; - // runtime filter - std::unique_ptr _runtime_filter_mgr; - // owned by PipelineFragmentContext - RuntimeFilterMgr* _pipeline_x_runtime_filter_mgr = nullptr; + RuntimeFilterMgr* _runtime_filter_mgr = nullptr; // Lock protecting _error_log and _unreported_error_idx std::mutex _error_log_lock; @@ -721,7 +693,7 @@ class RuntimeState { int64_t _normal_row_number; int64_t _error_row_number; std::string _error_log_file_path; - std::ofstream* _error_log_file = nullptr; // error file path, absolute path + std::unique_ptr _error_log_file; // error file path, absolute path std::vector _tablet_commit_infos; std::vector _error_tablet_infos; int _max_operator_id = 0; diff --git a/be/src/runtime/snapshot_loader.cpp b/be/src/runtime/snapshot_loader.cpp index b840636a46f0fb..f34dfde229abc3 100644 --- 
a/be/src/runtime/snapshot_loader.cpp +++ b/be/src/runtime/snapshot_loader.cpp @@ -458,7 +458,6 @@ Status SnapshotLoader::remote_http_download( } // Step 3: Validate remote tablet snapshot paths && remote files map - // TODO(Drogon): Add md5sum check // key is remote snapshot paths, value is filelist // get all these use http download action // http://172.16.0.14:6781/api/_tablet/_download?token=e804dd27-86da-4072-af58-70724075d2a4&file=/home/ubuntu/doris_master/output/be/storage/snapshot/20230410102306.9.180//2774718/217609978/2774718.hdr diff --git a/be/src/runtime/stream_load/stream_load_context.h b/be/src/runtime/stream_load/stream_load_context.h index db210b350e723b..633c3af428b94e 100644 --- a/be/src/runtime/stream_load/stream_load_context.h +++ b/be/src/runtime/stream_load/stream_load_context.h @@ -123,7 +123,7 @@ class StreamLoadContext { public: static const int default_txn_id = -1; // load type, eg: ROUTINE LOAD/MANUAL LOAD - TLoadType::type load_type; + TLoadType::type load_type = TLoadType::type::MANUL_LOAD; // load data source: eg: KAFKA/RAW TLoadSourceType::type load_src_type; diff --git a/be/src/runtime/stream_load/stream_load_executor.cpp b/be/src/runtime/stream_load/stream_load_executor.cpp index 2bd1c16199dd19..28b0556aafdd2c 100644 --- a/be/src/runtime/stream_load/stream_load_executor.cpp +++ b/be/src/runtime/stream_load/stream_load_executor.cpp @@ -90,9 +90,7 @@ Status StreamLoadExecutor::execute_plan_fragment(std::shared_ptrnumber_filtered_rows > 0 && !state->get_error_log_file_path().empty()) { - ctx->error_url = to_load_error_http_path(state->get_error_log_file_path()); - } + ctx->error_url = to_load_error_http_path(state->get_error_log_file_path()); if (status->ok()) { DorisMetrics::instance()->stream_receive_bytes_total->increment(ctx->receive_bytes); diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index 11ddf27cfcdb8e..eac9693d8390da 100644 --- a/be/src/runtime/tablets_channel.cpp +++ 
b/be/src/runtime/tablets_channel.cpp @@ -133,8 +133,9 @@ Status BaseTabletsChannel::open(const PTabletWriterOpenRequest& request) { if (_state == kOpened || _state == kFinished) { return Status::OK(); } - LOG(INFO) << fmt::format("open tablets channel of index {}, tablets num: {} timeout(s): {}", - _index_id, request.tablets().size(), request.load_channel_timeout_s()); + LOG(INFO) << fmt::format("open tablets channel {}, tablets num: {} timeout(s): {}", + _key.to_string(), request.tablets().size(), + request.load_channel_timeout_s()); _txn_id = request.txn_id(); _index_id = request.index_id(); _schema = std::make_shared(); @@ -215,6 +216,7 @@ Status BaseTabletsChannel::incremental_open(const PTabletWriterOpenRequest& para ss << "LocalTabletsChannel txn_id: " << _txn_id << " load_id: " << print_id(params.id()) << " incremental open delta writer: "; + // every change will hold _lock. this find in under _lock too. so no need _tablet_writers_lock again. for (const auto& tablet : params.tablets()) { if (_tablet_writers.find(tablet.tablet_id()) != _tablet_writers.end()) { continue; @@ -237,6 +239,7 @@ Status BaseTabletsChannel::incremental_open(const PTabletWriterOpenRequest& para auto delta_writer = create_delta_writer(wrequest); { + // here we modify _tablet_writers. so need lock. std::lock_guard l(_tablet_writers_lock); _tablet_writers.emplace(tablet.tablet_id(), std::move(delta_writer)); } @@ -291,6 +294,7 @@ Status TabletsChannel::close(LoadChannel* parent, const PTabletWriterAddBlockReq // All senders are closed // 1. close all delta writers std::set need_wait_writers; + // under _lock. no need _tablet_writers_lock again. for (auto&& [tablet_id, writer] : _tablet_writers) { if (_partition_ids.contains(writer->partition_id())) { auto st = writer->close(); @@ -492,6 +496,7 @@ Status BaseTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& req #endif int tablet_cnt = 0; + // under _lock. no need _tablet_writers_lock again. 
for (const auto& tablet : request.tablets()) { if (_tablet_writers.find(tablet.tablet_id()) != _tablet_writers.end()) { continue; @@ -574,6 +579,11 @@ Status BaseTabletsChannel::_write_block_data( std::function write_func) { google::protobuf::RepeatedPtrField* tablet_errors = response->mutable_tablet_errors(); + + // add_batch may concurrency with inc_open but not under _lock. + // so need to protect it with _tablet_writers_lock. + std::lock_guard l(_tablet_writers_lock); + auto tablet_writer_it = _tablet_writers.find(tablet_id); if (tablet_writer_it == _tablet_writers.end()) { return Status::InternalError("unknown tablet to append data, tablet={}", tablet_id); diff --git a/be/src/runtime/tablets_channel.h b/be/src/runtime/tablets_channel.h index 48e987341587d7..87fbf9d06aaaa7 100644 --- a/be/src/runtime/tablets_channel.h +++ b/be/src/runtime/tablets_channel.h @@ -143,11 +143,8 @@ class BaseTabletsChannel { // id of this load channel TabletsChannelKey _key; - // make execute sequence + // protect _state change. open and close. when add_batch finished, lock to change _next_seqs also std::mutex _lock; - - SpinLock _tablet_writers_lock; - enum State { kInitialized, kOpened, @@ -173,8 +170,10 @@ class BaseTabletsChannel { // currently it's OK. Status _close_status; - // tablet_id -> TabletChannel + // tablet_id -> TabletChannel. it will only be changed in open() or inc_open() std::unordered_map> _tablet_writers; + // protect _tablet_writers + SpinLock _tablet_writers_lock; // broken tablet ids. // If a tablet write fails, it's id will be added to this set. // So that following batch will not handle this tablet anymore. 
diff --git a/be/src/runtime/thread_context.cpp b/be/src/runtime/thread_context.cpp index 6f69eb9e13445f..c89f532e5927a6 100644 --- a/be/src/runtime/thread_context.cpp +++ b/be/src/runtime/thread_context.cpp @@ -18,7 +18,9 @@ #include "runtime/thread_context.h" #include "common/signal_handler.h" +#include "runtime/query_context.h" #include "runtime/runtime_state.h" +#include "runtime/workload_group/workload_group_manager.h" namespace doris { class MemTracker; @@ -26,34 +28,38 @@ class MemTracker; QueryThreadContext ThreadContext::query_thread_context() { DCHECK(doris::pthread_context_ptr_init); ORPHAN_TRACKER_CHECK(); - return {_task_id, thread_mem_tracker_mgr->limiter_mem_tracker()}; + return {_task_id, thread_mem_tracker_mgr->limiter_mem_tracker(), _wg_wptr}; } -AttachTask::AttachTask(const std::shared_ptr& mem_tracker, - const TUniqueId& task_id) { +void AttachTask::init(const QueryThreadContext& query_thread_context) { ThreadLocalHandle::create_thread_local_if_not_exits(); - signal::set_signal_task_id(task_id); - thread_context()->attach_task(task_id, mem_tracker); + signal::set_signal_task_id(query_thread_context.query_id); + thread_context()->attach_task(query_thread_context.query_id, + query_thread_context.query_mem_tracker, + query_thread_context.wg_wptr); } AttachTask::AttachTask(const std::shared_ptr& mem_tracker) { - ThreadLocalHandle::create_thread_local_if_not_exits(); - signal::set_signal_task_id(TUniqueId()); - thread_context()->attach_task(TUniqueId(), mem_tracker); + QueryThreadContext query_thread_context = {TUniqueId(), mem_tracker}; + init(query_thread_context); } AttachTask::AttachTask(RuntimeState* runtime_state) { - ThreadLocalHandle::create_thread_local_if_not_exits(); - signal::set_signal_task_id(runtime_state->query_id()); signal::set_signal_is_nereids(runtime_state->is_nereids()); - thread_context()->attach_task(runtime_state->query_id(), runtime_state->query_mem_tracker()); + QueryThreadContext query_thread_context = 
{runtime_state->query_id(), + runtime_state->query_mem_tracker(), + runtime_state->get_query_ctx()->workload_group()}; + init(query_thread_context); } AttachTask::AttachTask(const QueryThreadContext& query_thread_context) { - ThreadLocalHandle::create_thread_local_if_not_exits(); - signal::set_signal_task_id(query_thread_context.query_id); - thread_context()->attach_task(query_thread_context.query_id, - query_thread_context.query_mem_tracker); + init(query_thread_context); +} + +AttachTask::AttachTask(QueryContext* query_ctx) { + QueryThreadContext query_thread_context = {query_ctx->query_id(), query_ctx->query_mem_tracker, + query_ctx->workload_group()}; + init(query_thread_context); } AttachTask::~AttachTask() { diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 45f64a3739ae1a..b009affa53fc7c 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -45,8 +45,6 @@ // This will save some info about a working thread in the thread context. // Looking forward to tracking memory during thread execution into MemTrackerLimiter. #define SCOPED_ATTACH_TASK(arg1) auto VARNAME_LINENUM(attach_task) = AttachTask(arg1) -#define SCOPED_ATTACH_TASK_WITH_ID(arg1, arg2) \ - auto VARNAME_LINENUM(attach_task) = AttachTask(arg1, arg2) // Switch MemTrackerLimiter for count memory during thread execution. // Used after SCOPED_ATTACH_TASK, in order to count the memory into another @@ -74,8 +72,6 @@ // thread context need to be initialized, required by Allocator and elsewhere. #define SCOPED_ATTACH_TASK(arg1, ...) 
\ auto VARNAME_LINENUM(scoped_tls_at) = doris::ScopedInitThreadContext() -#define SCOPED_ATTACH_TASK_WITH_ID(arg1, arg2) \ - auto VARNAME_LINENUM(scoped_tls_atwi) = doris::ScopedInitThreadContext() #define SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(arg1) \ auto VARNAME_LINENUM(scoped_tls_stmtl) = doris::ScopedInitThreadContext() #define SCOPED_CONSUME_MEM_TRACKER(mem_tracker) \ @@ -124,6 +120,7 @@ class ThreadContext; class MemTracker; class RuntimeState; class QueryThreadContext; +class WorkloadGroup; extern bthread_key_t btls_key; @@ -158,7 +155,8 @@ class ThreadContext { ~ThreadContext() = default; void attach_task(const TUniqueId& task_id, - const std::shared_ptr& mem_tracker) { + const std::shared_ptr& mem_tracker, + const std::weak_ptr& wg_wptr) { // will only attach_task at the beginning of the thread function, there should be no duplicate attach_task. DCHECK(mem_tracker); // Orphan is thread default tracker. @@ -166,16 +164,20 @@ class ThreadContext { << ", thread mem tracker label: " << thread_mem_tracker()->label() << ", attach mem tracker label: " << mem_tracker->label(); _task_id = task_id; + _wg_wptr = wg_wptr; thread_mem_tracker_mgr->attach_limiter_tracker(mem_tracker); thread_mem_tracker_mgr->set_query_id(_task_id); + thread_mem_tracker_mgr->set_wg_wptr(_wg_wptr); thread_mem_tracker_mgr->enable_wait_gc(); thread_mem_tracker_mgr->reset_query_cancelled_flag(false); } void detach_task() { _task_id = TUniqueId(); + _wg_wptr.reset(); thread_mem_tracker_mgr->detach_limiter_tracker(); thread_mem_tracker_mgr->set_query_id(TUniqueId()); + thread_mem_tracker_mgr->reset_wg_wptr(); thread_mem_tracker_mgr->disable_wait_gc(); } @@ -226,12 +228,22 @@ class ThreadContext { thread_mem_tracker_mgr->release_reserved(); } + std::weak_ptr workload_group() { return _wg_wptr; } + + std::shared_ptr io_throttle(const std::string& data_dir) { + if (std::shared_ptr wg_ptr = _wg_wptr.lock()) { + return wg_ptr->get_scan_io_throttle(data_dir); + } + return nullptr; + } + int 
thread_local_handle_count = 0; int skip_memory_check = 0; int skip_large_memory_check = 0; private: TUniqueId _task_id; + std::weak_ptr _wg_wptr; }; class ThreadLocalHandle { @@ -290,7 +302,7 @@ class ThreadLocalHandle { }; // must call create_thread_local_if_not_exits() before use thread_context(). -static ThreadContext* thread_context() { +static ThreadContext* thread_context(bool allow_return_null = false) { if (pthread_context_ptr_init) { // in pthread DCHECK(bthread_self() == 0); @@ -304,6 +316,9 @@ static ThreadContext* thread_context() { DCHECK(bthread_context != nullptr); return bthread_context; } + if (allow_return_null) { + return nullptr; + } // It means that use thread_context() but this thread not attached a query/load using SCOPED_ATTACH_TASK macro. LOG(FATAL) << "__builtin_unreachable, " << doris::memory_orphan_check_msg; __builtin_unreachable(); @@ -313,6 +328,11 @@ static ThreadContext* thread_context() { class QueryThreadContext { public: QueryThreadContext() = default; + QueryThreadContext(const TUniqueId& query_id, + const std::shared_ptr& mem_tracker, + const std::weak_ptr& wg_wptr) + : query_id(query_id), query_mem_tracker(mem_tracker), wg_wptr(wg_wptr) {} + // If use WorkloadGroup and can get WorkloadGroup ptr, must as a parameter. 
QueryThreadContext(const TUniqueId& query_id, const std::shared_ptr& mem_tracker) : query_id(query_id), query_mem_tracker(mem_tracker) {} @@ -324,6 +344,7 @@ class QueryThreadContext { ORPHAN_TRACKER_CHECK(); query_id = doris::thread_context()->task_id(); query_mem_tracker = doris::thread_context()->thread_mem_tracker_mgr->limiter_mem_tracker(); + wg_wptr = doris::thread_context()->workload_group(); #else query_id = TUniqueId(); query_mem_tracker = doris::ExecEnv::GetInstance()->orphan_mem_tracker(); @@ -332,6 +353,7 @@ class QueryThreadContext { TUniqueId query_id; std::shared_ptr query_mem_tracker; + std::weak_ptr wg_wptr; }; class ScopeMemCountByHook { @@ -363,15 +385,18 @@ class ScopedInitThreadContext { class AttachTask { public: - explicit AttachTask(const std::shared_ptr& mem_tracker, - const TUniqueId& task_id); - + // not query or load, initialize with memory tracker, empty query id and default normal workload group. explicit AttachTask(const std::shared_ptr& mem_tracker); + // is query or load, initialize with memory tracker, query id and workload group wptr. 
explicit AttachTask(RuntimeState* runtime_state); + explicit AttachTask(QueryContext* query_ctx); + explicit AttachTask(const QueryThreadContext& query_thread_context); + void init(const QueryThreadContext& query_thread_context); + ~AttachTask(); }; @@ -386,7 +411,8 @@ class SwitchThreadMemTrackerLimiter { explicit SwitchThreadMemTrackerLimiter(const QueryThreadContext& query_thread_context) { ThreadLocalHandle::create_thread_local_if_not_exits(); - DCHECK(thread_context()->task_id() == query_thread_context.query_id); + DCHECK(thread_context()->task_id() == + query_thread_context.query_id); // workload group alse not change DCHECK(query_thread_context.query_mem_tracker); _old_mem_tracker = thread_context()->thread_mem_tracker_mgr->limiter_mem_tracker(); thread_context()->thread_mem_tracker_mgr->attach_limiter_tracker( diff --git a/be/src/runtime/workload_group/workload_group.cpp b/be/src/runtime/workload_group/workload_group.cpp index 64a5c7aeffb8b6..29c6fc7ae1b80d 100644 --- a/be/src/runtime/workload_group/workload_group.cpp +++ b/be/src/runtime/workload_group/workload_group.cpp @@ -27,12 +27,14 @@ #include #include "common/logging.h" +#include "io/fs/local_file_reader.h" #include "olap/storage_engine.h" #include "pipeline/task_queue.h" #include "pipeline/task_scheduler.h" #include "runtime/exec_env.h" #include "runtime/memory/global_memory_arbitrator.h" #include "runtime/memory/mem_tracker_limiter.h" +#include "runtime/workload_management/io_throttle.h" #include "util/mem_info.h" #include "util/parse_util.h" #include "util/runtime_profile.h" @@ -62,7 +64,15 @@ WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& tg_info) _max_remote_scan_thread_num(tg_info.max_remote_scan_thread_num), _min_remote_scan_thread_num(tg_info.min_remote_scan_thread_num), _spill_low_watermark(tg_info.spill_low_watermark), - _spill_high_watermark(tg_info.spill_high_watermark) {} + _spill_high_watermark(tg_info.spill_high_watermark), + 
_scan_bytes_per_second(tg_info.read_bytes_per_second), + _remote_scan_bytes_per_second(tg_info.remote_read_bytes_per_second) { + std::vector& data_dir_list = io::BeConfDataDirReader::be_config_data_dir_list; + for (const auto& data_dir : data_dir_list) { + _scan_io_throttle_map[data_dir.path] = std::make_shared(); + } + _remote_scan_io_throttle = std::make_shared(); +} std::string WorkloadGroup::debug_string() const { std::shared_lock rl {_mutex}; @@ -70,11 +80,13 @@ std::string WorkloadGroup::debug_string() const { "TG[id = {}, name = {}, cpu_share = {}, memory_limit = {}, enable_memory_overcommit = " "{}, version = {}, cpu_hard_limit = {}, scan_thread_num = " "{}, max_remote_scan_thread_num = {}, min_remote_scan_thread_num = {}, " - "spill_low_watermark={}, spill_high_watermark={}, is_shutdown={}, query_num={}]", + "spill_low_watermark={}, spill_high_watermark={}, is_shutdown={}, query_num={}, " + "read_bytes_per_second={}, remote_read_bytes_per_second={}]", _id, _name, cpu_share(), PrettyPrinter::print(_memory_limit, TUnit::BYTES), _enable_memory_overcommit ? 
"true" : "false", _version, cpu_hard_limit(), _scan_thread_num, _max_remote_scan_thread_num, _min_remote_scan_thread_num, - _spill_low_watermark, _spill_high_watermark, _is_shutdown, _query_ctxs.size()); + _spill_low_watermark, _spill_high_watermark, _is_shutdown, _query_ctxs.size(), + _scan_bytes_per_second, _remote_scan_bytes_per_second); } void WorkloadGroup::check_and_update(const WorkloadGroupInfo& tg_info) { @@ -101,45 +113,63 @@ void WorkloadGroup::check_and_update(const WorkloadGroupInfo& tg_info) { _min_remote_scan_thread_num = tg_info.min_remote_scan_thread_num; _spill_low_watermark = tg_info.spill_low_watermark; _spill_high_watermark = tg_info.spill_high_watermark; + _scan_bytes_per_second = tg_info.read_bytes_per_second; + _remote_scan_bytes_per_second = tg_info.remote_read_bytes_per_second; } else { return; } } } -int64_t WorkloadGroup::memory_used() { +int64_t WorkloadGroup::make_memory_tracker_snapshots( + std::list>* tracker_snapshots) { int64_t used_memory = 0; for (auto& mem_tracker_group : _mem_tracker_limiter_pool) { std::lock_guard l(mem_tracker_group.group_lock); for (const auto& trackerWptr : mem_tracker_group.trackers) { auto tracker = trackerWptr.lock(); CHECK(tracker != nullptr); + if (tracker_snapshots != nullptr) { + tracker_snapshots->insert(tracker_snapshots->end(), tracker); + } used_memory += tracker->consumption(); } } + refresh_memory(used_memory); return used_memory; } -void WorkloadGroup::set_weighted_memory_used(int64_t wg_total_mem_used, double ratio) { - _weighted_mem_used.store(int64_t(wg_total_mem_used * ratio), std::memory_order_relaxed); +int64_t WorkloadGroup::memory_used() { + return make_memory_tracker_snapshots(nullptr); +} + +void WorkloadGroup::refresh_memory(int64_t used_memory) { + // refresh total memory used. + _total_mem_used = used_memory; + // reserve memory is recorded in the query mem tracker + // and _total_mem_used already contains all the current reserve memory. 
+ // so after refreshing _total_mem_used, reset _wg_refresh_interval_memory_growth. + _wg_refresh_interval_memory_growth.store(0.0); } void WorkloadGroup::add_mem_tracker_limiter(std::shared_ptr mem_tracker_ptr) { + std::unique_lock wlock(_mutex); auto group_num = mem_tracker_ptr->group_num(); std::lock_guard l(_mem_tracker_limiter_pool[group_num].group_lock); - mem_tracker_ptr->tg_tracker_limiter_group_it = + mem_tracker_ptr->wg_tracker_limiter_group_it = _mem_tracker_limiter_pool[group_num].trackers.insert( _mem_tracker_limiter_pool[group_num].trackers.end(), mem_tracker_ptr); } void WorkloadGroup::remove_mem_tracker_limiter(std::shared_ptr mem_tracker_ptr) { + std::unique_lock wlock(_mutex); auto group_num = mem_tracker_ptr->group_num(); std::lock_guard l(_mem_tracker_limiter_pool[group_num].group_lock); - if (mem_tracker_ptr->tg_tracker_limiter_group_it != + if (mem_tracker_ptr->wg_tracker_limiter_group_it != _mem_tracker_limiter_pool[group_num].trackers.end()) { _mem_tracker_limiter_pool[group_num].trackers.erase( - mem_tracker_ptr->tg_tracker_limiter_group_it); - mem_tracker_ptr->tg_tracker_limiter_group_it = + mem_tracker_ptr->wg_tracker_limiter_group_it); + mem_tracker_ptr->wg_tracker_limiter_group_it = _mem_tracker_limiter_pool[group_num].trackers.end(); } } @@ -252,7 +282,7 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( if (tworkload_group_info.__isset.id) { tg_id = tworkload_group_info.id; } else { - return {.valid = false}; + return {.name = "", .valid = false}; } // 2 name @@ -266,7 +296,7 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( if (tworkload_group_info.__isset.version) { version = tworkload_group_info.version; } else { - return {.valid = false}; + return {.name {}, .valid = false}; } // 4 cpu_share @@ -316,7 +346,7 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( } // 11 min remote scan thread num - int min_remote_scan_thread_num = vectorized::ScannerScheduler::get_remote_scan_thread_num(); + int 
min_remote_scan_thread_num = config::doris_scanner_min_thread_pool_thread_num; if (tworkload_group_info.__isset.min_remote_scan_thread_num && tworkload_group_info.min_remote_scan_thread_num > 0) { min_remote_scan_thread_num = tworkload_group_info.min_remote_scan_thread_num; @@ -334,19 +364,33 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( spill_high_watermark = tworkload_group_info.spill_threshold_high_watermark; } - return {tg_id, - name, - cpu_share, - mem_limit, - enable_memory_overcommit, - version, - cpu_hard_limit, - enable_cpu_hard_limit, - scan_thread_num, - max_remote_scan_thread_num, - min_remote_scan_thread_num, - spill_low_watermark, - spill_high_watermark}; + // 14 scan io + int read_bytes_per_second = -1; + if (tworkload_group_info.__isset.read_bytes_per_second) { + read_bytes_per_second = tworkload_group_info.read_bytes_per_second; + } + + // 15 remote scan io + int remote_read_bytes_per_second = -1; + if (tworkload_group_info.__isset.remote_read_bytes_per_second) { + remote_read_bytes_per_second = tworkload_group_info.remote_read_bytes_per_second; + } + + return {.id = tg_id, + .name = name, + .cpu_share = cpu_share, + .memory_limit = mem_limit, + .enable_memory_overcommit = enable_memory_overcommit, + .version = version, + .cpu_hard_limit = cpu_hard_limit, + .enable_cpu_hard_limit = enable_cpu_hard_limit, + .scan_thread_num = scan_thread_num, + .max_remote_scan_thread_num = max_remote_scan_thread_num, + .min_remote_scan_thread_num = min_remote_scan_thread_num, + .spill_low_watermark = spill_low_watermark, + .spill_high_watermark = spill_high_watermark, + .read_bytes_per_second = read_bytes_per_second, + .remote_read_bytes_per_second = remote_read_bytes_per_second}; } void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* exec_env) { @@ -415,7 +459,8 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e std::unique_ptr remote_scan_scheduler = std::make_unique("RScan_" + tg_name, 
cg_cpu_ctl_ptr); - Status ret = remote_scan_scheduler->start(remote_max_thread_num, remote_max_thread_num, + Status ret = remote_scan_scheduler->start(remote_max_thread_num, + config::doris_scanner_min_thread_pool_thread_num, remote_scan_thread_queue_size); if (ret.ok()) { _remote_scan_task_sched = std::move(remote_scan_scheduler); @@ -520,6 +565,25 @@ std::string WorkloadGroup::thread_debug_info() { return str; } +void WorkloadGroup::upsert_scan_io_throttle(WorkloadGroupInfo* tg_info) { + for (const auto& [key, io_throttle] : _scan_io_throttle_map) { + io_throttle->set_io_bytes_per_second(tg_info->read_bytes_per_second); + } + + _remote_scan_io_throttle->set_io_bytes_per_second(tg_info->remote_read_bytes_per_second); +} + +std::shared_ptr WorkloadGroup::get_scan_io_throttle(const std::string& disk_dir) { + if (disk_dir == io::FileReader::VIRTUAL_REMOTE_DATA_DIR) { + return _remote_scan_io_throttle; + } + auto find_ret = _scan_io_throttle_map.find(disk_dir); + if (find_ret != _scan_io_throttle_map.end()) { + return find_ret->second; + } + return nullptr; +} + void WorkloadGroup::try_stop_schedulers() { std::lock_guard wlock(_task_sched_lock); if (_task_sched) { diff --git a/be/src/runtime/workload_group/workload_group.h b/be/src/runtime/workload_group/workload_group.h index a82efab09043a3..6b87a1cf7e5e69 100644 --- a/be/src/runtime/workload_group/workload_group.h +++ b/be/src/runtime/workload_group/workload_group.h @@ -40,6 +40,7 @@ class ThreadPool; class ExecEnv; class CgroupCpuCtl; class QueryContext; +class IOThrottle; namespace vectorized { class SimplifiedScanScheduler; @@ -76,7 +77,18 @@ class WorkloadGroup : public std::enable_shared_from_this { return _memory_limit; }; + int64_t weighted_memory_limit() const { return _weighted_memory_limit; }; + + void set_weighted_memory_limit(int64_t weighted_memory_limit) { + _weighted_memory_limit = weighted_memory_limit; + } + + // make memory snapshots and refresh total memory used at the same time. 
+ int64_t make_memory_tracker_snapshots( + std::list>* tracker_snapshots); + // call make_memory_tracker_snapshots, so also refresh total memory used. int64_t memory_used(); + void refresh_memory(int64_t used_memory); int spill_threshold_low_water_mark() const { return _spill_low_watermark.load(std::memory_order_relaxed); @@ -85,16 +97,30 @@ class WorkloadGroup : public std::enable_shared_from_this { return _spill_high_watermark.load(std::memory_order_relaxed); } - void set_weighted_memory_used(int64_t wg_total_mem_used, double ratio); + void set_weighted_memory_ratio(double ratio); + bool add_wg_refresh_interval_memory_growth(int64_t size) { + auto realtime_total_mem_used = _total_mem_used + _wg_refresh_interval_memory_growth.load(); + if ((realtime_total_mem_used > + ((double)_weighted_memory_limit * + _spill_high_watermark.load(std::memory_order_relaxed) / 100))) { + return false; + } else { + _wg_refresh_interval_memory_growth.fetch_add(size); + return true; + } + } + void sub_wg_refresh_interval_memory_growth(int64_t size) { + _wg_refresh_interval_memory_growth.fetch_sub(size); + } void check_mem_used(bool* is_low_wartermark, bool* is_high_wartermark) const { - auto weighted_mem_used = _weighted_mem_used.load(std::memory_order_relaxed); - *is_low_wartermark = - (weighted_mem_used > ((double)_memory_limit * - _spill_low_watermark.load(std::memory_order_relaxed) / 100)); - *is_high_wartermark = - (weighted_mem_used > ((double)_memory_limit * - _spill_high_watermark.load(std::memory_order_relaxed) / 100)); + auto realtime_total_mem_used = _total_mem_used + _wg_refresh_interval_memory_growth.load(); + *is_low_wartermark = (realtime_total_mem_used > + ((double)_weighted_memory_limit * + _spill_low_watermark.load(std::memory_order_relaxed) / 100)); + *is_high_wartermark = (realtime_total_mem_used > + ((double)_weighted_memory_limit * + _spill_high_watermark.load(std::memory_order_relaxed) / 100)); } std::string debug_string() const; @@ -137,7 +163,7 @@ class 
WorkloadGroup : public std::enable_shared_from_this { bool can_be_dropped() { std::shared_lock r_lock(_mutex); - return _is_shutdown && _query_ctxs.size() == 0; + return _is_shutdown && _query_ctxs.empty(); } int query_num() { @@ -163,15 +189,22 @@ class WorkloadGroup : public std::enable_shared_from_this { std::string thread_debug_info(); + std::shared_ptr get_scan_io_throttle(const std::string& disk_dir); + + void upsert_scan_io_throttle(WorkloadGroupInfo* tg_info); + private: mutable std::shared_mutex _mutex; // lock _name, _version, _cpu_share, _memory_limit const uint64_t _id; std::string _name; int64_t _version; int64_t _memory_limit; // bytes - // `_weighted_mem_used` is a rough memory usage in this group, - // because we can only get a precise memory usage by MemTracker which is not include page cache. - std::atomic_int64_t _weighted_mem_used = 0; // bytes + // `weighted_memory_limit` less than or equal to _memory_limit, calculate after exclude public memory. + // more detailed description in `refresh_wg_weighted_memory_limit`. + std::atomic _weighted_memory_limit {0}; // + // last value of make_memory_tracker_snapshots, refresh every time make_memory_tracker_snapshots is called. 
+ std::atomic_int64_t _total_mem_used = 0; // bytes + std::atomic_int64_t _wg_refresh_interval_memory_growth; bool _enable_memory_overcommit; std::atomic _cpu_share; std::vector _mem_tracker_limiter_pool; @@ -181,6 +214,8 @@ class WorkloadGroup : public std::enable_shared_from_this { std::atomic _min_remote_scan_thread_num; std::atomic _spill_low_watermark; std::atomic _spill_high_watermark; + std::atomic _scan_bytes_per_second {-1}; + std::atomic _remote_scan_bytes_per_second {-1}; // means workload group is mark dropped // new query can not submit @@ -194,6 +229,9 @@ class WorkloadGroup : public std::enable_shared_from_this { std::unique_ptr _scan_task_sched {nullptr}; std::unique_ptr _remote_scan_task_sched {nullptr}; std::unique_ptr _memtable_flush_pool {nullptr}; + + std::map> _scan_io_throttle_map; + std::shared_ptr _remote_scan_io_throttle {nullptr}; }; using WorkloadGroupPtr = std::shared_ptr; @@ -212,6 +250,8 @@ struct WorkloadGroupInfo { const int min_remote_scan_thread_num = 0; const int spill_low_watermark = 0; const int spill_high_watermark = 0; + const int read_bytes_per_second = -1; + const int remote_read_bytes_per_second = -1; // log cgroup cpu info uint64_t cgroup_cpu_shares = 0; int cgroup_cpu_hard_limit = 0; diff --git a/be/src/runtime/workload_group/workload_group_manager.cpp b/be/src/runtime/workload_group/workload_group_manager.cpp index 6813bfd3b75130..6a196497e724a8 100644 --- a/be/src/runtime/workload_group/workload_group_manager.cpp +++ b/be/src/runtime/workload_group/workload_group_manager.cpp @@ -149,117 +149,110 @@ void WorkloadGroupMgr::delete_workload_group_by_ids(std::set used_wg_i struct WorkloadGroupMemInfo { int64_t total_mem_used = 0; - int64_t weighted_mem_used = 0; - bool is_low_wartermark = false; - bool is_high_wartermark = false; - double mem_used_ratio = 0; + std::list> tracker_snapshots = + std::list>(); }; -void WorkloadGroupMgr::refresh_wg_memory_info() { - std::shared_lock r_lock(_group_mutex); - // workload group id 
-> workload group queries - std::unordered_map>> - all_wg_queries; - for (auto& [wg_id, wg] : _workload_groups) { - all_wg_queries.insert({wg_id, wg->queries()}); - } - int64_t all_queries_mem_used = 0; +void WorkloadGroupMgr::refresh_wg_weighted_memory_limit() { + std::shared_lock r_lock(_group_mutex); - // calculate total memory used of each workload group and total memory used of all queries + // 1. make all workload groups memory snapshots(refresh workload groups total memory used at the same time) + // and calculate total memory used of all queries. + int64_t all_workload_groups_mem_usage = 0; std::unordered_map wgs_mem_info; - for (auto& [wg_id, wg_queries] : all_wg_queries) { - int64_t wg_total_mem_used = 0; - for (const auto& [query_id, query_ctx_ptr] : wg_queries) { - if (auto query_ctx = query_ctx_ptr.lock()) { - wg_total_mem_used += query_ctx->query_mem_tracker->consumption(); - } - } - all_queries_mem_used += wg_total_mem_used; - wgs_mem_info[wg_id] = {wg_total_mem_used}; + for (auto& [wg_id, wg] : _workload_groups) { + wgs_mem_info[wg_id].total_mem_used = + wg->make_memory_tracker_snapshots(&wgs_mem_info[wg_id].tracker_snapshots); + all_workload_groups_mem_usage += wgs_mem_info[wg_id].total_mem_used; } - - auto process_memory_usage = GlobalMemoryArbitrator::process_memory_usage(); - if (all_queries_mem_used <= 0) { + if (all_workload_groups_mem_usage <= 0) { return; } - all_queries_mem_used = std::min(process_memory_usage, all_queries_mem_used); - - // process memory used is actually bigger than all_queries_mem_used, - // because memory of page cache, allocator cache, segment cache etc. are included - // in proc_vm_rss. - // we count these cache memories equally on workload groups. 
- double ratio = (double)process_memory_usage / (double)all_queries_mem_used; - if (ratio <= 1.25) { - std::string debug_msg = - fmt::format("\nProcess Memory Summary: {}, {}, all quries mem: {}", - doris::GlobalMemoryArbitrator::process_memory_used_details_str(), - doris::GlobalMemoryArbitrator::sys_mem_available_details_str(), - PrettyPrinter::print(all_queries_mem_used, TUnit::BYTES)); - LOG_EVERY_T(INFO, 10) << debug_msg; + // 2. calculate weighted memory limit ratio. + // when construct workload group, mem_limit is equal to (process_memory_limit * group_limit_percent), + // here, it is assumed that the available memory of workload groups is equal to process_memory_limit. + // + // but process_memory_usage is actually bigger than all_workload_groups_mem_usage, + // because public_memory of page cache, allocator cache, segment cache etc. are included in process_memory_usage. + // so actual available memory of the workload groups is equal to (process_memory_limit - public_memory) + // + // we will exclude this public_memory when calculate workload group mem_limit. + // so a ratio is calculated to multiply the workload group mem_limit from the previous construction. + auto process_memory_usage = GlobalMemoryArbitrator::process_memory_usage(); + auto process_memory_limit = MemInfo::mem_limit(); + double weighted_memory_limit_ratio = 1; + // if all_workload_groups_mem_usage is greater than process_memory_usage, it means that the memory statistics + // of the workload group are inaccurate. + // the reason is that query/load/etc. tracked is virtual memory, and virtual memory is not used in time. + // + // At this time, weighted_memory_limit_ratio is equal to 1, and workload group mem_limit is still equal to + // (process_memory_limit * group_limit_percent), this may cause query spill to occur earlier, + // However, there is no good solution at present, but we cannot predict when these virtual memory will be used. 
+ if (all_workload_groups_mem_usage < process_memory_usage) { + int64_t public_memory = process_memory_usage - all_workload_groups_mem_usage; + weighted_memory_limit_ratio = 1 - (double)public_memory / (double)process_memory_limit; } - for (auto& wg : _workload_groups) { - auto wg_mem_limit = wg.second->memory_limit(); - auto& wg_mem_info = wgs_mem_info[wg.first]; - wg_mem_info.weighted_mem_used = int64_t(wg_mem_info.total_mem_used * ratio); - wg_mem_info.mem_used_ratio = (double)wg_mem_info.weighted_mem_used / wg_mem_limit; - - wg.second->set_weighted_memory_used(wg_mem_info.total_mem_used, ratio); - - auto spill_low_water_mark = wg.second->spill_threshold_low_water_mark(); - auto spill_high_water_mark = wg.second->spill_threashold_high_water_mark(); - wg_mem_info.is_high_wartermark = (wg_mem_info.weighted_mem_used > - ((double)wg_mem_limit * spill_high_water_mark / 100)); - wg_mem_info.is_low_wartermark = (wg_mem_info.weighted_mem_used > - ((double)wg_mem_limit * spill_low_water_mark / 100)); - - // calculate query weighted memory limit of task group - const auto& wg_queries = all_wg_queries[wg.first]; - auto wg_query_count = wg_queries.size(); - int64_t query_weighted_mem_limit = - wg_query_count ? 
(wg_mem_limit + wg_query_count) / wg_query_count : wg_mem_limit; + std::string debug_msg = fmt::format( + "\nProcess Memory Summary: {}, {}, all workload groups memory usage: {}, " + "weighted_memory_limit_ratio: {}", + doris::GlobalMemoryArbitrator::process_memory_used_details_str(), + doris::GlobalMemoryArbitrator::sys_mem_available_details_str(), + PrettyPrinter::print(all_workload_groups_mem_usage, TUnit::BYTES), + weighted_memory_limit_ratio); + LOG_EVERY_T(INFO, 10) << debug_msg; - std::string debug_msg; - if (wg_mem_info.is_high_wartermark || wg_mem_info.is_low_wartermark) { - debug_msg = fmt::format( - "\nWorkload Group {}: mem limit: {}, mem used: {}, weighted mem used: {}, used " - "ratio: {}, query " - "count: {}, query_weighted_mem_limit: {}", - wg.second->name(), PrettyPrinter::print(wg_mem_limit, TUnit::BYTES), - PrettyPrinter::print(wg_mem_info.total_mem_used, TUnit::BYTES), - PrettyPrinter::print(wg_mem_info.weighted_mem_used, TUnit::BYTES), - wg_mem_info.mem_used_ratio, wg_query_count, - PrettyPrinter::print(query_weighted_mem_limit, TUnit::BYTES)); + for (auto& wg : _workload_groups) { + // 3.1 calculate query spill threshold of task group + auto wg_weighted_mem_limit = + int64_t(wg.second->memory_limit() * weighted_memory_limit_ratio); + wg.second->set_weighted_memory_limit(wg_weighted_mem_limit); - debug_msg += "\n Query Memory Summary:"; - } else { - continue; - } - // check whether queries need to revoke memory for task group - for (const auto& query : wg_queries) { + // 3.2 set workload groups weighted memory limit and all query spill threshold. + auto wg_query_count = wgs_mem_info[wg.first].tracker_snapshots.size(); + int64_t query_spill_threshold = + wg_query_count ? 
(wg_weighted_mem_limit + wg_query_count) / wg_query_count + : wg_weighted_mem_limit; + for (const auto& query : wg.second->queries()) { auto query_ctx = query.second.lock(); if (!query_ctx) { continue; } - auto query_consumption = query_ctx->query_mem_tracker->consumption(); - auto query_weighted_consumption = int64_t(query_consumption * ratio); - query_ctx->set_weighted_mem(query_weighted_mem_limit, query_weighted_consumption); + query_ctx->set_spill_threshold(query_spill_threshold); + } - if (wg_mem_info.is_high_wartermark || wg_mem_info.is_low_wartermark) { + // 3.3 only print debug logs, if workload groups is_high_wartermark or is_low_wartermark. + bool is_low_wartermark = false; + bool is_high_wartermark = false; + wg.second->check_mem_used(&is_low_wartermark, &is_high_wartermark); + std::string debug_msg; + if (is_high_wartermark || is_low_wartermark) { + debug_msg = fmt::format( + "\nWorkload Group {}: mem limit: {}, mem used: {}, weighted mem limit: {}, " + "used " + "ratio: {}, query count: {}, query spill threshold: {}", + wg.second->name(), + PrettyPrinter::print(wg.second->memory_limit(), TUnit::BYTES), + PrettyPrinter::print(wgs_mem_info[wg.first].total_mem_used, TUnit::BYTES), + PrettyPrinter::print(wg_weighted_mem_limit, TUnit::BYTES), + (double)wgs_mem_info[wg.first].total_mem_used / wg_weighted_mem_limit, + wg_query_count, PrettyPrinter::print(query_spill_threshold, TUnit::BYTES)); + + debug_msg += "\n Query Memory Summary:"; + // check whether queries need to revoke memory for task group + for (const auto& query_mem_tracker : wgs_mem_info[wg.first].tracker_snapshots) { debug_msg += fmt::format( - "\n MemTracker Label={}, Parent Label={}, Used={}, WeightedUsed={}, " + "\n MemTracker Label={}, Parent Label={}, Used={}, SpillThreshold={}, " "Peak={}", - query_ctx->query_mem_tracker->label(), - query_ctx->query_mem_tracker->parent_label(), - PrettyPrinter::print(query_consumption, TUnit::BYTES), - PrettyPrinter::print(query_weighted_consumption, 
TUnit::BYTES), - PrettyPrinter::print(query_ctx->query_mem_tracker->peak_consumption(), - TUnit::BYTES)); + query_mem_tracker->label(), query_mem_tracker->parent_label(), + PrettyPrinter::print(query_mem_tracker->consumption(), TUnit::BYTES), + PrettyPrinter::print(query_spill_threshold, TUnit::BYTES), + PrettyPrinter::print(query_mem_tracker->peak_consumption(), TUnit::BYTES)); } - } - if (wg_mem_info.is_high_wartermark || wg_mem_info.is_low_wartermark) { LOG_EVERY_T(INFO, 1) << debug_msg; + } else { + continue; } } } diff --git a/be/src/runtime/workload_group/workload_group_manager.h b/be/src/runtime/workload_group/workload_group_manager.h index 8aeb8f988a30df..f7f02bf63e6997 100644 --- a/be/src/runtime/workload_group/workload_group_manager.h +++ b/be/src/runtime/workload_group/workload_group_manager.h @@ -54,7 +54,7 @@ class WorkloadGroupMgr { bool enable_cpu_hard_limit() { return _enable_cpu_hard_limit.load(); } - void refresh_wg_memory_info(); + void refresh_wg_weighted_memory_limit(); private: std::shared_mutex _group_mutex; diff --git a/be/src/runtime/workload_management/io_throttle.cpp b/be/src/runtime/workload_management/io_throttle.cpp new file mode 100644 index 00000000000000..3a8256eee3746d --- /dev/null +++ b/be/src/runtime/workload_management/io_throttle.cpp @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/workload_management/io_throttle.h" + +#include "util/time.h" + +namespace doris { + +bool IOThrottle::acquire(int64_t block_timeout_ms) { + if (_io_bytes_per_second < 0) { + return true; + } + + std::unique_lock w_lock(_mutex); + int64_t current_time = GetCurrentTimeMicros(); + int64_t block_finish_time = block_timeout_ms <= 0 ? 0 : current_time + block_timeout_ms * 1000; + + while (current_time <= _next_io_time_micros) { + if (block_finish_time > 0 && current_time >= block_finish_time) { + return false; + } + wait_condition.wait_for(w_lock, + std::chrono::microseconds(_next_io_time_micros - current_time)); + current_time = GetCurrentTimeMicros(); + } + return true; +} + +bool IOThrottle::try_acquire() { + if (_io_bytes_per_second < 0) { + return true; + } + std::unique_lock w_lock(_mutex); + return GetCurrentTimeMicros() > _next_io_time_micros; +} + +void IOThrottle::update_next_io_time(int64_t io_bytes) { + if (_io_bytes_per_second <= 0 || io_bytes <= 0) { + return; + } + int64_t read_bytes_per_second = _io_bytes_per_second; + std::unique_lock w_lock(_mutex); + double io_bytes_float = static_cast(io_bytes); + double ret = (io_bytes_float / static_cast(read_bytes_per_second)) * + static_cast(MICROS_PER_SEC); + int64_t current_time = GetCurrentTimeMicros(); + + if (current_time > _next_io_time_micros) { + _next_io_time_micros = current_time; + } + _next_io_time_micros += ret < 1 ? 
static_cast(1) : static_cast(ret); +} + +void IOThrottle::set_io_bytes_per_second(int64_t io_bytes_per_second) { + _io_bytes_per_second = io_bytes_per_second; +} + +}; // namespace doris \ No newline at end of file diff --git a/be/src/runtime/workload_management/io_throttle.h b/be/src/runtime/workload_management/io_throttle.h new file mode 100644 index 00000000000000..691255d23c48c4 --- /dev/null +++ b/be/src/runtime/workload_management/io_throttle.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include +#include +#include + +namespace doris { + +class IOThrottle; + +struct IOThrottleCtx { + IOThrottle* io_throttle = nullptr; + int io_block_timeout; +}; + +class IOThrottle { +public: + IOThrottle() = default; + + ~IOThrottle() = default; + + bool acquire(int64_t block_timeout_ms); + + // non-block acquire + bool try_acquire(); + + void update_next_io_time(int64_t bytes); + + void set_io_bytes_per_second(int64_t read_bytes_per_second); + + int64_t get_io_bytes_per_second() { return _io_bytes_per_second; } + +private: + std::mutex _mutex; + std::condition_variable wait_condition; + int64_t _next_io_time_micros {0}; + std::atomic _io_bytes_per_second {10485760}; +}; +}; // namespace doris \ No newline at end of file diff --git a/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp b/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp index d2b45a0b77c6ef..a07e479d759be7 100644 --- a/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp +++ b/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp @@ -43,7 +43,7 @@ arrow::Result> ArrowFlightBatchReader::C auto schema = ExecEnv::GetInstance()->result_mgr()->find_arrow_schema(statement_->query_id); if (schema == nullptr) { ARROW_RETURN_NOT_OK(arrow::Status::Invalid(fmt::format( - "not found arrow flight schema, maybe query has been canceled, queryid: {}", + "Client not found arrow flight schema, maybe query has been canceled, queryid: {}", print_id(statement_->query_id)))); } std::shared_ptr result(new ArrowFlightBatchReader(statement_, schema)); diff --git a/be/src/service/backend_service.cpp b/be/src/service/backend_service.cpp index 1d85a2bca698dd..9b63439a63425a 100644 --- a/be/src/service/backend_service.cpp +++ b/be/src/service/backend_service.cpp @@ -923,27 +923,8 @@ void BaseBackendService::close_scanner(TScanCloseResult& result_, const TScanClo void BackendService::get_stream_load_record(TStreamLoadRecordResult& result, int64_t last_stream_record_time) { 
- auto stream_load_recorder = _engine.get_stream_load_recorder(); - if (stream_load_recorder != nullptr) { - std::map records; - auto st = stream_load_recorder->get_batch(std::to_string(last_stream_record_time), - config::stream_load_record_batch_size, &records); - if (st.ok()) { - LOG(INFO) << "get_batch stream_load_record rocksdb successfully. records size: " - << records.size() - << ", last_stream_load_timestamp: " << last_stream_record_time; - std::map stream_load_record_batch; - auto it = records.begin(); - for (; it != records.end(); ++it) { - TStreamLoadRecord stream_load_item; - StreamLoadContext::parse_stream_load_record(it->second, stream_load_item); - stream_load_record_batch.emplace(it->first.c_str(), stream_load_item); - } - result.__set_stream_load_record(stream_load_record_batch); - } - } else { - LOG(WARNING) << "stream_load_recorder is null."; - } + BaseBackendService::get_stream_load_record(result, last_stream_record_time, + _engine.get_stream_load_recorder()); } void BackendService::check_storage_format(TCheckStorageFormatResult& result) { @@ -1199,6 +1180,31 @@ void BaseBackendService::get_stream_load_record(TStreamLoadRecordResult& result, LOG(ERROR) << "get_stream_load_record is not implemented"; } +void BaseBackendService::get_stream_load_record( + TStreamLoadRecordResult& result, int64_t last_stream_record_time, + std::shared_ptr stream_load_recorder) { + if (stream_load_recorder != nullptr) { + std::map records; + auto st = stream_load_recorder->get_batch(std::to_string(last_stream_record_time), + config::stream_load_record_batch_size, &records); + if (st.ok()) { + LOG(INFO) << "get_batch stream_load_record rocksdb successfully. 
records size: " + << records.size() + << ", last_stream_load_timestamp: " << last_stream_record_time; + std::map stream_load_record_batch; + auto it = records.begin(); + for (; it != records.end(); ++it) { + TStreamLoadRecord stream_load_item; + StreamLoadContext::parse_stream_load_record(it->second, stream_load_item); + stream_load_record_batch.emplace(it->first.c_str(), stream_load_item); + } + result.__set_stream_load_record(stream_load_record_batch); + } + } else { + LOG(WARNING) << "stream_load_recorder is null."; + } +} + void BaseBackendService::get_disk_trash_used_capacity(std::vector& diskTrashInfos) { LOG(ERROR) << "get_disk_trash_used_capacity is not implemented"; } diff --git a/be/src/service/backend_service.h b/be/src/service/backend_service.h index 4a04f16e853dd6..4d01107ba8a832 100644 --- a/be/src/service/backend_service.h +++ b/be/src/service/backend_service.h @@ -26,6 +26,7 @@ #include "agent/agent_server.h" #include "agent/topic_subscriber.h" #include "common/status.h" +#include "runtime/stream_load/stream_load_recorder.h" namespace doris { @@ -162,6 +163,9 @@ class BaseBackendService : public BackendServiceIf { protected: Status start_plan_fragment_execution(const TExecPlanFragmentParams& exec_params); + void get_stream_load_record(TStreamLoadRecordResult& result, int64_t last_stream_record_time, + std::shared_ptr stream_load_recorder); + ExecEnv* _exec_env = nullptr; std::unique_ptr _agent_server; std::unique_ptr _ingest_binlog_workers; diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 92d3452dcb136d..8ca86d4c5750e4 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -67,6 +67,7 @@ #include "common/signal_handler.h" #include "common/status.h" #include "io/cache/block_file_cache_factory.h" +#include "io/fs/local_file_reader.h" #include "olap/options.h" #include "olap/storage_engine.h" #include "runtime/exec_env.h" @@ -527,6 +528,7 @@ int main(int argc, char** argv) { 
doris::ThreadLocalHandle::create_thread_local_if_not_exits(); + doris::io::BeConfDataDirReader::init_be_conf_data_dir(paths, spill_paths); // init exec env auto* exec_env(doris::ExecEnv::GetInstance()); status = doris::ExecEnv::init(doris::ExecEnv::GetInstance(), paths, spill_paths, broken_paths); diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index 0be4dbff832c94..9522f23e3bd70e 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -36,7 +36,6 @@ #include "http/action/check_tablet_segment_action.h" #include "http/action/checksum_action.h" #include "http/action/clear_cache_action.h" -#include "http/action/clear_file_cache_action.h" #include "http/action/compaction_action.h" #include "http/action/config_action.h" #include "http/action/debug_point_action.h" @@ -57,6 +56,7 @@ #include "http/action/reset_rpc_channel_action.h" #include "http/action/restore_tablet_action.h" #include "http/action/show_hotspot_action.h" +#include "http/action/show_nested_index_file_action.h" #include "http/action/shrink_mem_action.h" #include "http/action/snapshot_action.h" #include "http/action/stream_load.h" @@ -156,10 +156,10 @@ Status HttpService::start() { HealthAction* health_action = _pool.add(new HealthAction()); _ev_http_server->register_handler(HttpMethod::GET, "/api/health", health_action); - // Dump all running pipeline tasks - ClearDataCacheAction* clear_data_cache_action = _pool.add(new ClearDataCacheAction()); - _ev_http_server->register_handler(HttpMethod::GET, "/api/clear_data_cache", - clear_data_cache_action); + // Clear cache action + ClearCacheAction* clear_cache_action = _pool.add(new ClearCacheAction()); + _ev_http_server->register_handler(HttpMethod::GET, "/api/clear_cache/{type}", + clear_cache_action); // Dump all running pipeline tasks PipelineTaskAction* pipeline_task_action = _pool.add(new PipelineTaskAction()); @@ -208,9 +208,6 @@ Status HttpService::start() { _pool.add(new MetaAction(_env, 
TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::GET, "/api/meta/{op}/{tablet_id}", meta_action); - FileCacheAction* file_cache_action = _pool.add(new FileCacheAction()); - _ev_http_server->register_handler(HttpMethod::GET, "/api/file_cache", file_cache_action); - ConfigAction* update_config_action = _pool.add(new ConfigAction(ConfigActionType::UPDATE_CONFIG)); _ev_http_server->register_handler(HttpMethod::POST, "/api/update_config", update_config_action); @@ -303,9 +300,8 @@ void HttpService::register_local_handler(StorageEngine& engine) { _ev_http_server->register_handler(HttpMethod::HEAD, "/api/_binlog/_download", download_binlog_action); - ClearFileCacheAction* clear_file_cache_action = _pool.add(new ClearFileCacheAction()); - _ev_http_server->register_handler(HttpMethod::POST, "/api/clear_file_cache", - clear_file_cache_action); + FileCacheAction* file_cache_action = _pool.add(new FileCacheAction()); + _ev_http_server->register_handler(HttpMethod::POST, "/api/file_cache", file_cache_action); TabletsDistributionAction* tablets_distribution_action = _pool.add(new TabletsDistributionAction(_env, engine, TPrivilegeHier::GLOBAL, @@ -377,6 +373,11 @@ void HttpService::register_local_handler(StorageEngine& engine) { CalcFileCrcAction* calc_crc_action = _pool.add( new CalcFileCrcAction(_env, engine, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::GET, "/api/calc_crc", calc_crc_action); + + ShowNestedIndexFileAction* show_nested_index_file_action = _pool.add( + new ShowNestedIndexFileAction(_env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/show_nested_index_file", + show_nested_index_file_action); } void HttpService::register_cloud_handler(CloudStorageEngine& engine) { @@ -400,11 +401,19 @@ void HttpService::register_cloud_handler(CloudStorageEngine& engine) { _ev_http_server->register_handler(HttpMethod::GET, 
"/api/injection_point/{op}", injection_point_action); #endif - ClearFileCacheAction* clear_file_cache_action = _pool.add(new ClearFileCacheAction()); - _ev_http_server->register_handler(HttpMethod::POST, "/api/clear_file_cache", - clear_file_cache_action); + FileCacheAction* file_cache_action = _pool.add(new FileCacheAction()); + _ev_http_server->register_handler(HttpMethod::GET, "/api/file_cache", file_cache_action); auto* show_hotspot_action = _pool.add(new ShowHotspotAction(engine)); _ev_http_server->register_handler(HttpMethod::GET, "/api/hotspot/tablet", show_hotspot_action); + + CalcFileCrcAction* calc_crc_action = _pool.add( + new CalcFileCrcAction(_env, engine, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/calc_crc", calc_crc_action); + + ShowNestedIndexFileAction* show_nested_index_file_action = _pool.add( + new ShowNestedIndexFileAction(_env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/show_nested_index_file", + show_nested_index_file_action); } // NOLINTEND(readability-function-size) diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 8bf04ead03551a..1fd8c681881be3 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -288,6 +288,9 @@ void PInternalService::exec_plan_fragment(google::protobuf::RpcController* contr const PExecPlanFragmentRequest* request, PExecPlanFragmentResult* response, google::protobuf::Closure* done) { + timeval tv {}; + gettimeofday(&tv, nullptr); + response->set_received_time(tv.tv_sec * 1000LL + tv.tv_usec / 1000); bool ret = _light_work_pool.try_offer([this, controller, request, response, done]() { _exec_plan_fragment_in_pthread(controller, request, response, done); }); @@ -301,6 +304,9 @@ void PInternalService::_exec_plan_fragment_in_pthread(google::protobuf::RpcContr const PExecPlanFragmentRequest* request, 
PExecPlanFragmentResult* response, google::protobuf::Closure* done) { + timeval tv1 {}; + gettimeofday(&tv1, nullptr); + response->set_execution_time(tv1.tv_sec * 1000LL + tv1.tv_usec / 1000); brpc::ClosureGuard closure_guard(done); auto st = Status::OK(); bool compact = request->has_compact() ? request->compact() : false; @@ -318,12 +324,18 @@ void PInternalService::_exec_plan_fragment_in_pthread(google::protobuf::RpcContr LOG(WARNING) << "exec plan fragment failed, errmsg=" << st; } st.to_protobuf(response->mutable_status()); + timeval tv2 {}; + gettimeofday(&tv2, nullptr); + response->set_execution_done_time(tv2.tv_sec * 1000LL + tv2.tv_usec / 1000); } void PInternalService::exec_plan_fragment_prepare(google::protobuf::RpcController* controller, const PExecPlanFragmentRequest* request, PExecPlanFragmentResult* response, google::protobuf::Closure* done) { + timeval tv {}; + gettimeofday(&tv, nullptr); + response->set_received_time(tv.tv_sec * 1000LL + tv.tv_usec / 1000); bool ret = _light_work_pool.try_offer([this, controller, request, response, done]() { _exec_plan_fragment_in_pthread(controller, request, response, done); }); @@ -337,10 +349,19 @@ void PInternalService::exec_plan_fragment_start(google::protobuf::RpcController* const PExecPlanFragmentStartRequest* request, PExecPlanFragmentResult* result, google::protobuf::Closure* done) { + timeval tv {}; + gettimeofday(&tv, nullptr); + result->set_received_time(tv.tv_sec * 1000LL + tv.tv_usec / 1000); bool ret = _light_work_pool.try_offer([this, request, result, done]() { + timeval tv1 {}; + gettimeofday(&tv1, nullptr); + result->set_execution_time(tv1.tv_sec * 1000LL + tv1.tv_usec / 1000); brpc::ClosureGuard closure_guard(done); auto st = _exec_env->fragment_mgr()->start_query_execution(request); st.to_protobuf(result->mutable_status()); + timeval tv2 {}; + gettimeofday(&tv2, nullptr); + result->set_execution_done_time(tv2.tv_sec * 1000LL + tv2.tv_usec / 1000); }); if (!ret) { offer_failed(result, done, 
_light_work_pool); @@ -833,9 +854,9 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController ExecEnv::GetInstance()->result_mgr()->find_arrow_schema( UniqueId(request->finst_id()).to_thrift()); if (schema == nullptr) { - LOG(INFO) << "not found arrow flight schema, maybe query has been canceled"; + LOG(INFO) << "FE not found arrow flight schema, maybe query has been canceled"; auto st = Status::NotFound( - "not found arrow flight schema, maybe query has been canceled"); + "FE not found arrow flight schema, maybe query has been canceled"); st.to_protobuf(result->mutable_status()); return; } @@ -1044,11 +1065,11 @@ struct AsyncRPCContext { brpc::CallId cid; }; -void PInternalServiceImpl::fetch_remote_tablet_schema(google::protobuf::RpcController* controller, - const PFetchRemoteSchemaRequest* request, - PFetchRemoteSchemaResponse* response, - google::protobuf::Closure* done) { - bool ret = _heavy_work_pool.try_offer([this, request, response, done]() { +void PInternalService::fetch_remote_tablet_schema(google::protobuf::RpcController* controller, + const PFetchRemoteSchemaRequest* request, + PFetchRemoteSchemaResponse* response, + google::protobuf::Closure* done) { + bool ret = _heavy_work_pool.try_offer([request, response, done]() { brpc::ClosureGuard closure_guard(done); Status st = Status::OK(); if (request->is_coordinator()) { @@ -1120,13 +1141,13 @@ void PInternalServiceImpl::fetch_remote_tablet_schema(google::protobuf::RpcContr if (!target_tablets.empty()) { std::vector tablet_schemas; for (int64_t tablet_id : target_tablets) { - TabletSharedPtr tablet = _engine.tablet_manager()->get_tablet(tablet_id, false); - if (tablet == nullptr) { + auto res = ExecEnv::get_tablet(tablet_id); + if (!res.has_value()) { // just ignore LOG(WARNING) << "tablet does not exist, tablet id is " << tablet_id; continue; } - tablet_schemas.push_back(tablet->tablet_schema()); + tablet_schemas.push_back(res.value()->merged_tablet_schema()); } if 
(!tablet_schemas.empty()) { // merge all @@ -1918,11 +1939,6 @@ void PInternalServiceImpl::_response_pull_slave_rowset(const std::string& remote pull_rowset_callback->join(); if (pull_rowset_callback->cntl_->Failed()) { - if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available(stub, remote_host, - brpc_port)) { - ExecEnv::GetInstance()->brpc_internal_client_cache()->erase( - closure->cntl_->remote_side()); - } LOG(WARNING) << "failed to response result of slave replica to master replica, error=" << berror(pull_rowset_callback->cntl_->ErrorCode()) << ", error_text=" << pull_rowset_callback->cntl_->ErrorText() @@ -2035,6 +2051,10 @@ void PInternalService::group_commit_insert(google::protobuf::RpcController* cont response->set_loaded_rows(state->num_rows_load_success()); response->set_filtered_rows(state->num_rows_load_filtered()); status->to_protobuf(response->mutable_status()); + if (!state->get_error_log_file_path().empty()) { + response->set_error_url( + to_load_error_http_path(state->get_error_log_file_path())); + } _exec_env->new_load_stream_mgr()->remove(load_id); }); } catch (const Exception& e) { @@ -2085,4 +2105,25 @@ void PInternalService::get_wal_queue_size(google::protobuf::RpcController* contr } } +void PInternalService::get_be_resource(google::protobuf::RpcController* controller, + const PGetBeResourceRequest* request, + PGetBeResourceResponse* response, + google::protobuf::Closure* done) { + bool ret = _light_work_pool.try_offer([response, done]() { + brpc::ClosureGuard closure_guard(done); + int64_t mem_limit = MemInfo::mem_limit(); + int64_t mem_usage = PerfCounters::get_vm_rss(); + + PGlobalResourceUsage* global_resource_usage = response->mutable_global_be_resource_usage(); + global_resource_usage->set_mem_limit(mem_limit); + global_resource_usage->set_mem_usage(mem_usage); + + Status st = Status::OK(); + response->mutable_status()->set_status_code(st.code()); + }); + if (!ret) { + offer_failed(response, done, _light_work_pool); + } +} 
+ } // namespace doris diff --git a/be/src/service/internal_service.h b/be/src/service/internal_service.h index 9cad429107afd7..85a31136f22e8f 100644 --- a/be/src/service/internal_service.h +++ b/be/src/service/internal_service.h @@ -225,6 +225,15 @@ class PInternalService : public PBackendService { PJdbcTestConnectionResult* result, google::protobuf::Closure* done) override; + void fetch_remote_tablet_schema(google::protobuf::RpcController* controller, + const PFetchRemoteSchemaRequest* request, + PFetchRemoteSchemaResponse* response, + google::protobuf::Closure* done) override; + + void get_be_resource(google::protobuf::RpcController* controller, + const PGetBeResourceRequest* request, PGetBeResourceResponse* response, + google::protobuf::Closure* done) override; + private: void _exec_plan_fragment_in_pthread(google::protobuf::RpcController* controller, const PExecPlanFragmentRequest* request, @@ -287,11 +296,6 @@ class PInternalServiceImpl final : public PInternalService { PGetTabletVersionsResponse* response, google::protobuf::Closure* done) override; - void fetch_remote_tablet_schema(google::protobuf::RpcController* controller, - const PFetchRemoteSchemaRequest* request, - PFetchRemoteSchemaResponse* response, - google::protobuf::Closure* done) override; - private: void _response_pull_slave_rowset(const std::string& remote_host, int64_t brpc_port, int64_t txn_id, int64_t tablet_id, int64_t node_id, diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index 6934f45ef3e561..44b391f44dae34 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -104,7 +104,7 @@ class BitUtil { T value = input; for (int i = 0; i < sizeof(value); ++i) { // Applies a mask for a byte range on the input. - char value_to_save = value & 0XFF; + signed char value_to_save = value & 0XFF; buffer.push_back(value_to_save); // Remove the just processed part from the input so that we can exit early if there // is nothing left to process. 
diff --git a/be/src/util/bitmap_value.h b/be/src/util/bitmap_value.h index ac5e826e1fee66..2d15ac99611274 100644 --- a/be/src/util/bitmap_value.h +++ b/be/src/util/bitmap_value.h @@ -1252,8 +1252,7 @@ class BitmapValue { std::vector bitmaps; std::vector single_values; std::vector*> sets; - for (int i = 0; i < values.size(); ++i) { - auto* value = values[i]; + for (const auto* value : values) { switch (value->_type) { case EMPTY: break; @@ -1280,7 +1279,9 @@ class BitmapValue { _bitmap->add(_sv); break; case BITMAP: - *_bitmap |= detail::Roaring64Map::fastunion(bitmaps.size(), bitmaps.data()); + for (const auto* bitmap : bitmaps) { + *_bitmap |= *bitmap; + } break; case SET: { *_bitmap = detail::Roaring64Map::fastunion(bitmaps.size(), bitmaps.data()); @@ -1315,6 +1316,7 @@ class BitmapValue { _bitmap->add(v); } _type = BITMAP; + _set.clear(); break; case SET: { break; diff --git a/be/src/util/brpc_client_cache.h b/be/src/util/brpc_client_cache.h index 290f2cc3e04747..ebef80f4a6bdfb 100644 --- a/be/src/util/brpc_client_cache.h +++ b/be/src/util/brpc_client_cache.h @@ -126,6 +126,7 @@ class BrpcClientCache { options.connection_group = connection_group; } options.connect_timeout_ms = 2000; + options.timeout_ms = 2000; options.max_retry = 10; std::unique_ptr channel(new brpc::Channel()); diff --git a/be/src/util/faststring.h b/be/src/util/faststring.h index 8d9fa6d004f589..3ec0acbda01d79 100644 --- a/be/src/util/faststring.h +++ b/be/src/util/faststring.h @@ -35,7 +35,7 @@ namespace doris { // common use cases (in particular, resize() will fill with uninitialized data // instead of memsetting to \0) // only build() can transfer data to the outside. 
-class faststring : private Allocator { +class faststring : private Allocator { public: enum { kInitialCapacity = 32 }; diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index baddefcc27f828..d0703c985ea884 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -39,6 +39,7 @@ #include "common/config.h" #include "common/status.h" #include "gutil/strings/split.h" +#include "runtime/memory/global_memory_arbitrator.h" #include "util/cgroup_util.h" #include "util/parse_util.h" #include "util/pretty_printer.h" @@ -46,6 +47,21 @@ namespace doris { +static bvar::Adder memory_jemalloc_cache_bytes("memory_jemalloc_cache_bytes"); +static bvar::Adder memory_jemalloc_dirty_pages_bytes("memory_jemalloc_dirty_pages_bytes"); +static bvar::Adder memory_jemalloc_metadata_bytes("memory_jemalloc_metadata_bytes"); +static bvar::Adder memory_jemalloc_virtual_bytes("memory_jemalloc_virtual_bytes"); +static bvar::Adder memory_cgroup_usage_bytes("memory_cgroup_usage_bytes"); +static bvar::Adder memory_sys_available_bytes("memory_sys_available_bytes"); +static bvar::Adder memory_arbitrator_sys_available_bytes( + "memory_arbitrator_sys_available_bytes"); +static bvar::Adder memory_arbitrator_process_usage_bytes( + "memory_arbitrator_process_usage_bytes"); +static bvar::Adder memory_arbitrator_reserve_memory_bytes( + "memory_arbitrator_reserve_memory_bytes"); +static bvar::Adder memory_arbitrator_refresh_interval_growth_bytes( + "memory_arbitrator_refresh_interval_growth_bytes"); + bool MemInfo::_s_initialized = false; std::atomic MemInfo::_s_physical_mem = std::numeric_limits::max(); std::atomic MemInfo::_s_mem_limit = std::numeric_limits::max(); @@ -116,6 +132,33 @@ void MemInfo::refresh_allocator_mem() { #endif } +void MemInfo::refresh_memory_bvar() { + memory_jemalloc_cache_bytes << MemInfo::allocator_cache_mem() - + memory_jemalloc_cache_bytes.get_value(); + memory_jemalloc_dirty_pages_bytes + << MemInfo::je_dirty_pages_mem() - 
memory_jemalloc_dirty_pages_bytes.get_value(); + memory_jemalloc_metadata_bytes + << MemInfo::allocator_metadata_mem() - memory_jemalloc_metadata_bytes.get_value(); + memory_jemalloc_virtual_bytes << MemInfo::allocator_virtual_mem() - + memory_jemalloc_virtual_bytes.get_value(); + + memory_cgroup_usage_bytes << _s_cgroup_mem_usage - memory_cgroup_usage_bytes.get_value(); + memory_sys_available_bytes << _s_sys_mem_available - memory_sys_available_bytes.get_value(); + + memory_arbitrator_sys_available_bytes + << GlobalMemoryArbitrator::sys_mem_available() - + memory_arbitrator_sys_available_bytes.get_value(); + memory_arbitrator_process_usage_bytes + << GlobalMemoryArbitrator::process_memory_usage() - + memory_arbitrator_process_usage_bytes.get_value(); + memory_arbitrator_reserve_memory_bytes + << GlobalMemoryArbitrator::process_reserved_memory() - + memory_arbitrator_reserve_memory_bytes.get_value(); + memory_arbitrator_refresh_interval_growth_bytes + << GlobalMemoryArbitrator::refresh_interval_memory_growth - + memory_arbitrator_refresh_interval_growth_bytes.get_value(); +} + #ifndef __APPLE__ void MemInfo::refresh_proc_meminfo() { std::ifstream meminfo("/proc/meminfo", std::ios::in); diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h index 9335933286ec24..10d2d086801540 100644 --- a/be/src/util/mem_info.h +++ b/be/src/util/mem_info.h @@ -75,6 +75,8 @@ class MemInfo { static void refresh_proc_meminfo(); + static void refresh_memory_bvar(); + static inline int64_t sys_mem_available_low_water_mark() { return _s_sys_mem_available_low_water_mark; } diff --git a/be/src/util/mysql_row_buffer.cpp b/be/src/util/mysql_row_buffer.cpp index 3e4aa332cea44b..4823920508a940 100644 --- a/be/src/util/mysql_row_buffer.cpp +++ b/be/src/util/mysql_row_buffer.cpp @@ -44,6 +44,9 @@ namespace doris { static uint8_t NEXT_TWO_BYTE = 252; static uint8_t NEXT_THREE_BYTE = 253; static uint8_t NEXT_EIGHT_BYTE = 254; +// the EXTRA_RESERVE_BYTE wanner to make sure _pos pointer is 
always in _buf memory +// used in reserve() for allocate current buffer +static size_t EXTRA_RESERVE_BYTE = 16; // the first byte: // <= 250: length @@ -133,7 +136,7 @@ int MysqlRowBuffer::reserve(int64_t size) { return 0; } - int64_t alloc_size = std::max(need_size, _buf_size * 2); + int64_t alloc_size = std::max(need_size, _buf_size * 2) + EXTRA_RESERVE_BYTE; char* new_buf = new char[alloc_size]; size_t offset = _pos - _buf; diff --git a/be/src/util/s3_rate_limiter.cpp b/be/src/util/s3_rate_limiter.cpp deleted file mode 100644 index 44099ab1438da8..00000000000000 --- a/be/src/util/s3_rate_limiter.cpp +++ /dev/null @@ -1,129 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "util/s3_rate_limiter.h" - -#include -#include - -#include "common/status.h" -#include "util/s3_util.h" -#include "util/spinlock.h" -#include "util/time.h" -#if defined(__APPLE__) -#include -#define CURRENT_TIME std::chrono::system_clock::now() -#else -#define CURRENT_TIME std::chrono::high_resolution_clock::now() -#endif - -namespace doris { -// Just 10^6. 
-static constexpr auto MS = 1000000UL; - -std::pair S3RateLimiter::_update_remain_token( - std::chrono::system_clock::time_point now, size_t amount) { - // Values obtained under lock to be checked after release - size_t count_value; - double tokens_value; - { - std::lock_guard lock(_mutex); - if (_max_speed) { - double delta_seconds = static_cast((now - _prev_ms).count()) / MS; - _remain_tokens = std::min(_remain_tokens + _max_speed * delta_seconds - amount, - _max_burst); - } - _count += amount; - count_value = _count; - tokens_value = _remain_tokens; - _prev_ms = now; - } - return {count_value, tokens_value}; -} - -int64_t S3RateLimiter::add(size_t amount) { - // Values obtained under lock to be checked after release - auto [count_value, tokens_value] = _update_remain_token(CURRENT_TIME, amount); - - if (_limit && count_value > _limit) { - // CK would throw exception - return -1; - } - - // Wait unless there is positive amount of remain_tokens - throttling - int64_t sleep_time_ms = 0; - if (_max_speed && tokens_value < 0) { - sleep_time_ms = static_cast(-tokens_value / _max_speed * MS); - std::this_thread::sleep_for(std::chrono::microseconds(sleep_time_ms)); - } - - return sleep_time_ms; -} -std::string to_string(S3RateLimitType type) { - switch (type) { - case S3RateLimitType::GET: - return "get"; - case S3RateLimitType::PUT: - return "put"; - default: - return std::to_string(static_cast(type)); - } -} - -S3RateLimitType string_to_s3_rate_limit_type(std::string_view value) { - if (value == "get") { - return S3RateLimitType::GET; - } else if (value == "put") { - return S3RateLimitType::PUT; - } - return S3RateLimitType::UNKNOWN; -} - -Status reset_s3_rate_limiter(S3RateLimitType type, size_t max_speed, size_t max_burst, - size_t limit) { - if (type == S3RateLimitType::UNKNOWN) { - return Status::InternalError("Unknown rate limit type"); - } - return S3ClientFactory::instance().rate_limiter(type)->reset(max_speed, max_burst, limit); -} - 
-S3RateLimiterHolder::S3RateLimiterHolder(S3RateLimitType type, size_t max_speed, size_t max_burst, - size_t limit) - : rate_limiter(std::make_unique(max_speed, max_burst, limit)), - rate_limit_bvar(bvar::Adder(fmt::format("{}_rate_limit_ms", to_string(type)))) { -} - -int64_t S3RateLimiterHolder::add(size_t amount) { - int64_t sleep; - { - std::shared_lock read {rate_limiter_rw_lock}; - sleep = rate_limiter->add(amount); - } - if (sleep > 0) { - rate_limit_bvar << sleep; - } - return sleep; -} - -Status S3RateLimiterHolder::reset(size_t max_speed, size_t max_burst, size_t limit) { - { - std::unique_lock write {rate_limiter_rw_lock}; - rate_limiter = std::make_unique(max_speed, max_burst, limit); - } - return Status::OK(); -} -} // namespace doris \ No newline at end of file diff --git a/be/src/util/s3_rate_limiter.h b/be/src/util/s3_rate_limiter.h deleted file mode 100644 index 6163c86d3e7cd0..00000000000000 --- a/be/src/util/s3_rate_limiter.h +++ /dev/null @@ -1,79 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include -#include -#include - -#include "common/status.h" -#include "util/spinlock.h" -namespace doris { - -enum class S3RateLimitType : size_t { - GET = 0, - PUT, - UNKNOWN, -}; -extern std::string to_string(S3RateLimitType type); -extern S3RateLimitType string_to_s3_rate_limit_type(std::string_view value); -extern Status reset_s3_rate_limiter(S3RateLimitType type, size_t max_speed, size_t max_burst, - size_t limit); -class S3RateLimiter { -public: - static const size_t default_burst_seconds = 1; - - S3RateLimiter(size_t max_speed, size_t max_burst, size_t limit) - : _max_speed(max_speed), - _max_burst(max_burst), - _limit(limit), - _remain_tokens(max_burst) {} - - // Use `amount` remain_tokens, sleeps if required or throws exception on limit overflow. - // Returns duration of sleep in nanoseconds (to distinguish sleeping on different kinds of S3RateLimiters for metrics) - int64_t add(size_t amount); - -private: - std::pair _update_remain_token(std::chrono::system_clock::time_point now, - size_t amount); - size_t _count {0}; - const size_t _max_speed {0}; // in tokens per second. which indicates the QPS - const size_t _max_burst {0}; // in tokens. which indicates the token bucket size - const uint64_t _limit {0}; // 0 - not limited. - SpinLock _mutex; - double _remain_tokens { - 0}; // Amount of remain_tokens available in token bucket. Updated in `add` method. - std::chrono::system_clock::time_point _prev_ms; // Previous `add` call time (in nanoseconds). 
-}; - -class S3RateLimiterHolder { -public: - S3RateLimiterHolder(S3RateLimitType type, size_t max_speed, size_t max_burst, size_t limit); - - int64_t add(size_t amount); - - Status reset(size_t max_speed, size_t max_burst, size_t limit); - -private: - std::shared_mutex rate_limiter_rw_lock; - std::unique_ptr rate_limiter; - bvar::Adder rate_limit_bvar; -}; -} // namespace doris \ No newline at end of file diff --git a/be/src/util/s3_util.cpp b/be/src/util/s3_util.cpp index b358c7ff6b918b..ab291c7340c39d 100644 --- a/be/src/util/s3_util.cpp +++ b/be/src/util/s3_util.cpp @@ -29,7 +29,9 @@ #include #include +#ifdef USE_AZURE #include +#endif #include #include #include @@ -41,7 +43,9 @@ #include "common/logging.h" #include "common/status.h" #include "cpp/sync_point.h" +#ifdef USE_AZURE #include "io/fs/azure_obj_storage_client.h" +#endif #include "io/fs/obj_storage_client.h" #include "io/fs/s3_obj_storage_client.h" #include "runtime/exec_env.h" @@ -52,7 +56,8 @@ namespace doris { namespace s3_bvar { bvar::LatencyRecorder s3_get_latency("s3_get"); bvar::LatencyRecorder s3_put_latency("s3_put"); -bvar::LatencyRecorder s3_delete_latency("s3_delete"); +bvar::LatencyRecorder s3_delete_object_latency("s3_delete_object"); +bvar::LatencyRecorder s3_delete_objects_latency("s3_delete_objects"); bvar::LatencyRecorder s3_head_latency("s3_head"); bvar::LatencyRecorder s3_multi_part_upload_latency("s3_multi_part_upload"); bvar::LatencyRecorder s3_list_latency("s3_list"); @@ -63,8 +68,20 @@ bvar::LatencyRecorder s3_copy_object_latency("s3_copy_object"); namespace { -bool is_s3_conf_valid(const S3ClientConf& conf) { - return !conf.endpoint.empty() && !conf.region.empty() && !conf.ak.empty() && !conf.sk.empty(); +doris::Status is_s3_conf_valid(const S3ClientConf& conf) { + if (conf.endpoint.empty()) { + return Status::InvalidArgument("Invalid s3 conf, empty endpoint"); + } + if (conf.region.empty()) { + return Status::InvalidArgument("Invalid s3 conf, empty region"); + } + if 
(conf.ak.empty()) { + return Status::InvalidArgument("Invalid s3 conf, empty ak"); + } + if (conf.sk.empty()) { + return Status::InvalidArgument("Invalid s3 conf, empty sk"); + } + return Status::OK(); } // Return true is convert `str` to int successfully @@ -86,12 +103,35 @@ constexpr char S3_MAX_CONN_SIZE[] = "AWS_MAX_CONN_SIZE"; constexpr char S3_REQUEST_TIMEOUT_MS[] = "AWS_REQUEST_TIMEOUT_MS"; constexpr char S3_CONN_TIMEOUT_MS[] = "AWS_CONNECTION_TIMEOUT_MS"; +auto metric_func_factory(bvar::Adder& ns_bvar, bvar::Adder& req_num_bvar) { + return [&](int64_t ns) { + if (ns > 0) { + ns_bvar << ns; + } else { + req_num_bvar << 1; + } + }; +} + } // namespace + +bvar::Adder get_rate_limit_ns("get_rate_limit_ns"); +bvar::Adder get_rate_limit_exceed_req_num("get_rate_limit_exceed_req_num"); +bvar::Adder put_rate_limit_ns("put_rate_limit_ns"); +bvar::Adder put_rate_limit_exceed_req_num("put_rate_limit_exceed_req_num"); + S3RateLimiterHolder* S3ClientFactory::rate_limiter(S3RateLimitType type) { CHECK(type == S3RateLimitType::GET || type == S3RateLimitType::PUT) << to_string(type); return _rate_limiters[static_cast(type)].get(); } +int reset_s3_rate_limiter(S3RateLimitType type, size_t max_speed, size_t max_burst, size_t limit) { + if (type == S3RateLimitType::UNKNOWN) { + return -1; + } + return S3ClientFactory::instance().rate_limiter(type)->reset(max_speed, max_burst, limit); +} + class DorisAWSLogger final : public Aws::Utils::Logging::LogSystemInterface { public: DorisAWSLogger() : _log_level(Aws::Utils::Logging::LogLevel::Info) {} @@ -149,10 +189,19 @@ S3ClientFactory::S3ClientFactory() { }; Aws::InitAPI(_aws_options); _ca_cert_file_path = get_valid_ca_cert_path(); + _rate_limiters = { + std::make_unique( + S3RateLimitType::GET, config::s3_get_token_per_second, + config::s3_get_bucket_tokens, config::s3_get_token_limit, + metric_func_factory(get_rate_limit_ns, get_rate_limit_exceed_req_num)), + std::make_unique( + S3RateLimitType::PUT, 
config::s3_put_token_per_second, + config::s3_put_bucket_tokens, config::s3_put_token_limit, + metric_func_factory(put_rate_limit_ns, put_rate_limit_exceed_req_num))}; } -string S3ClientFactory::get_valid_ca_cert_path() { - vector vec_ca_file_path = doris::split(config::ca_cert_file_paths, ";"); +std::string S3ClientFactory::get_valid_ca_cert_path() { + auto vec_ca_file_path = doris::split(config::ca_cert_file_paths, ";"); auto it = vec_ca_file_path.begin(); for (; it != vec_ca_file_path.end(); ++it) { if (std::filesystem::exists(*it)) { @@ -172,7 +221,7 @@ S3ClientFactory& S3ClientFactory::instance() { } std::shared_ptr S3ClientFactory::create(const S3ClientConf& s3_conf) { - if (!is_s3_conf_valid(s3_conf)) { + if (!is_s3_conf_valid(s3_conf).ok()) { return nullptr; } @@ -199,6 +248,7 @@ std::shared_ptr S3ClientFactory::create(const S3ClientConf std::shared_ptr S3ClientFactory::_create_azure_client( const S3ClientConf& s3_conf) { +#ifdef USE_AZURE auto cred = std::make_shared(s3_conf.ak, s3_conf.sk); @@ -209,6 +259,10 @@ std::shared_ptr S3ClientFactory::_create_azure_client( auto containerClient = std::make_shared(uri, cred); LOG_INFO("create one azure client with {}", s3_conf.to_string()); return std::make_shared(std::move(containerClient)); +#else + LOG_FATAL("BE is not compiled with azure support, export BUILD_AZURE=ON before building"); + return nullptr; +#endif } std::shared_ptr S3ClientFactory::_create_s3_client( @@ -238,7 +292,7 @@ std::shared_ptr S3ClientFactory::_create_s3_client( aws_config.maxConnections = config::doris_scanner_thread_pool_thread_num; #else aws_config.maxConnections = - ExecEnv::GetInstance()->scanner_scheduler()->remote_thread_pool_max_size(); + ExecEnv::GetInstance()->scanner_scheduler()->remote_thread_pool_max_thread_num(); #endif } @@ -336,8 +390,8 @@ Status S3ClientFactory::convert_properties_to_s3_conf( s3_conf->client_conf.use_virtual_addressing = it->second != "true"; } - if (!is_s3_conf_valid(s3_conf->client_conf)) { - return 
Status::InvalidArgument("S3 properties are incorrect, please check properties."); + if (auto st = is_s3_conf_valid(s3_conf->client_conf); !st.ok()) { + return st; } return Status::OK(); } @@ -351,6 +405,7 @@ S3Conf S3Conf::get_s3_conf(const cloud::ObjectStoreInfoPB& info) { .region = info.region(), .ak = info.ak(), .sk = info.sk(), + .token {}, .bucket = info.bucket(), .provider = io::ObjStorageType::AWS, }, diff --git a/be/src/util/s3_util.h b/be/src/util/s3_util.h index 587140a7acca75..1a1a5ae39ca18a 100644 --- a/be/src/util/s3_util.h +++ b/be/src/util/s3_util.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -33,8 +32,8 @@ #include #include "common/status.h" +#include "cpp/s3_rate_limiter.h" #include "io/fs/obj_storage_client.h" -#include "util/s3_rate_limiter.h" #include "vec/common/string_ref.h" namespace Aws::S3 { @@ -51,7 +50,8 @@ namespace doris { namespace s3_bvar { extern bvar::LatencyRecorder s3_get_latency; extern bvar::LatencyRecorder s3_put_latency; -extern bvar::LatencyRecorder s3_delete_latency; +extern bvar::LatencyRecorder s3_delete_object_latency; +extern bvar::LatencyRecorder s3_delete_objects_latency; extern bvar::LatencyRecorder s3_head_latency; extern bvar::LatencyRecorder s3_multi_part_upload_latency; extern bvar::LatencyRecorder s3_list_latency; @@ -62,25 +62,6 @@ extern bvar::LatencyRecorder s3_copy_object_latency; class S3URI; -inline ::Aws::Client::AWSError<::Aws::S3::S3Errors> s3_error_factory() { - return {::Aws::S3::S3Errors::INTERNAL_FAILURE, "exceeds limit", "exceeds limit", false}; -} - -#define DO_S3_RATE_LIMIT(op, code) \ - [&]() mutable { \ - if (!config::enable_s3_rate_limiter) { \ - return (code); \ - } \ - auto sleep_duration = S3ClientFactory::instance().rate_limiter(op)->add(1); \ - if (sleep_duration < 0) { \ - using T = decltype((code)); \ - return T(s3_error_factory()); \ - } \ - return (code); \ - }() - -#define DO_S3_GET_RATE_LIMIT(code) DO_S3_RATE_LIMIT(S3RateLimitType::GET, code) - struct 
S3ClientConf { std::string endpoint; std::string region; diff --git a/be/src/util/slice.h b/be/src/util/slice.h index 80f9616f3da2bd..bae33d4ee75010 100644 --- a/be/src/util/slice.h +++ b/be/src/util/slice.h @@ -340,7 +340,7 @@ struct SliceMap { // // only receive the memory allocated by Allocator and disables mmap, // otherwise the memory may not be freed correctly, currently only be constructed by faststring. -class OwnedSlice : private Allocator { +class OwnedSlice : private Allocator { public: OwnedSlice() : _slice((uint8_t*)nullptr, 0) {} diff --git a/be/src/util/stack_util.cpp b/be/src/util/stack_util.cpp index 5dfde1bd454dc4..20daea588732f2 100644 --- a/be/src/util/stack_util.cpp +++ b/be/src/util/stack_util.cpp @@ -36,7 +36,9 @@ void DumpStackTraceToString(std::string* stacktrace); namespace doris { std::string get_stack_trace(int start_pointers_index, std::string dwarf_location_info_mode) { -#ifdef ENABLE_STACKTRACE + if (!config::enable_stacktrace) { + return "no enable stacktrace"; + } if (dwarf_location_info_mode.empty()) { dwarf_location_info_mode = config::dwarf_location_info_mode; } @@ -55,8 +57,6 @@ std::string get_stack_trace(int start_pointers_index, std::string dwarf_location } else { return "no stack"; } -#endif - return "no enable stack"; } std::string get_stack_trace_by_glog() { diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index 082f27e7318345..e9d7ff37dbc6e8 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -43,6 +43,8 @@ class AggregateFunctionBitmapCount; template class AggregateFunctionBitmapOp; struct AggregateFunctionBitmapUnionOp; +class IAggregateFunction; +using AggregateFunctionPtr = std::shared_ptr; using DataTypePtr = std::shared_ptr; using DataTypes = std::vector; @@ -178,11 +180,6 @@ class IAggregateFunction { const size_t offset, IColumn& to, const size_t num_rows) const = 0; - /** 
Returns true for aggregate functions of type -State. - * They are executed as other aggregate functions, but not finalized (return an aggregation state that can be combined with another). - */ - virtual bool is_state() const { return false; } - /** Contains a loop with calls to "add" function. You can collect arguments into array "places" * and do a single call to "add_batch" for devirtualization and inlining. */ @@ -223,6 +220,8 @@ class IAggregateFunction { virtual void set_version(const int version_) { version = version_; } + virtual AggregateFunctionPtr transmit_to_stable() { return nullptr; } + protected: DataTypes argument_types; int version {}; @@ -519,8 +518,6 @@ class IAggregateFunctionDataHelper : public IAggregateFunctionHelper { } }; -using AggregateFunctionPtr = std::shared_ptr; - class AggregateFunctionGuard { public: using AggregateData = std::remove_pointer_t; diff --git a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h index 93a5103ef593c0..5d4a3dde3550a1 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h +++ b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h @@ -14,13 +14,15 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/AggregateFunctionCount.h -// and modified by Doris #pragma once +#include +#include +#include + #include +#include #include "common/logging.h" #include "vec/aggregate_functions/aggregate_function.h" @@ -28,10 +30,45 @@ #include "vec/common/assert_cast.h" #include "vec/data_types/data_type_number.h" #include "vec/io/io_helper.h" -#include "vec/utils/count_by_enum_helpers.hpp" namespace doris::vectorized { +struct CountByEnumData { + std::unordered_map cbe; + uint64_t not_null = 0; + uint64_t null = 0; + uint64_t all = 0; +}; + +void build_json_from_vec(rapidjson::StringBuffer& buffer, + const std::vector& data_vec) { + rapidjson::Document doc; + doc.SetArray(); + rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); + + int vec_size_number = data_vec.size(); + for (int idx = 0; idx < vec_size_number; ++idx) { + rapidjson::Value obj(rapidjson::kObjectType); + + rapidjson::Value obj_cbe(rapidjson::kObjectType); + std::unordered_map unordered_map = data_vec[idx].cbe; + for (auto it : unordered_map) { + rapidjson::Value key_cbe(it.first.c_str(), allocator); + rapidjson::Value value_cbe(it.second); + obj_cbe.AddMember(key_cbe, value_cbe, allocator); + } + obj.AddMember("cbe", obj_cbe, allocator); + obj.AddMember("notnull", data_vec[idx].not_null, allocator); + obj.AddMember("null", data_vec[idx].null, allocator); + obj.AddMember("all", data_vec[idx].all, allocator); + + doc.PushBack(obj, allocator); + } + + rapidjson::Writer writer(buffer); + doc.Accept(writer); +} + struct AggregateFunctionCountByEnumData { using MapType = std::unordered_map; diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp index 5b2269a27d9a0b..f86d44b7d6828b 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp +++ 
b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp @@ -29,14 +29,25 @@ namespace doris::vectorized { +template +struct Reducer { + template + using Output = AggregateFunctionDistinctSingleNumericData; + using AggregateFunctionDistinctNormal = AggregateFunctionDistinct; +}; + +template +using AggregateFunctionDistinctNumeric = Reducer::AggregateFunctionDistinctNormal; + class AggregateFunctionCombinatorDistinct final : public IAggregateFunctionCombinator { public: String get_name() const override { return "Distinct"; } DataTypes transform_arguments(const DataTypes& arguments) const override { if (arguments.empty()) { - LOG(FATAL) - << "Incorrect number of arguments for aggregate function with Distinct suffix"; + throw doris::Exception( + ErrorCode::INTERNAL_ERROR, + "Incorrect number of arguments for aggregate function with Distinct suffix"); } return arguments; } @@ -51,22 +62,15 @@ class AggregateFunctionCombinatorDistinct final : public IAggregateFunctionCombi if (arguments.size() == 1) { AggregateFunctionPtr res( - creator_with_numeric_type::create( + creator_with_numeric_type::create( arguments, result_is_nullable, nested_function)); if (res) { return res; } - if (arguments[0]->is_value_unambiguously_represented_in_contiguous_memory_region()) { - res = creator_without_type::create>>( - arguments, result_is_nullable, nested_function); - } else { - res = creator_without_type::create>>( - arguments, result_is_nullable, nested_function); - } + res = creator_without_type::create< + AggregateFunctionDistinct>( + arguments, result_is_nullable, nested_function); return res; } return creator_without_type::create< diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.h b/be/src/vec/aggregate_functions/aggregate_function_distinct.h index c0c7a5b66dd58f..4f42e8509f2acc 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_distinct.h +++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.h @@ -28,6 +28,8 @@ #include 
#include #include +#include +#include #include #include "vec/aggregate_functions/aggregate_function.h" @@ -54,105 +56,170 @@ struct DefaultHash; namespace doris::vectorized { -template +template struct AggregateFunctionDistinctSingleNumericData { /// When creating, the hash table must be small. - using Set = HashSetWithStackMemory, 4>; - using Self = AggregateFunctionDistinctSingleNumericData; - Set set; + using Container = std::conditional_t, + HashSetWithStackMemory, 4>>; + using Self = AggregateFunctionDistinctSingleNumericData; + Container data; void add(const IColumn** columns, size_t /* columns_num */, size_t row_num, Arena*) { const auto& vec = assert_cast&>(*columns[0]).get_data(); - set.insert(vec[row_num]); + if constexpr (stable) { + data.emplace(vec[row_num], data.size()); + } else { + data.insert(vec[row_num]); + } } - void merge(const Self& rhs, Arena*) { set.merge(rhs.set); } + void merge(const Self& rhs, Arena*) { + DCHECK(!stable); + if constexpr (!stable) { + data.merge(rhs.data); + } + } - void serialize(BufferWritable& buf) const { set.write(buf); } + void serialize(BufferWritable& buf) const { + DCHECK(!stable); + if constexpr (!stable) { + data.write(buf); + } + } - void deserialize(BufferReadable& buf, Arena*) { set.read(buf); } + void deserialize(BufferReadable& buf, Arena*) { + DCHECK(!stable); + if constexpr (!stable) { + data.read(buf); + } + } MutableColumns get_arguments(const DataTypes& argument_types) const { MutableColumns argument_columns; argument_columns.emplace_back(argument_types[0]->create_column()); - for (const auto& elem : set) { - argument_columns[0]->insert(elem.get_value()); + + if constexpr (stable) { + argument_columns[0]->resize(data.size()); + auto ptr = (T*)const_cast(argument_columns[0]->get_raw_data().data); + for (auto it : data) { + ptr[it.second] = it.first; + } + } else { + for (const auto& elem : data) { + argument_columns[0]->insert(elem.get_value()); + } } return argument_columns; } }; +template struct 
AggregateFunctionDistinctGenericData { /// When creating, the hash table must be small. - using Set = HashSetWithStackMemory; + using Container = std::conditional_t, + HashSetWithStackMemory>; using Self = AggregateFunctionDistinctGenericData; - Set set; + Container data; void merge(const Self& rhs, Arena* arena) { - Set::LookupResult it; - bool inserted; - for (const auto& elem : rhs.set) { - StringRef key = elem.get_value(); - key.data = arena->insert(key.data, key.size); - set.emplace(key, it, inserted); + DCHECK(!stable); + if constexpr (!stable) { + typename Container::LookupResult it; + bool inserted; + for (const auto& elem : rhs.data) { + StringRef key = elem.get_value(); + key.data = arena->insert(key.data, key.size); + data.emplace(key, it, inserted); + } } } void serialize(BufferWritable& buf) const { - write_var_uint(set.size(), buf); - for (const auto& elem : set) { - write_string_binary(elem.get_value(), buf); + DCHECK(!stable); + if constexpr (!stable) { + write_var_uint(data.size(), buf); + for (const auto& elem : data) { + write_string_binary(elem.get_value(), buf); + } } } void deserialize(BufferReadable& buf, Arena* arena) { - UInt64 size; - read_var_uint(size, buf); - - StringRef ref; - for (size_t i = 0; i < size; ++i) { - read_string_binary(ref, buf); - set.insert(ref); + DCHECK(!stable); + if constexpr (!stable) { + UInt64 size; + read_var_uint(size, buf); + + StringRef ref; + for (size_t i = 0; i < size; ++i) { + read_string_binary(ref, buf); + data.insert(ref); + } } } }; -template -struct AggregateFunctionDistinctSingleGenericData : public AggregateFunctionDistinctGenericData { +template +struct AggregateFunctionDistinctSingleGenericData + : public AggregateFunctionDistinctGenericData { + using Base = AggregateFunctionDistinctGenericData; + using Base::data; void add(const IColumn** columns, size_t /* columns_num */, size_t row_num, Arena* arena) { - Set::LookupResult it; - bool inserted; auto key = columns[0]->get_data_at(row_num); 
key.data = arena->insert(key.data, key.size); - set.emplace(key, it, inserted); + + if constexpr (stable) { + data.emplace(key, data.size()); + } else { + typename Base::Container::LookupResult it; + bool inserted; + data.emplace(key, it, inserted); + } } MutableColumns get_arguments(const DataTypes& argument_types) const { MutableColumns argument_columns; argument_columns.emplace_back(argument_types[0]->create_column()); - for (const auto& elem : set) { - argument_columns[0]->insert_data(elem.get_value().data, elem.get_value().size); + if constexpr (stable) { + std::vector tmp(data.size()); + for (auto it : data) { + tmp[it.second] = it.first; + } + for (int i = 0; i < data.size(); i++) { + argument_columns[0]->insert_data(tmp[i].data, tmp[i].size); + } + } else { + for (const auto& elem : data) { + argument_columns[0]->insert_data(elem.get_value().data, elem.get_value().size); + } } return argument_columns; } }; -struct AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDistinctGenericData { +template +struct AggregateFunctionDistinctMultipleGenericData + : public AggregateFunctionDistinctGenericData { + using Base = AggregateFunctionDistinctGenericData; + using Base::data; void add(const IColumn** columns, size_t columns_num, size_t row_num, Arena* arena) { const char* begin = nullptr; - StringRef value(begin, 0); + StringRef key(begin, 0); for (size_t i = 0; i < columns_num; ++i) { auto cur_ref = columns[i]->serialize_value_into_arena(row_num, *arena, begin); - value.data = cur_ref.data - value.size; - value.size += cur_ref.size; + key.data = cur_ref.data - key.size; + key.size += cur_ref.size; } - Set::LookupResult it; - bool inserted; - value.data = arena->insert(value.data, value.size); - set.emplace(value, it, inserted); + if constexpr (stable) { + data.emplace(key, data.size()); + } else { + typename Base::Container::LookupResult it; + bool inserted; + data.emplace(key, it, inserted); + } } MutableColumns get_arguments(const DataTypes& 
argument_types) const { @@ -161,10 +228,23 @@ struct AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDi argument_columns[i] = argument_types[i]->create_column(); } - for (const auto& elem : set) { - const char* begin = elem.get_value().data; - for (auto& column : argument_columns) { - begin = column->deserialize_and_insert_from_arena(begin); + if constexpr (stable) { + std::vector tmp(data.size()); + for (auto it : data) { + tmp[it.second] = it.first; + } + for (int i = 0; i < data.size(); i++) { + const char* begin = tmp[i].data; + for (auto& column : argument_columns) { + begin = column->deserialize_and_insert_from_arena(begin); + } + } + } else { + for (const auto& elem : data) { + const char* begin = elem.get_value().data; + for (auto& column : argument_columns) { + begin = column->deserialize_and_insert_from_arena(begin); + } } } @@ -175,9 +255,10 @@ struct AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDi /** Adaptor for aggregate functions. * Adding -Distinct suffix to aggregate function **/ -template +template