From 21e3e008e19c198ee81cd17c30bc02d9729f6542 Mon Sep 17 00:00:00 2001
From: yagagagaga
Date: Thu, 27 Feb 2025 20:47:47 +0800
Subject: [PATCH 1/6] [fix](cold hot separation) Fix the issue of root_path not
 working in HDFS resource

---
 be/src/agent/task_worker_pool.cpp |  6 +++---
 be/src/io/fs/hdfs_file_system.cpp | 10 ++++++----
 be/src/io/fs/hdfs_file_system.h   |  5 +++--
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp
index f3302fc2f3e222..d2f7fc8aa2f200 100644
--- a/be/src/agent/task_worker_pool.cpp
+++ b/be/src/agent/task_worker_pool.cpp
@@ -1296,9 +1296,9 @@ void push_storage_policy_callback(StorageEngine& engine, const TAgentTaskRequest
                                     ? resource.hdfs_storage_param.root_path
                                     : "";
         if (existed_resource.fs == nullptr) {
-            st = io::HdfsFileSystem::create(resource.hdfs_storage_param,
-                                            std::to_string(resource.id), root_path, nullptr,
-                                            &fs);
+            st = io::HdfsFileSystem::create(
+                    resource.hdfs_storage_param, std::to_string(resource.id),
+                    resource.hdfs_storage_param.fs_name, nullptr, &fs, std::move(root_path));
         } else {
             fs = std::static_pointer_cast(existed_resource.fs);
         }
diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp
index 7ae16ba10f9dda..81dea703e83fc5 100644
--- a/be/src/io/fs/hdfs_file_system.cpp
+++ b/be/src/io/fs/hdfs_file_system.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include

 #include
 #include
@@ -123,7 +124,7 @@ Status HdfsFileHandleCache::get_file(const std::shared_ptr& fs,

 Status HdfsFileSystem::create(const THdfsParams& hdfs_params, std::string id,
                               const std::string& fs_name, RuntimeProfile* profile,
-                              std::shared_ptr* fs) {
+                              std::shared_ptr* fs, std::string root_path) {
 #ifdef USE_HADOOP_HDFS
     if (!config::enable_java_support) {
         return Status::InternalError(
                 "true.");
     }
 #endif
-    (*fs).reset(new HdfsFileSystem(hdfs_params, std::move(id), fs_name, profile));
+    (*fs).reset(new HdfsFileSystem(hdfs_params, std::move(id), fs_name, profile, std::move(root_path)));
     return (*fs)->connect();
 }

 HdfsFileSystem::HdfsFileSystem(const THdfsParams& hdfs_params, std::string id,
-                               const std::string& fs_name, RuntimeProfile* profile)
-        : RemoteFileSystem("", std::move(id), FileSystemType::HDFS),
+                               const std::string& fs_name, RuntimeProfile* profile,
+                               std::string root_path)
+        : RemoteFileSystem(std::move(root_path), std::move(id), FileSystemType::HDFS),
           _hdfs_params(hdfs_params),
           _fs_handle(nullptr),
           _profile(profile) {
diff --git a/be/src/io/fs/hdfs_file_system.h b/be/src/io/fs/hdfs_file_system.h
index 74d098004ab236..2d9c4ffc58c308 100644
--- a/be/src/io/fs/hdfs_file_system.h
+++ b/be/src/io/fs/hdfs_file_system.h
@@ -97,7 +97,8 @@ class HdfsFileHandleCache;
 class HdfsFileSystem final : public RemoteFileSystem {
 public:
     static Status create(const THdfsParams& hdfs_params, std::string id, const std::string& path,
-                         RuntimeProfile* profile, std::shared_ptr* fs);
+                         RuntimeProfile* profile, std::shared_ptr* fs,
+                         std::string root_path = "");

     ~HdfsFileSystem() override;

@@ -130,7 +131,7 @@ class HdfsFileSystem final : public RemoteFileSystem {
 private:
     friend class HdfsFileWriter;
     HdfsFileSystem(const THdfsParams& hdfs_params, std::string id, const std::string& path,
-                   RuntimeProfile* profile);
+                   RuntimeProfile* profile, std::string root_path);

     const THdfsParams& _hdfs_params;
     std::string _fs_name;
     std::shared_ptr _fs_handle = nullptr;
From 3c90db1ef97eacbf8ab846d77476dbad65cc81a6 Mon Sep 17 00:00:00 2001
From: yagagagaga
Date: Thu, 27 Feb 2025 20:51:55 +0800
Subject: [PATCH 2/6] remove unused #include

---
 be/src/io/fs/hdfs_file_system.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp
index 81dea703e83fc5..f8469e4636bd13 100644
--- a/be/src/io/fs/hdfs_file_system.cpp
+++ b/be/src/io/fs/hdfs_file_system.cpp
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include

 #include
 #include

From bdc5793c4c256141ac6b71774e86eb77233a64a1 Mon Sep 17 00:00:00 2001
From: yagagagaga
Date: Thu, 27 Feb 2025 20:57:08 +0800
Subject: [PATCH 3/6] clang format

---
 be/src/io/fs/hdfs_file_system.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp
index f8469e4636bd13..4eb768f2a0dee2 100644
--- a/be/src/io/fs/hdfs_file_system.cpp
+++ b/be/src/io/fs/hdfs_file_system.cpp
@@ -131,7 +131,8 @@ Status HdfsFileSystem::create(const THdfsParams& hdfs_params, std::string id,
                 "true.");
     }
 #endif
-    (*fs).reset(new HdfsFileSystem(hdfs_params, std::move(id), fs_name, profile, std::move(root_path)));
+    (*fs).reset(
+            new HdfsFileSystem(hdfs_params, std::move(id), fs_name, profile, std::move(root_path)));
     return (*fs)->connect();
 }

From 1a08b9dd8f31cf306d24e0101c77fc2167e97b90 Mon Sep 17 00:00:00 2001
From: yagagagaga
Date: Thu, 27 Feb 2025 23:05:01 +0800
Subject: [PATCH 4/6] add a config

---
 be/src/common/config.cpp                 | 2 ++
 be/src/common/config.h                   | 5 +++++
 be/src/io/fs/hdfs_file_system.cpp        | 3 ++-
 regression-test/pipeline/p0/conf/be.conf | 2 ++
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 1f81e646045b9f..eca1df34b3cb66 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1373,6 +1373,8 @@ DEFINE_String(test_s3_prefix, "prefix");
 #endif
 // clang-format on

+DEFINE_mBool(enable_root_path_of_hdfs_resource, "false");
+
 std::map* Register::_s_field_map = nullptr;
 std::map>* RegisterConfValidator::_s_field_validator = nullptr;
 std::map* full_conf_map = nullptr;
diff --git a/be/src/common/config.h b/be/src/common/config.h
index afad2cac7e73e1..23aa2a9a8d9ad7 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1432,6 +1432,11 @@ DECLARE_mInt32(compaction_num_per_round);
 // Enable sleep 5s between delete cumulative compaction.
 DECLARE_mBool(enable_sleep_between_delete_cumu_compaction);

+// Because the root_path for the HDFS resource was previously passed an empty string,
+// which was incorrect, this configuration has been added to ensure compatibility
+// and guarantee that the root_path works as expected.
+DECLARE_mBool(enable_root_path_of_hdfs_resource);
+
 #ifdef BE_TEST
 // test s3
 DECLARE_String(test_s3_resource);
diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp
index 4eb768f2a0dee2..f90d406196ff2d 100644
--- a/be/src/io/fs/hdfs_file_system.cpp
+++ b/be/src/io/fs/hdfs_file_system.cpp
@@ -139,7 +139,8 @@ Status HdfsFileSystem::create(const THdfsParams& hdfs_params, std::string id,

 HdfsFileSystem::HdfsFileSystem(const THdfsParams& hdfs_params, std::string id,
                                const std::string& fs_name, RuntimeProfile* profile,
                                std::string root_path)
-        : RemoteFileSystem(std::move(root_path), std::move(id), FileSystemType::HDFS),
+        : RemoteFileSystem(config::enable_root_path_of_hdfs_resource ? std::move(root_path) : "",
+                           std::move(id), FileSystemType::HDFS),
           _hdfs_params(hdfs_params),
           _fs_handle(nullptr),
           _profile(profile) {
diff --git a/regression-test/pipeline/p0/conf/be.conf b/regression-test/pipeline/p0/conf/be.conf
index 486e01c3bd2ef1..1f3ca021a724f2 100644
--- a/regression-test/pipeline/p0/conf/be.conf
+++ b/regression-test/pipeline/p0/conf/be.conf
@@ -94,3 +94,5 @@ enable_brpc_connection_check=true

 # This feature has bug, so by default is false, only open it in pipeline to observe
 enable_parquet_page_index=true
+
+enable_root_path_of_hdfs_resource=true

From 7a52b802aa9b3d917fe0750d21c029f58d55be82 Mon Sep 17 00:00:00 2001
From: yagagagaga
Date: Mon, 24 Mar 2025 14:58:45 +0800
Subject: [PATCH 5/6] make the configuration parameter
 enable_root_path_of_hdfs_resource non-dynamically modifiable

---
 be/src/common/config.cpp | 2 +-
 be/src/common/config.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index d7b75efaf599d5..07f40f582efe8c 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1381,7 +1381,7 @@ DEFINE_String(test_s3_prefix, "prefix");
 #endif
 // clang-format on

-DEFINE_mBool(enable_root_path_of_hdfs_resource, "false");
+DEFINE_Bool(enable_root_path_of_hdfs_resource, "false");

 std::map* Register::_s_field_map = nullptr;
 std::map>* RegisterConfValidator::_s_field_validator = nullptr;
 std::map* full_conf_map = nullptr;
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 284ae1bf963479..282b2e4db8a93f 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1439,7 +1439,7 @@ DECLARE_mBool(enable_sleep_between_delete_cumu_compaction);
 // Because the root_path for the HDFS resource was previously passed an empty string,
 // which was incorrect, this configuration has been added to ensure compatibility
 // and guarantee that the root_path works as expected.
-DECLARE_mBool(enable_root_path_of_hdfs_resource);
+DECLARE_Bool(enable_root_path_of_hdfs_resource);

 // whether to prune rows with delete sign = 1 in base compaction
 // ATTN: this config is only for test

From 8a8cef0e854a891ea2f9e3fcdeeaaf78c4cb076a Mon Sep 17 00:00:00 2001
From: yagagagaga
Date: Thu, 27 Feb 2025 20:47:47 +0800
Subject: [PATCH 6/6] pick #48452

---
 .../doris/regression/suite/Suite.groovy       |  12 ++
 .../cold_data_compaction_by_hdfs.groovy       | 129 ++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 regression-test/suites/cold_heat_separation/cold_data_compaction_by_hdfs.groovy

diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
index ded07e898ad778..eca9a19db31349 100644
--- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
+++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
@@ -49,6 +49,7 @@ import org.apache.doris.regression.util.JdbcUtils
 import org.apache.doris.regression.util.Hdfs
 import org.apache.doris.regression.util.SuiteUtils
 import org.apache.doris.regression.util.DebugPoint
+import org.apache.hadoop.fs.FileSystem
 import org.junit.jupiter.api.Assertions
 import org.slf4j.Logger

@@ -92,6 +93,7 @@ class Suite implements GroovyInterceptable {
     final List lazyCheckFutures = new Vector<>()

     private AmazonS3 s3Client = null
+    private FileSystem fs = null

     Suite(String name, String group, SuiteContext context, SuiteCluster cluster) {
         this.name = name
@@ -843,6 +845,16 @@ class Suite implements GroovyInterceptable {
         return enableHdfs.equals("true");
     }

+    synchronized FileSystem getHdfs() {
+        if (fs == null) {
+            String hdfsFs = context.config.otherConfigs.get("hdfsFs")
+            String hdfsUser = context.config.otherConfigs.get("hdfsUser")
+            Hdfs hdfs = new Hdfs(hdfsFs, hdfsUser, context.config.dataPath + "/")
+            fs = hdfs.fs
+        }
+        return fs
+    }
+
     String uploadToHdfs(String localFile) {
         // as group can be rewrite the origin data file not relate to group
         String dataDir = context.config.dataPath + "/"
diff --git a/regression-test/suites/cold_heat_separation/cold_data_compaction_by_hdfs.groovy b/regression-test/suites/cold_heat_separation/cold_data_compaction_by_hdfs.groovy
new file mode 100644
index 00000000000000..e2fba24ec235a0
--- /dev/null
+++ b/regression-test/suites/cold_heat_separation/cold_data_compaction_by_hdfs.groovy
@@ -0,0 +1,129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.hadoop.fs.Path
+
+import java.util.function.Supplier
+
+suite("test_cold_data_compaction_by_hdfs") {
+
+    if (!enableHdfs()) {
+        logger.info("Skip this case, because HDFS is not available")
+        return
+    }
+
+    def retryUntilTimeout = { int timeoutSecond, Supplier closure ->
+        long start = System.currentTimeMillis()
+        while (true) {
+            if (closure.get()) {
+                return
+            } else {
+                if (System.currentTimeMillis() - start > timeoutSecond * 1000) {
+                    throw new RuntimeException("" +
+                            "Operation timeout, maybe you need to check " +
+                            "remove_unused_remote_files_interval_sec and " +
+                            "cold_data_compaction_interval_sec in be.conf")
+                } else {
+                    sleep(10_000)
+                }
+            }
+        }
+    }
+
+    String suffix = UUID.randomUUID().hashCode().abs().toString()
+    String prefix = "${getHdfsDataDir()}/regression/cold_data_compaction"
+    multi_sql """
+        DROP TABLE IF EXISTS t_recycle_in_hdfs;
+        DROP STORAGE POLICY IF EXISTS test_policy_${suffix};
+        DROP RESOURCE IF EXISTS 'remote_hdfs_${suffix}';
+        CREATE RESOURCE "remote_hdfs_${suffix}"
+        PROPERTIES
+        (
+            "type"="hdfs",
+            "fs.defaultFS"="${getHdfsFs()}",
+            "hadoop.username"="${getHdfsUser()}",
+            "hadoop.password"="${getHdfsPasswd()}",
+            "root_path"="${prefix}"
+        );
+        CREATE STORAGE POLICY test_policy_${suffix}
+        PROPERTIES(
+            "storage_resource" = "remote_hdfs_${suffix}",
+            "cooldown_ttl" = "5"
+        );
+        CREATE TABLE IF NOT EXISTS t_recycle_in_hdfs
+        (
+            k1 BIGINT,
+            k2 LARGEINT,
+            v1 VARCHAR(2048)
+        )
+        DISTRIBUTED BY HASH (k1) BUCKETS 1
+        PROPERTIES(
+            "storage_policy" = "test_policy_${suffix}",
+            "disable_auto_compaction" = "true",
+            "replication_num" = "1"
+        );
+    """
+
+    // insert 5 RowSets
+    multi_sql """
+        insert into t_recycle_in_hdfs values(1, 1, 'Tom');
+        insert into t_recycle_in_hdfs values(2, 2, 'Jelly');
+        insert into t_recycle_in_hdfs values(3, 3, 'Spike');
+        insert into t_recycle_in_hdfs values(4, 4, 'Tyke');
+        insert into t_recycle_in_hdfs values(5, 5, 'Tuffy');
+    """
+
+    // wait until files upload to S3
+    retryUntilTimeout(1800, {
+        def res = sql_return_maparray "show data from t_recycle_in_hdfs"
+        String size = ""
+        String remoteSize = ""
+        for (final def line in res) {
+            if ("t_recycle_in_hdfs".equals(line.TableName)) {
+                size = line.Size
+                remoteSize = line.RemoteSize
+                break
+            }
+        }
+        logger.info("waiting for data to be uploaded to HDFS: t_recycle_in_hdfs's local data size: ${size}, remote data size: ${remoteSize}")
+        return size.startsWith("0") && !remoteSize.startsWith("0")
+    })
+
+    String tabletId = sql_return_maparray("show tablets from t_recycle_in_hdfs")[0].TabletId
+    // check number of remote files
+    def filesBeforeCompaction = getHdfs().listStatus(new Path(prefix + "/data/${tabletId}"))
+
+    // 5 RowSets + 1 meta
+    assertEquals(6, filesBeforeCompaction.size())
+
+    // trigger cold data compaction
+    sql """alter table t_recycle_in_hdfs set ("disable_auto_compaction" = "false")"""
+
+    // wait until compaction finish
+    retryUntilTimeout(1800, {
+        def filesAfterCompaction = getHdfs().listStatus(new Path(prefix + "/data/${tabletId}"))
+        logger.info("t_recycle_in_hdfs's remote file number is ${filesAfterCompaction.size()}")
+        // 1 RowSet + 1 meta
+        return filesAfterCompaction.size() == 2
+    })
+
+    sql "drop table t_recycle_in_hdfs force"
+    retryUntilTimeout(1800, {
+        def pathExists = getHdfs().exists(new Path(prefix + "/data/${tabletId}"))
+        logger.info("after drop t_recycle_in_hdfs, the remote file path ${pathExists ? "exists" : "not exists"}")
+        return !pathExists
+    })
+}
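
Usage sketch (illustrative note, not part of the patches): with enable_root_path_of_hdfs_resource=true set in be.conf before the BE starts (the flag is non-dynamic after PATCH 5/6), an HDFS resource created with a "root_path" property stores cooled data under that prefix instead of under the filesystem root. The statements below mirror the regression test above; the resource name, policy name, fs.defaultFS address, username and root_path value are placeholder examples, not values taken from the patches.

    -- assumes enable_root_path_of_hdfs_resource=true in be.conf; all names/values below are examples
    CREATE RESOURCE "remote_hdfs_example"
    PROPERTIES
    (
        "type"="hdfs",
        "fs.defaultFS"="hdfs://nameservice1:8020",
        "hadoop.username"="hadoop",
        "root_path"="/user/doris/cold_data"
    );
    CREATE STORAGE POLICY example_policy
    PROPERTIES(
        "storage_resource" = "remote_hdfs_example",
        "cooldown_ttl" = "3600"
    );

A table then opts in through its storage_policy table property (e.g. "storage_policy" = "example_policy"), exactly as t_recycle_in_hdfs does in the test above.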