From 33bfbf06c616e27c60eab708f43da4068b5e818b Mon Sep 17 00:00:00 2001 From: meiyi Date: Thu, 13 Jun 2024 19:38:14 +0800 Subject: [PATCH 1/2] [fix](group commit) make group commit cancel in time (#36249) ## Proposed changes If the group commit time interval is larger than the load timeout, and there is no new client load to reuse the internal group commit load, the group commit cannot be cancelled in time because it is stuck in a wait: ``` #0 0x00007f33937a47aa in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00005651105dbd05 in __gthread_cond_timedwait(pthread_cond_t*, pthread_mutex_t*, timespec const*) () #2 0x000056511063f385 in std::__condvar::wait_until(std::mutex&, timespec&) () #3 0x000056511063dc2e in std::cv_status std::condition_variable::__wait_until_impl > >(std::unique_lock&, std::chrono::time_point > > const&) () #4 0x000056511063cedf in std::cv_status std::condition_variable::wait_until > >(std::unique_lock&, std::chrono::time_point > > const&) () #5 0x0000565110824f48 in std::cv_status std::condition_variable::wait_for >(std::unique_lock&, std::chrono::duration > const&) () #6 0x0000565113b5612a in doris::LoadBlockQueue::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*, bool*) () #7 0x000056513f900941 in doris::pipeline::GroupCommitOperatorX::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) () #8 0x000056513c69c0b6 in doris::pipeline::ScanOperatorX::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) () #9 0x000056514009d5f1 in doris::pipeline::PipelineTask::execute(bool*) () #10 0x00005651400fb24a in doris::pipeline::TaskScheduler::_do_work(unsigned long) () ``` --- be/src/runtime/group_commit_mgr.cpp | 2 +- .../test_group_commit_timeout.groovy | 55 +++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 regression-test/suites/insert_p0/test_group_commit_timeout.groovy diff --git a/be/src/runtime/group_commit_mgr.cpp 
b/be/src/runtime/group_commit_mgr.cpp index d5daf2af5308a8..16f7c3a4d53c30 100644 --- a/be/src/runtime/group_commit_mgr.cpp +++ b/be/src/runtime/group_commit_mgr.cpp @@ -142,7 +142,7 @@ Status LoadBlockQueue::get_block(RuntimeState* runtime_state, vectorized::Block* << ", runtime_state=" << runtime_state; } } - _get_cond.wait_for(l, std::chrono::milliseconds(left_milliseconds)); + _get_cond.wait_for(l, std::chrono::milliseconds(std::min(left_milliseconds, 10000L))); } if (runtime_state->is_cancelled()) { auto st = Status::Cancelled(runtime_state->cancel_reason()); diff --git a/regression-test/suites/insert_p0/test_group_commit_timeout.groovy b/regression-test/suites/insert_p0/test_group_commit_timeout.groovy new file mode 100644 index 00000000000000..7866a33df0ef86 --- /dev/null +++ b/regression-test/suites/insert_p0/test_group_commit_timeout.groovy @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_group_commit_timeout", "nonConcurrent") { + def tableName = "test_group_commit_timeout" + sql """ + CREATE TABLE if not exists ${tableName} ( + `id` int(11) NOT NULL, + `name` varchar(100) NULL, + `score` int(11) NULL default "-1" + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "group_commit_interval_ms" = "300000" + ); + """ + + def query_timeout = sql """show variables where variable_name = 'query_timeout';""" + def insert_timeout = sql """show variables where variable_name = 'insert_timeout';""" + logger.info("query_timeout: ${query_timeout}, insert_timeout: ${insert_timeout}") + + long start = System.currentTimeMillis() + try { + sql "SET global query_timeout = 5" + sql "SET global insert_timeout = 5" + + sql "set group_commit = sync_mode" + sql "insert into ${tableName} values(1, 'a', 10)" + assertTrue(false) + } catch (Exception e) { + long end = System.currentTimeMillis() + logger.info("failed " + e.getMessage()) + assertTrue(e.getMessage().contains("FragmentMgr cancel worker going to cancel timeout instance")) + assertTrue(end - start <= 60000) + } finally { + sql "SET global query_timeout = ${query_timeout[0][1]}" + sql "SET global insert_timeout = ${insert_timeout[0][1]}" + } +} From 7fd22c5faba6230c788f5b80f5637a7fc3324149 Mon Sep 17 00:00:00 2001 From: meiyi Date: Mon, 8 Jul 2024 10:45:20 +0800 Subject: [PATCH 2/2] f --- .../suites/insert_p0/test_group_commit_timeout.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-test/suites/insert_p0/test_group_commit_timeout.groovy b/regression-test/suites/insert_p0/test_group_commit_timeout.groovy index 7866a33df0ef86..bd0bcfcdeb3538 100644 --- a/regression-test/suites/insert_p0/test_group_commit_timeout.groovy +++ b/regression-test/suites/insert_p0/test_group_commit_timeout.groovy @@ -46,7 +46,7 @@ suite("test_group_commit_timeout", "nonConcurrent") { } catch (Exception e) { long end = 
System.currentTimeMillis() logger.info("failed " + e.getMessage()) - assertTrue(e.getMessage().contains("FragmentMgr cancel worker going to cancel timeout instance")) + assertTrue(e.getMessage().contains("FragmentMgr cancel worker going to cancel timeout instance") || e.getMessage().contains("CANCELLED")) assertTrue(end - start <= 60000) } finally { sql "SET global query_timeout = ${query_timeout[0][1]}"