From 70a262a775f2ba5ea7eabf4a199f66ef66f607b3 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 13 Sep 2023 16:02:57 +0800 Subject: [PATCH 01/51] Add optimization uniq to count --- src/Analyzer/Passes/UniqToCountPass.cpp | 195 ++++++++++++++ src/Analyzer/Passes/UniqToCountPass.h | 30 +++ src/Analyzer/QueryTreePassManager.cpp | 2 + src/Core/Settings.h | 1 + src/Interpreters/InterpreterSelectQuery.cpp | 7 + .../RewriteUniqToCountVisitor.cpp | 163 +++++++++++ src/Interpreters/RewriteUniqToCountVisitor.h | 30 +++ tests/performance/uniq_to_count.xml | 8 + ...8_distinct_to_count_optimization.reference | 252 ++++++++++++++++++ .../02868_distinct_to_count_optimization.sql | 68 +++++ 10 files changed, 756 insertions(+) create mode 100644 src/Analyzer/Passes/UniqToCountPass.cpp create mode 100644 src/Analyzer/Passes/UniqToCountPass.h create mode 100644 src/Interpreters/RewriteUniqToCountVisitor.cpp create mode 100644 src/Interpreters/RewriteUniqToCountVisitor.h create mode 100644 tests/performance/uniq_to_count.xml create mode 100644 tests/queries/0_stateless/02868_distinct_to_count_optimization.reference create mode 100644 tests/queries/0_stateless/02868_distinct_to_count_optimization.sql diff --git a/src/Analyzer/Passes/UniqToCountPass.cpp b/src/Analyzer/Passes/UniqToCountPass.cpp new file mode 100644 index 000000000000..4373918a8cca --- /dev/null +++ b/src/Analyzer/Passes/UniqToCountPass.cpp @@ -0,0 +1,195 @@ +#include "UniqToCountPass.h" + +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace +{ + +bool matchFnUniq(String func_name) +{ + auto name = Poco::toLower(func_name); + return name == "uniq" || name == "uniqHLL12" || name == "uniqExact" || name == "uniqTheta" || name == "uniqCombined" + || name == "uniqCombined64"; +} + +/// Extract the corresponding projection columns for group by node list. +/// For example: +/// SELECT a as aa, any(b) FROM table group by a; -> aa(ColumnNode) +NamesAndTypes extractProjectionColumnsForGroupBy(const QueryNode * query_node) +{ + if (!query_node->hasGroupBy()) + return {}; + + NamesAndTypes result; + for (const auto & group_by_ele : query_node->getGroupByNode()->getChildren()) + { + const auto & projection_columns = query_node->getProjectionColumns(); + const auto & projection_nodes = query_node->getProjection().getNodes(); + + assert(projection_columns.size() == projection_nodes.size()); + + for (size_t i = 0; i < projection_columns.size(); i++) + { + if (projection_nodes[i]->isEqual(*group_by_ele)) + result.push_back(projection_columns[i]); + } + } + return result; +} + +/// Whether query_columns equals subquery_columns. +/// query_columns: query columns from query +/// subquery_columns: projection columns from subquery +bool nodeListEquals(const QueryTreeNodes & query_columns, const NamesAndTypes & subquery_columns) +{ + if (query_columns.size() != subquery_columns.size()) + return false; + + for (const auto & query_column : query_columns) + { + auto find = std::find_if( + subquery_columns.begin(), + subquery_columns.end(), + [&](const auto & subquery_column) -> bool + { + if (auto * column_node = query_column->as()) + { + return subquery_column == column_node->getColumn(); + } + return false; + }); + + if (find == subquery_columns.end()) + return false; + } + return true; +} + +/// Whether subquery_columns contains all columns in subquery_columns. +/// query_columns: query columns from query +/// subquery_columns: projection columns from subquery +bool nodeListContainsAll(const QueryTreeNodes & query_columns, const NamesAndTypes & subquery_columns) +{ + if (query_columns.size() > subquery_columns.size()) + return false; + + for (const auto & query_column : query_columns) + { + auto find = std::find_if( + subquery_columns.begin(), + subquery_columns.end(), + [&](const auto & subquery_column) -> bool + { + if (auto * column_node = query_column->as()) + { + return subquery_column == column_node->getColumn(); + } + return false; + }); + + if (find == subquery_columns.end()) + return false; + } + return true; +} + +} + +class UniqToCountVisitor : public InDepthQueryTreeVisitorWithContext +{ +public: + using Base = InDepthQueryTreeVisitorWithContext; + using Base::Base; + + void enterImpl(QueryTreeNodePtr & node) + { + if (!getSettings().optimize_uniq_to_count) + return; + + auto * query_node = node->as(); + if (!query_node) + return; + + /// Check that query has only single table expression which is subquery + auto * subquery_node = query_node->getJoinTree()->as(); + if (!subquery_node) + return; + + /// Check that query has only single node in projection + auto & projection_nodes = query_node->getProjection().getNodes(); + if (projection_nodes.size() != 1) + return; + + /// Check that projection_node is a function + auto & projection_node = projection_nodes[0]; + auto * function_node = projection_node->as(); + if (!function_node) + return; + + /// Check that query single projection node is `uniq` or its variants + if (!matchFnUniq(function_node->getFunctionName())) + return; + + auto & uniq_arguments_nodes = function_node->getArguments().getNodes(); + + /// Whether query matches 'SELECT uniq(x ...) FROM (SELECT DISTINCT x ...)' + auto match_subquery_with_distinct = [&]() -> bool + { + if (!subquery_node->isDistinct()) + return false; + + /// uniq expression list == subquery projection columns + if (!nodeListEquals(uniq_arguments_nodes, subquery_node->getProjectionColumns())) + return false; + + return true; + }; + + /// Whether query matches 'SELECT uniq(x ...) FROM (SELECT x ... GROUP BY x ...)' + auto match_subquery_with_group_by = [&]() -> bool + { + if (!subquery_node->hasGroupBy()) + return false; + + /// uniq argument node list == subquery group by node list + auto group_by_columns = extractProjectionColumnsForGroupBy(subquery_node); + + if (!nodeListEquals(uniq_arguments_nodes, group_by_columns)) + return false; + + /// subquery projection columns must contain all columns in uniq argument node list + if (!nodeListContainsAll(uniq_arguments_nodes, subquery_node->getProjectionColumns())) + return false; + + return true; + }; + + /// Replace uniq of initial query to count + if (match_subquery_with_distinct() || match_subquery_with_group_by()) + { + AggregateFunctionProperties properties; + auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties); + + function_node->getArguments().getNodes().clear(); + function_node->resolveAsAggregateFunction(std::move(aggregate_function)); + } + } +}; + + +void UniqToCountPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +{ + UniqToCountVisitor visitor(context); + visitor.visit(query_tree_node); +} + +} diff --git a/src/Analyzer/Passes/UniqToCountPass.h b/src/Analyzer/Passes/UniqToCountPass.h new file mode 100644 index 000000000000..4992d524e5e7 --- /dev/null +++ b/src/Analyzer/Passes/UniqToCountPass.h @@ -0,0 +1,30 @@ +#pragma once + +#include + +namespace DB +{ + +/** Optimize `uniq` and its variants(except uniqUpTo) into `count` over subquery. + * Example: 'SELECT uniq(x ...) FROM (SELECT DISTINCT x ...)' to + * Result: 'SELECT count() FROM (SELECT DISTINCT x ...)' + * + * Example: 'SELECT uniq(x ...) FROM (SELECT x ... GROUP BY x ...)' to + * Result: 'SELECT count() FROM (SELECT x ... GROUP BY x ...)' + * + * Note that we can rewrite all uniq variants except uniqUpTo. + */ +class UniqToCountPass final : public IQueryTreePass +{ +public: + String getName() override { return "UniqToCount"; } + + String getDescription() override + { + return "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause."; + } + + void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; +}; + +} diff --git a/src/Analyzer/QueryTreePassManager.cpp b/src/Analyzer/QueryTreePassManager.cpp index cd3abd9593e7..2e4a32bddf6f 100644 --- a/src/Analyzer/QueryTreePassManager.cpp +++ b/src/Analyzer/QueryTreePassManager.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -247,6 +248,7 @@ void addQueryTreePasses(QueryTreePassManager & manager) manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); + manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 06d294054e03..c5633856ade0 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -778,6 +778,7 @@ class IColumn; M(Bool, function_json_value_return_type_allow_nullable, false, "Allow function JSON_VALUE to return nullable type.", 0) \ M(Bool, function_json_value_return_type_allow_complex, false, "Allow function JSON_VALUE to return complex type, such as: struct, array, map.", 0) \ M(Bool, use_with_fill_by_sorting_prefix, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently", 0) \ + M(Bool, optimize_uniq_to_count, true, "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause.", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \ diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index de2d34162a8c..cc0f2bf72838 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -421,6 +422,12 @@ InterpreterSelectQuery::InterpreterSelectQuery( RewriteCountDistinctFunctionVisitor(data_rewrite_countdistinct).visit(query_ptr); } + if (settings.optimize_uniq_to_count) + { + RewriteUniqToCountMatcher::Data data_rewrite_uniq_count; + RewriteUniqToCountVisitor(data_rewrite_uniq_count).visit(query_ptr); + } + JoinedTables joined_tables(getSubqueryContext(context), getSelectQuery(), options.with_all_cols, options_.is_create_parameterized_view); bool got_storage_from_query = false; diff --git a/src/Interpreters/RewriteUniqToCountVisitor.cpp b/src/Interpreters/RewriteUniqToCountVisitor.cpp new file mode 100644 index 000000000000..7445068207ab --- /dev/null +++ b/src/Interpreters/RewriteUniqToCountVisitor.cpp @@ -0,0 +1,163 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +using Aliases = std::unordered_map; + +namespace +{ + +bool matchFnUniq(String func_name) +{ + auto name = Poco::toLower(func_name); + return name == "uniq" || name == "uniqHLL12" || name == "uniqExact" || name == "uniqTheta" || name == "uniqCombined" + || name == "uniqCombined64"; +} + +bool expressionEquals(const ASTPtr & lhs, const ASTPtr & rhs, const Aliases & alias) +{ + if (lhs->getTreeHash() == rhs->getTreeHash()) + { + return true; + } + else + { + auto * lhs_idf = lhs->as(); + auto * rhs_idf = rhs->as(); + if (lhs_idf && rhs_idf) + { + /// compound identifiers, such as: + if (lhs_idf->shortName() == rhs_idf->shortName()) + return true; + + /// translate alias + if (alias.find(lhs_idf->shortName()) != alias.end()) + lhs_idf = alias.find(lhs_idf->shortName())->second->as(); + + if (alias.find(rhs_idf->shortName()) != alias.end()) + rhs_idf = alias.find(rhs_idf->shortName())->second->as(); + + if (lhs_idf->shortName() == rhs_idf->shortName()) + return true; + } + } + return false; +} + +bool expressionListEquals(ASTExpressionList * lhs, ASTExpressionList * rhs, const Aliases & alias) +{ + if (!lhs || !rhs) + return false; + if (lhs->children.size() != rhs->children.size()) + return false; + for (size_t i = 0; i < lhs->children.size(); i++) + { + if (!expressionEquals(lhs->children[i], rhs->children[i], alias)) + return false; + } + return true; +} + +/// Test whether lhs contains all expressions in rhs. +bool expressionListContainsAll(ASTExpressionList * lhs, ASTExpressionList * rhs, const Aliases & alias) +{ + if (!lhs || !rhs) + return false; + if (lhs->children.size() < rhs->children.size()) + return false; + for (const auto & re : rhs->children) + { + auto predicate = [&re, &alias](ASTPtr & le) { return expressionEquals(le, re, alias); }; + if (std::find_if(lhs->children.begin(), lhs->children.end(), predicate) == lhs->children.end()) + return false; + } + return true; +} + +} + +void RewriteUniqToCountMatcher::visit(ASTPtr & ast, Data & /*data*/) +{ + auto * selectq = ast->as(); + if (!selectq || !selectq->tables() || selectq->tables()->children.size() != 1) + return; + auto expr_list = selectq->select(); + if (!expr_list || expr_list->children.size() != 1) + return; + auto * func = expr_list->children[0]->as(); + if (!func || !matchFnUniq(func->name)) + return; + if (selectq->tables()->as()->children[0]->as()->children.size() != 1) + return; + auto * table_expr = selectq->tables() + ->as() + ->children[0] + ->as() + ->children[0] + ->as(); + if (!table_expr || table_expr->children.size() != 1 || !table_expr->subquery) + return; + auto * subquery = table_expr->subquery->as(); + if (!subquery) + return; + auto * sub_selectq = subquery->children[0] + ->as()->children[0] + ->as()->children[0] + ->as(); + if (!sub_selectq) + return; + auto sub_expr_list = sub_selectq->select(); + if (!sub_expr_list) + return; + + /// collect subquery select expressions alias + Aliases alias; + for (const auto & expr : sub_expr_list->children) + { + if (!expr->tryGetAlias().empty()) + alias.insert({expr->tryGetAlias(), expr}); + } + + /// Whether query matches 'SELECT uniq(x ...) FROM (SELECT DISTINCT x ...)' + auto match_subquery_with_distinct = [&]() -> bool + { + if (!sub_selectq->distinct) + return false; + /// uniq expression list == subquery group by expression list + if (!expressionListEquals(func->children[0]->as(), sub_expr_list->as(), alias)) + return false; + return true; + }; + + /// Whether query matches 'SELECT uniq(x ...) FROM (SELECT x ... GROUP BY x ...)' + auto match_subquery_with_group_by = [&]() -> bool + { + auto group_by = sub_selectq->groupBy(); + if (!group_by) + return false; + /// uniq expression list == subquery group by expression list + if (!expressionListEquals(func->children[0]->as(), group_by->as(), alias)) + return false; + /// subquery select expression list must contain all columns in uniq expression list + if (!expressionListContainsAll(sub_expr_list->as(), func->children[0]->as(), alias)) + return false; + return true; + }; + + if (match_subquery_with_distinct() || match_subquery_with_group_by()) + expr_list->children[0] = makeASTFunction("count"); +} + +} diff --git a/src/Interpreters/RewriteUniqToCountVisitor.h b/src/Interpreters/RewriteUniqToCountVisitor.h new file mode 100644 index 000000000000..94528ccf2ee3 --- /dev/null +++ b/src/Interpreters/RewriteUniqToCountVisitor.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include "Interpreters/TreeRewriter.h" + +namespace DB +{ + +class ASTFunction; + +/** Optimize `uniq` into `count` over subquery. + * Example: 'SELECT uniq(x ...) FROM (SELECT DISTINCT x ...)' to + * Result: 'SELECT count() FROM (SELECT DISTINCT x ...)' + * + * Example: 'SELECT uniq(x ...) FROM (SELECT x ... GROUP BY x ...)' to + * Result: 'SELECT count() FROM (SELECT x ... GROUP BY x ...)' + * + * Note that we can rewrite all uniq variants except uniqUpTo. + */ +class RewriteUniqToCountMatcher +{ +public: + struct Data {}; + static void visit(ASTPtr & ast, Data &); + static bool needChildVisit(const ASTPtr &, const ASTPtr &) { return true; } +}; + +using RewriteUniqToCountVisitor = InDepthNodeVisitor; +} diff --git a/tests/performance/uniq_to_count.xml b/tests/performance/uniq_to_count.xml new file mode 100644 index 000000000000..64e4cf1cc0d4 --- /dev/null +++ b/tests/performance/uniq_to_count.xml @@ -0,0 +1,8 @@ + + select uniq(number) from (select DISTINCT number from numbers(1000000)) + select uniq(number) from (select number from numbers(1000000) group by number) + + + select uniq(number) from (select DISTINCT number from numbers(1000000)) SETTINGS allow_experimental_analyzer=1 + select uniq(number) from (select number from numbers(1000000) group by number) SETTINGS allow_experimental_analyzer=1 + diff --git a/tests/queries/0_stateless/02868_distinct_to_count_optimization.reference b/tests/queries/0_stateless/02868_distinct_to_count_optimization.reference new file mode 100644 index 000000000000..b2b15f921990 --- /dev/null +++ b/tests/queries/0_stateless/02868_distinct_to_count_optimization.reference @@ -0,0 +1,252 @@ +1. test simple distinct +3 +SELECT count() +FROM +( + SELECT DISTINCT a + FROM test_rewrite_uniq_to_count +) +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + a UInt8 + PROJECTION + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + SETTINGS allow_experimental_analyzer=1 +2. test distinct with subquery alias +3 +SELECT count() +FROM +( + SELECT DISTINCT a + FROM test_rewrite_uniq_to_count +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + a UInt8 + PROJECTION + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + SETTINGS allow_experimental_analyzer=1 +3. test distinct with compound column name +3 +SELECT count() +FROM +( + SELECT DISTINCT a + FROM test_rewrite_uniq_to_count +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + a UInt8 + PROJECTION + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + SETTINGS allow_experimental_analyzer=1 +4. test distinct with select expression alias +3 +SELECT count() +FROM +( + SELECT DISTINCT a AS alias_of_a + FROM test_rewrite_uniq_to_count +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(alias_of_a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + alias_of_a UInt8 + PROJECTION + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + SETTINGS allow_experimental_analyzer=1 +5. test simple group by +3 +SELECT count() +FROM +( + SELECT + a, + sum(b) + FROM test_rewrite_uniq_to_count + GROUP BY a +) +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, is_subquery: 1 + PROJECTION COLUMNS + a UInt8 + sum(b) UInt64 + PROJECTION + LIST id: 4, nodes: 2 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + FUNCTION id: 7, function_name: sum, function_type: aggregate, result_type: UInt64 + ARGUMENTS + LIST id: 8, nodes: 1 + COLUMN id: 9, column_name: b, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + GROUP BY + LIST id: 10, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + SETTINGS allow_experimental_analyzer=1 +6. test group by with subquery alias +3 +SELECT count() +FROM +( + SELECT + a, + sum(b) + FROM test_rewrite_uniq_to_count + GROUP BY a +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1 + PROJECTION COLUMNS + a UInt8 + sum(b) UInt64 + PROJECTION + LIST id: 4, nodes: 2 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + FUNCTION id: 7, function_name: sum, function_type: aggregate, result_type: UInt64 + ARGUMENTS + LIST id: 8, nodes: 1 + COLUMN id: 9, column_name: b, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + GROUP BY + LIST id: 10, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + SETTINGS allow_experimental_analyzer=1 +7. test group by with compound column name +3 +SELECT count() +FROM +( + SELECT + a AS alias_of_a, + sum(b) + FROM test_rewrite_uniq_to_count + GROUP BY a +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(alias_of_a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1 + PROJECTION COLUMNS + alias_of_a UInt8 + sum(b) UInt64 + PROJECTION + LIST id: 4, nodes: 2 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + FUNCTION id: 7, function_name: sum, function_type: aggregate, result_type: UInt64 + ARGUMENTS + LIST id: 8, nodes: 1 + COLUMN id: 9, column_name: b, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + GROUP BY + LIST id: 10, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + SETTINGS allow_experimental_analyzer=1 +8. test group by with select expression alias +3 +SELECT count() +FROM +( + SELECT + a AS alias_of_a, + sum(b) + FROM test_rewrite_uniq_to_count + GROUP BY alias_of_a +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(alias_of_a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1 + PROJECTION COLUMNS + alias_of_a UInt8 + sum(b) UInt64 + PROJECTION + LIST id: 4, nodes: 2 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + FUNCTION id: 7, function_name: sum, function_type: aggregate, result_type: UInt64 + ARGUMENTS + LIST id: 8, nodes: 1 + COLUMN id: 9, column_name: b, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + GROUP BY + LIST id: 10, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + SETTINGS allow_experimental_analyzer=1 diff --git a/tests/queries/0_stateless/02868_distinct_to_count_optimization.sql b/tests/queries/0_stateless/02868_distinct_to_count_optimization.sql new file mode 100644 index 000000000000..66431b7c36bd --- /dev/null +++ b/tests/queries/0_stateless/02868_distinct_to_count_optimization.sql @@ -0,0 +1,68 @@ +drop table if exists test_rewrite_uniq_to_count; + +CREATE TABLE test_rewrite_uniq_to_count +( + `a` UInt8, + `b` UInt8, + `c` UInt8 +) ENGINE = MergeTree ORDER BY `a`; + + +INSERT INTO test_rewrite_uniq_to_count values ('1', '1', '1'), ('1', '1', '1'); +INSERT INTO test_rewrite_uniq_to_count values ('2', '2', '2'), ('2', '2', '2'); +INSERT INTO test_rewrite_uniq_to_count values ('3', '3', '3'), ('3', '3', '3'); + +set optimize_uniq_to_count=true; + + +SELECT '1. test simple distinct'; +SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) settings allow_experimental_analyzer=0; +SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) settings allow_experimental_analyzer=1; + + +SELECT '2. test distinct with subquery alias'; +SELECT uniq(t.a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +SELECT uniq(t.a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(t.a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; + +SELECT '3. test distinct with compound column name'; +SELECT uniq(a) FROM (SELECT DISTINCT test_rewrite_uniq_to_count.a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(a) FROM (SELECT DISTINCT test_rewrite_uniq_to_count.a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +SELECT uniq(a) FROM (SELECT DISTINCT test_rewrite_uniq_to_count.a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(a) FROM (SELECT DISTINCT test_rewrite_uniq_to_count.a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; + +SELECT '4. test distinct with select expression alias'; +SELECT uniq(alias_of_a) FROM (SELECT DISTINCT a as alias_of_a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(alias_of_a) FROM (SELECT DISTINCT a as alias_of_a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +SELECT uniq(alias_of_a) FROM (SELECT DISTINCT a as alias_of_a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(alias_of_a) FROM (SELECT DISTINCT a as alias_of_a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; + + +SELECT '5. test simple group by'; +SELECT uniq(a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) settings allow_experimental_analyzer=0; +SELECT uniq(a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) settings allow_experimental_analyzer=1; + +SELECT '6. test group by with subquery alias'; +SELECT uniq(t.a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(t.a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=0; +SELECT uniq(t.a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(t.a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=1; + +SELECT '7. test group by with compound column name'; +SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=0; +SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=1; + +SELECT '8. test group by with select expression alias'; +SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY alias_of_a) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY alias_of_a) t settings allow_experimental_analyzer=0; +SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY alias_of_a) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY alias_of_a) t settings allow_experimental_analyzer=1; + +drop table if exists test_rewrite_uniq_to_count; From ebde8cd49530bd8c05cad5b676d4e37a2c49d9a5 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 21 Sep 2023 10:09:47 +0800 Subject: [PATCH 02/51] Fix NPE --- src/Interpreters/RewriteUniqToCountVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/RewriteUniqToCountVisitor.cpp b/src/Interpreters/RewriteUniqToCountVisitor.cpp index 7445068207ab..73824b618738 100644 --- a/src/Interpreters/RewriteUniqToCountVisitor.cpp +++ b/src/Interpreters/RewriteUniqToCountVisitor.cpp @@ -49,7 +49,7 @@ bool expressionEquals(const ASTPtr & lhs, const ASTPtr & rhs, const Aliases & al if (alias.find(rhs_idf->shortName()) != alias.end()) rhs_idf = alias.find(rhs_idf->shortName())->second->as(); - if (lhs_idf->shortName() == rhs_idf->shortName()) + if (lhs_idf && rhs_idf && lhs_idf->shortName() == rhs_idf->shortName()) return true; } } From 332cb47af857ef973bd3781cdc835360a706f34e Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Mon, 25 Sep 2023 10:22:56 +0800 Subject: [PATCH 03/51] compare function name in lower case --- src/Analyzer/Passes/UniqToCountPass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Analyzer/Passes/UniqToCountPass.cpp b/src/Analyzer/Passes/UniqToCountPass.cpp index 4373918a8cca..ba2c3f05522f 100644 --- a/src/Analyzer/Passes/UniqToCountPass.cpp +++ b/src/Analyzer/Passes/UniqToCountPass.cpp @@ -17,8 +17,8 @@ namespace bool matchFnUniq(String func_name) { auto name = Poco::toLower(func_name); - return name == "uniq" || name == "uniqHLL12" || name == "uniqExact" || name == "uniqTheta" || name == "uniqCombined" - || name == "uniqCombined64"; + return name == "uniq" || name == "uniqhll12" || name == "uniqexact" || name == "uniqtheta" || name == "uniqcombined" + || name == "uniqcombined64"; } /// Extract the corresponding projection columns for group by node list. From d03bf3d53a595abd6d8dc8e28a77fa35a65d00a5 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Tue, 26 Sep 2023 09:33:51 +0800 Subject: [PATCH 04/51] use lower case --- src/Interpreters/RewriteUniqToCountVisitor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/RewriteUniqToCountVisitor.cpp b/src/Interpreters/RewriteUniqToCountVisitor.cpp index 73824b618738..31d5a8d85cad 100644 --- a/src/Interpreters/RewriteUniqToCountVisitor.cpp +++ b/src/Interpreters/RewriteUniqToCountVisitor.cpp @@ -22,8 +22,8 @@ namespace bool matchFnUniq(String func_name) { auto name = Poco::toLower(func_name); - return name == "uniq" || name == "uniqHLL12" || name == "uniqExact" || name == "uniqTheta" || name == "uniqCombined" - || name == "uniqCombined64"; + return name == "uniq" || name == "uniqhll12" || name == "uniqexact" || name == "uniqtheta" || name == "uniqcombined" + || name == "uniqcombined64"; } bool expressionEquals(const ASTPtr & lhs, const ASTPtr & rhs, const Aliases & alias) From c40558a961afa04969604a999a11787c8c76ec0f Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 27 Sep 2023 09:14:24 +0800 Subject: [PATCH 05/51] Fix uniq function name --- src/Analyzer/Passes/UniqToCountPass.cpp | 7 +++---- src/Interpreters/RewriteUniqToCountVisitor.cpp | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/Analyzer/Passes/UniqToCountPass.cpp b/src/Analyzer/Passes/UniqToCountPass.cpp index ba2c3f05522f..271f12a903a9 100644 --- a/src/Analyzer/Passes/UniqToCountPass.cpp +++ b/src/Analyzer/Passes/UniqToCountPass.cpp @@ -14,11 +14,10 @@ namespace DB namespace { -bool matchFnUniq(String func_name) +bool matchFnUniq(String name) { - auto name = Poco::toLower(func_name); - return name == "uniq" || name == "uniqhll12" || name == "uniqexact" || name == "uniqtheta" || name == "uniqcombined" - || name == "uniqcombined64"; + return name == "uniq" || name == "uniqHLL12" || name == "uniqExact" || name == "uniqTheta" || name == "uniqCombined" + || name == "uniqCombined64"; } /// Extract the corresponding projection columns for group by node list. diff --git a/src/Interpreters/RewriteUniqToCountVisitor.cpp b/src/Interpreters/RewriteUniqToCountVisitor.cpp index 31d5a8d85cad..539271aa9979 100644 --- a/src/Interpreters/RewriteUniqToCountVisitor.cpp +++ b/src/Interpreters/RewriteUniqToCountVisitor.cpp @@ -19,11 +19,10 @@ using Aliases = std::unordered_map; namespace { -bool matchFnUniq(String func_name) +bool matchFnUniq(String name) { - auto name = Poco::toLower(func_name); - return name == "uniq" || name == "uniqhll12" || name == "uniqexact" || name == "uniqtheta" || name == "uniqcombined" - || name == "uniqcombined64"; + return name == "uniq" || name == "uniqHLL12" || name == "uniqExact" || name == "uniqTheta" || name == "uniqCombined" + || name == "uniqCombined64"; } bool expressionEquals(const ASTPtr & lhs, const ASTPtr & rhs, const Aliases & alias) From de71bfd18b2df90100d10bf2291dfae1571b75f1 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sun, 24 Sep 2023 03:14:30 +0800 Subject: [PATCH 06/51] Introduce -ArgMin/-ArgMax combinators. --- .../AggregateFunctionCombinatorMinMax.cpp | 92 +++++++++++++++ .../AggregateFunctionCombinatorMinMax.h | 111 ++++++++++++++++++ .../registerAggregateFunctions.cpp | 2 + 3 files changed, 205 insertions(+) create mode 100644 src/AggregateFunctions/AggregateFunctionCombinatorMinMax.cpp create mode 100644 src/AggregateFunctions/AggregateFunctionCombinatorMinMax.h diff --git a/src/AggregateFunctions/AggregateFunctionCombinatorMinMax.cpp b/src/AggregateFunctions/AggregateFunctionCombinatorMinMax.cpp new file mode 100644 index 000000000000..882008b303fd --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionCombinatorMinMax.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +namespace +{ +template