From 13da8d3704bc1212314b414eaf817b20ff2a34f0 Mon Sep 17 00:00:00 2001 From: Pavel Ivanov Date: Tue, 2 Jul 2024 19:26:09 +0000 Subject: [PATCH 1/2] [CBO] CountMin fix --- .../kqp/opt/kqp_statistics_transformer.cpp | 8 ++ ydb/library/yql/core/yql_statistics.h | 1 + .../dq/opt/dq_opt_predicate_selectivity.cpp | 129 +++++++----------- 3 files changed, 62 insertions(+), 76 deletions(-) diff --git a/ydb/core/kqp/opt/kqp_statistics_transformer.cpp b/ydb/core/kqp/opt/kqp_statistics_transformer.cpp index ef1b50a6ae67..2b0a32aeadd0 100644 --- a/ydb/core/kqp/opt/kqp_statistics_transformer.cpp +++ b/ydb/core/kqp/opt/kqp_statistics_transformer.cpp @@ -40,6 +40,7 @@ void InferStatisticsForReadTable(const TExprNode::TPtr& input, TTypeAnnotationCo const auto& tableData = kqpCtx.Tables->ExistingTable(kqpCtx.Cluster, path->Content()); int totalAttrs = tableData.Metadata->Columns.size(); nRows = tableData.Metadata->RecordsCount; + double byteSize = tableData.Metadata->DataSize * (nAttrs / (double)totalAttrs); auto keyColumns = TIntrusivePtr(new TOptimizerStatistics::TKeyColumns(tableData.Metadata->KeyColumnNames)); @@ -47,6 +48,13 @@ void InferStatisticsForReadTable(const TExprNode::TPtr& input, TTypeAnnotationCo if (kqpCtx.Config->OverrideStatistics.Get()) { stats = OverrideStatistics(*stats, path->Content(), *kqpCtx.Config->OverrideStatistics.Get()); } + + if (stats->ColumnStatistics) { + for (const auto& [columnName, metaData]: tableData.Metadata->Columns) { + stats->ColumnStatistics->Data[columnName].Type = metaData.Type; + } + } + YQL_CLOG(TRACE, CoreDq) << "Infer statistics for read table, nrows: " << stats->Nrows << ", nattrs: " << stats->Ncols; typeCtx->SetStats(input.Get(), stats); diff --git a/ydb/library/yql/core/yql_statistics.h b/ydb/library/yql/core/yql_statistics.h index c7298b81ce46..5141d30dcef5 100644 --- a/ydb/library/yql/core/yql_statistics.h +++ b/ydb/library/yql/core/yql_statistics.h @@ -27,6 +27,7 @@ struct TColumnStatistics { std::optional NumUniqueVals; std::optional HyperLogLog; std::shared_ptr CountMinSketch; + TString Type; TColumnStatistics() {} }; diff --git a/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp b/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp index 77a6d1c98b18..6cf7ebcf54d5 100644 --- a/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp @@ -54,90 +54,66 @@ namespace { } } - std::optional EstimateCountMin(NYql::NNodes::TExprBase maybeLiteral, const std::shared_ptr& countMinSketch) { + std::optional EstimateCountMin(NYql::NNodes::TExprBase maybeLiteral, TString columnType, const std::shared_ptr& countMinSketch) { if (auto maybeJust = maybeLiteral.Maybe() ) { maybeLiteral = maybeJust.Cast().Input(); } if (maybeLiteral.Maybe()) { auto literal = maybeLiteral.Maybe().Cast(); - - auto type = literal.Ref().GetTypeAnn(); - auto slot = type->Cast()->GetSlot(); auto value = literal.Literal().Value(); - switch (slot) { - case NYql::NUdf::EDataSlot::Bool: { - ui8 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Uint8: { - ui8 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Int8: { - i8 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Uint32: { - ui32 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Int32: { - i32 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Uint64: { - ui64 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Int64: { - i64 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Float: { - float v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Double: { - double v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Date: { - ui16 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Datetime: { - ui32 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Utf8: - case NYql::NUdf::EDataSlot::String: - case NYql::NUdf::EDataSlot::Yson: - case NYql::NUdf::EDataSlot::Json: { - return countMinSketch->Probe(value.Data(), value.Size()); - } - case NYql::NUdf::EDataSlot::Interval: - case NYql::NUdf::EDataSlot::Timestamp64: - case NYql::NUdf::EDataSlot::Interval64: { - i64 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Timestamp: { - ui64 v = FromString(value); - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - case NYql::NUdf::EDataSlot::Uuid: { - const ui64* uuidData = reinterpret_cast(value.Data()); + if (columnType == "Bool") { + ui8 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Uint8") { + ui8 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Int8") { + i8 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Uint32") { + ui32 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Int32") { + i32 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Uint64") { + ui64 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Int64") { + i64 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Float") { + float v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Double") { + double v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Date") { + ui16 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Datetime") { + ui32 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Utf8" || columnType == "String" || columnType == "Yson" || columnType == "Json") { + return countMinSketch->Probe(value.Data(), value.Size()); + } else if (columnType == "Interval" || columnType == "Timestamp64" || columnType == "Interval64") { + i64 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Timestamp") { + ui64 v = FromString(value); + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else if (columnType == "Uuid") { + const ui64* uuidData = reinterpret_cast(value.Data()); + std::pair v{}; + v.first = uuidData[0]; // low128 + v.second = uuidData[1]; // high128 + return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); + } else { + return std::nullopt; + } - std::pair v{}; - v.first = uuidData[0]; // low128 - v.second = uuidData[1]; // high128 - return countMinSketch->Probe(reinterpret_cast(&v), sizeof(v)); - } - default: - return std::nullopt; - } } return std::nullopt; @@ -166,7 +142,8 @@ namespace { } if (auto countMinSketch = stats->ColumnStatistics->Data[attributeName].CountMinSketch; countMinSketch != nullptr) { - std::optional countMinEstimation = EstimateCountMin(right, countMinSketch); + auto columnType = stats->ColumnStatistics->Data[attributeName].Type; + std::optional countMinEstimation = EstimateCountMin(right, columnType, countMinSketch); if (!countMinEstimation.has_value()) { return DefaultSelectivity(stats, attributeName); } From 02ec4e16227d33b324121ff6fdb34c90d1676c33 Mon Sep 17 00:00:00 2001 From: Pavel Ivanov Date: Mon, 8 Jul 2024 15:49:19 +0000 Subject: [PATCH 2/2] [CBO] Warning FIX --- ydb/core/kqp/opt/kqp_opt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ydb/core/kqp/opt/kqp_opt.h b/ydb/core/kqp/opt/kqp_opt.h index b74b4b31fbff..7e8181a85a8f 100644 --- a/ydb/core/kqp/opt/kqp_opt.h +++ b/ydb/core/kqp/opt/kqp_opt.h @@ -24,8 +24,8 @@ struct TKqpOptimizeContext : public TSimpleRefCount { const NYql::TKikimrConfiguration::TPtr Config; const TIntrusivePtr QueryCtx; const TIntrusivePtr Tables; - int JoinsCount; - int EquiJoinsCount; + int JoinsCount{}; + int EquiJoinsCount{}; bool IsDataQuery() const { return QueryCtx->Type == NYql::EKikimrQueryType::Dml;