From 1fea718a57b684dc8740bfd205dacd6ae0ad48fd Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Thu, 21 Aug 2025 17:05:23 +0800 Subject: [PATCH 01/16] enhance max_sparse_column_statistics_size for variant --- be/src/common/config.cpp | 2 -- be/src/common/config.h | 3 -- be/src/olap/rowset/segment_v2/segment.cpp | 3 +- .../variant/variant_column_reader.cpp | 13 ++++---- .../variant/variant_column_reader.h | 2 +- .../variant/variant_column_writer_impl.cpp | 2 +- .../segment_v2/variant_stats_calculator.cpp | 12 +++++-- .../segment_v2/variant_stats_calculator.h | 4 ++- be/src/olap/tablet_meta.cpp | 4 +++ be/src/olap/tablet_schema.cpp | 6 ++++ be/src/olap/tablet_schema.h | 11 +++++++ be/src/vec/common/schema_util.cpp | 15 +++++---- be/src/vec/common/schema_util.h | 3 +- .../variant_column_writer_reader_test.cpp | 13 +++++--- be/test/vec/common/schema_util_test.cpp | 8 +++-- .../org/apache/doris/catalog/ScalarType.java | 9 ++++++ .../org/apache/doris/catalog/VariantType.java | 13 +++++++- .../java/org/apache/doris/catalog/Column.java | 9 ++++++ .../doris/common/util/PropertyAnalyzer.java | 29 +++++++++++++++++ .../nereids/parser/LogicalPlanBuilder.java | 10 ++++-- .../functions/scalar/GetVariantType.java | 2 +- .../apache/doris/nereids/types/DataType.java | 3 +- .../doris/nereids/types/VariantType.java | 32 +++++++++++++++---- .../org/apache/doris/qe/SessionVariable.java | 14 ++++++++ .../functions/ComputeSignatureHelperTest.java | 26 +++++++++------ .../apache/doris/persist/ScalarTypeTest.java | 1 + gensrc/proto/olap_file.proto | 2 ++ gensrc/thrift/Descriptors.thrift | 1 + ...ariant_compaction_with_sparse_limit.groovy | 10 +++--- ...ariant_compaction_with_sparse_limit.groovy | 6 ++-- 30 files changed, 206 insertions(+), 62 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index c7eb4136c52a7e..940d72779df270 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1350,8 +1350,6 @@ DEFINE_Bool(enable_snapshot_action, "false"); DEFINE_mInt32(variant_max_merged_tablet_schema_size, "2048"); -DEFINE_mInt32(variant_max_sparse_column_statistics_size, "10000"); - DEFINE_mBool(enable_column_type_check, "true"); // 128 MB DEFINE_mInt64(local_exchange_buffer_mem_limit, "134217728"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 5f5900165b02ea..44ab31fad5ae1a 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1412,9 +1412,6 @@ DECLARE_Bool(enable_snapshot_action); // The max columns size for a tablet schema DECLARE_mInt32(variant_max_merged_tablet_schema_size); -// The max sparse column statistics size for a variant column -DECLARE_mInt32(variant_max_sparse_column_statistics_size); - DECLARE_mInt64(local_exchange_buffer_mem_limit); DECLARE_mInt64(enable_debug_log_timeout_secs); diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index bff825780129db..b7d6a9493829f1 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -629,7 +629,8 @@ vectorized::DataTypePtr Segment::get_data_type_of(const TabletColumn& column, // 2. OR It's a leaf in the physical column structure AND it doesn't *also* exist // in the sparse column (meaning it's purely a materialized leaf). if (read_flat_leaves || (is_physical_leaf && !exist_in_sparse && - !variant_reader->is_exceeded_sparse_column_limit())) { + !variant_reader->is_exceeded_sparse_column_limit( + column.variant_max_sparse_column_statistics_size()))) { return node->data.file_column_type; } return column.is_nullable() diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp index 0744b5b542845b..5eec534a702c76 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp @@ -74,10 +74,10 @@ bool VariantColumnReader::exist_in_sparse_column( return existed_in_sparse_column || prefix_existed_in_sparse_column; } -bool VariantColumnReader::is_exceeded_sparse_column_limit() const { +bool VariantColumnReader::is_exceeded_sparse_column_limit( + size_t max_sparse_column_statistics_size) const { return !_statistics->sparse_column_non_null_size.empty() && - _statistics->sparse_column_non_null_size.size() >= - config::variant_max_sparse_column_statistics_size; + _statistics->sparse_column_non_null_size.size() >= max_sparse_column_statistics_size; } int64_t VariantColumnReader::get_metadata_size() const { @@ -276,9 +276,10 @@ Status VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator, // Otherwise the prefix is not exist and the sparse column size is reached limit // which means the path maybe exist in sparse_column - bool exceeded_sparse_column_limit = !_statistics->sparse_column_non_null_size.empty() && - _statistics->sparse_column_non_null_size.size() >= - config::variant_max_sparse_column_statistics_size; + bool exceeded_sparse_column_limit = + !_statistics->sparse_column_non_null_size.empty() && + _statistics->sparse_column_non_null_size.size() >= + target_col->variant_max_sparse_column_statistics_size(); // If the variant column has extracted columns and is a compaction reader, then read flat leaves // Otherwise read hierarchical data, since the variant subcolumns are flattened in schema_util::VariantCompactionUtil::get_extended_compaction_schema diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h index ae3d97aeb62afd..ee7abba7c4b707 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h @@ -72,7 +72,7 @@ class VariantColumnReader : public ColumnReader { bool exist_in_sparse_column(const vectorized::PathInData& path) const; - bool is_exceeded_sparse_column_limit() const; + bool is_exceeded_sparse_column_limit(size_t max_sparse_column_statistics_size) const; const SubcolumnColumnReaders* get_subcolumn_readers() const { return _subcolumn_readers.get(); } diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp index 088d8ec0f5a161..354224f1d9a4ab 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp @@ -350,7 +350,7 @@ Status VariantColumnWriterImpl::_process_sparse_column( it != sparse_data_paths_statistics.end()) { ++it->second; } else if (sparse_data_paths_statistics.size() < - config::variant_max_sparse_column_statistics_size) { + _tablet_column->variant_max_sparse_column_statistics_size()) { sparse_data_paths_statistics.emplace(path, 1); } } diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp index 6964fef452ef8b..7567d0ff588d1c 100644 --- a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp +++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp @@ -17,6 +17,8 @@ #include "olap/rowset/segment_v2/variant_stats_calculator.h" +#include + #include "common/logging.h" #include "util/simd/bits.h" #include "vec/columns/column_nullable.h" @@ -63,7 +65,10 @@ Status VariantStatsCaculator::calculate_variant_stats(const vectorized::Block* b // Check if this is a sparse column or sub column if (column_path.ends_with("__DORIS_VARIANT_SPARSE__")) { // This is a sparse column from variant column - _calculate_sparse_column_stats(*column, column_meta, row_pos, num_rows); + _calculate_sparse_column_stats( + *column, column_meta, + tablet_column.variant_max_sparse_column_statistics_size(), row_pos, + num_rows); } else { // This is a sub column from variant column _calculate_sub_column_stats(*column, column_meta, row_pos, num_rows); @@ -75,13 +80,14 @@ Status VariantStatsCaculator::calculate_variant_stats(const vectorized::Block* b void VariantStatsCaculator::_calculate_sparse_column_stats(const vectorized::IColumn& column, ColumnMetaPB* column_meta, + size_t max_sparse_column_statistics_size, size_t row_pos, size_t num_rows) { // Get or create variant statistics VariantStatisticsPB* stats = column_meta->mutable_variant_statistics(); // Use the same logic as the original calculate_variant_stats function - vectorized::schema_util::VariantCompactionUtil::calculate_variant_stats(column, stats, row_pos, - num_rows); + vectorized::schema_util::VariantCompactionUtil::calculate_variant_stats( + column, stats, max_sparse_column_statistics_size, row_pos, num_rows); VLOG_DEBUG << "Sparse column stats updated, non-null size count: " << stats->sparse_column_non_null_size_size(); diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.h b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h index 6ffd74036cb5ff..221c45b781dce8 100644 --- a/be/src/olap/rowset/segment_v2/variant_stats_calculator.h +++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h @@ -45,7 +45,9 @@ class VariantStatsCaculator { // Helper method to calculate sparse column statistics void _calculate_sparse_column_stats(const vectorized::IColumn& column, - ColumnMetaPB* column_meta, size_t row_pos, size_t num_rows); + ColumnMetaPB* column_meta, + size_t max_sparse_column_statistics_size, size_t row_pos, + size_t num_rows); // Helper method to calculate sub column statistics void _calculate_sub_column_stats(const vectorized::IColumn& column, ColumnMetaPB* column_meta, diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index ced9ad54e7b57c..e63b20ba46b85f 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -478,6 +478,10 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco column->set_variant_enable_typed_paths_to_sparse( tcolumn.variant_enable_typed_paths_to_sparse); } + if (tcolumn.__isset.variant_max_sparse_column_statistics_size) { + column->set_variant_max_sparse_column_statistics_size( + tcolumn.variant_max_sparse_column_statistics_size); + } } void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version) { diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index 13d6b8776a0b69..760d03c57c7bd2 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -681,6 +681,10 @@ void TabletColumn::init_from_pb(const ColumnPB& column) { if (column.has_variant_enable_typed_paths_to_sparse()) { _variant_enable_typed_paths_to_sparse = column.variant_enable_typed_paths_to_sparse(); } + if (column.has_variant_max_sparse_column_statistics_size()) { + _variant_max_sparse_column_statistics_size = + column.variant_max_sparse_column_statistics_size(); + } if (column.has_pattern_type()) { _pattern_type = column.pattern_type(); } @@ -764,6 +768,8 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const { column->set_variant_max_subcolumns_count(_variant_max_subcolumns_count); column->set_pattern_type(_pattern_type); column->set_variant_enable_typed_paths_to_sparse(_variant_enable_typed_paths_to_sparse); + column->set_variant_max_sparse_column_statistics_size( + _variant_max_sparse_column_statistics_size); } void TabletColumn::add_sub_column(TabletColumn& sub_column) { diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 82ec82ea99901b..3e1c1e3f3eb2b3 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -221,6 +221,11 @@ class TabletColumn : public MetadataAdder { _variant_enable_typed_paths_to_sparse = enable; } + void set_variant_max_sparse_column_statistics_size( + int32_t variant_max_sparse_column_statistics_size) { + _variant_max_sparse_column_statistics_size = variant_max_sparse_column_statistics_size; + } + int32_t variant_max_subcolumns_count() const { return _variant_max_subcolumns_count; } PatternTypePB pattern_type() const { return _pattern_type; } @@ -229,6 +234,10 @@ class TabletColumn : public MetadataAdder { return _variant_enable_typed_paths_to_sparse; } + int32_t variant_max_sparse_column_statistics_size() const { + return _variant_max_sparse_column_statistics_size; + } + bool is_decimal() const { return _is_decimal; } private: @@ -275,6 +284,8 @@ class TabletColumn : public MetadataAdder { int32_t _variant_max_subcolumns_count = 0; PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB; bool _variant_enable_typed_paths_to_sparse = false; + // set variant_max_sparse_column_statistics_size + int32_t _variant_max_sparse_column_statistics_size = 0; }; bool operator==(const TabletColumn& a, const TabletColumn& b); diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 2cb18863395d90..f8c7fdb1f13727 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -806,7 +806,8 @@ Status VariantCompactionUtil::check_path_stats(const std::vector config::variant_max_sparse_column_statistics_size) { + if (stats.size() > + tablet->tablet_schema()->column(uid).variant_max_sparse_column_statistics_size()) { // When there is only one segment, we can ensure that the size of each path in output stats is accurate if (output->num_segments() == 1) { for (const auto& [path, size] : stats) { @@ -930,7 +931,8 @@ void VariantCompactionUtil::get_compaction_subcolumns( VLOG_DEBUG << "append typed column " << subpath; } else if (find_data_types == path_to_data_types.end() || find_data_types->second.empty() || sparse_paths.find(std::string(subpath)) != sparse_paths.end() || - sparse_paths.size() >= config::variant_max_sparse_column_statistics_size) { + sparse_paths.size() >= + parent_column->variant_max_sparse_column_statistics_size()) { TabletColumn subcolumn; subcolumn.set_name(column_name); subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); @@ -1028,6 +1030,7 @@ Status VariantCompactionUtil::get_extended_compaction_schema( // Calculate statistics about variant data paths from the encoded sparse column void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_sparse_column, segment_v2::VariantStatisticsPB* stats, + size_t max_sparse_column_statistics_size, size_t row_pos, size_t num_rows) { // Cast input column to ColumnMap type since sparse column is stored as a map const auto& map_column = assert_cast(encoded_sparse_column); @@ -1052,19 +1055,17 @@ void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_spars } // If path doesn't exist and we haven't hit the max statistics size limit, // add it with count 1 - else if (count_map.size() < config::variant_max_sparse_column_statistics_size) { + else if (count_map.size() < max_sparse_column_statistics_size) { count_map.emplace(sparse_path, 1); } } } - if (stats->sparse_column_non_null_size().size() > - config::variant_max_sparse_column_statistics_size) { + if (stats->sparse_column_non_null_size().size() > max_sparse_column_statistics_size) { throw doris::Exception( ErrorCode::INTERNAL_ERROR, "Sparse column non null size: {} is greater than max statistics size: {}", - stats->sparse_column_non_null_size().size(), - config::variant_max_sparse_column_statistics_size); + stats->sparse_column_non_null_size().size(), max_sparse_column_statistics_size); } } diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h index e6d9d25b5b69b9..762787777ebfea 100644 --- a/be/src/vec/common/schema_util.h +++ b/be/src/vec/common/schema_util.h @@ -189,7 +189,8 @@ class VariantCompactionUtil { // Calculate statistics about variant data paths from the encoded sparse column static void calculate_variant_stats(const IColumn& encoded_sparse_column, - segment_v2::VariantStatisticsPB* stats, size_t row_pos, + segment_v2::VariantStatisticsPB* stats, + size_t max_sparse_column_statistics_size, size_t row_pos, size_t num_rows); static void get_compaction_subcolumns(TabletSchema::PathsSetInfo& paths_set_info, diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp index f041bee6f77f67..d4f3bb84dd73c3 100644 --- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp @@ -450,6 +450,8 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { subcolumn.set_parent_unique_id(parent_column.unique_id()); subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); subcolumn.set_is_nullable(true); + // default 10000 + size_t max_sparse_column_statistics_size = 10000; ColumnIteratorUPtr it1; st = variant_column_reader->new_iterator(&it1, &subcolumn, &storage_read_opts); EXPECT_TRUE(st.ok()) << st.msg(); @@ -458,16 +460,17 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { // 13. check statistics size == limit auto& variant_stats = variant_column_reader->_statistics; EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() < - config::variant_max_sparse_column_statistics_size); - auto limit = config::variant_max_sparse_column_statistics_size - - variant_stats->sparse_column_non_null_size.size(); + max_sparse_column_statistics_size); + auto limit = + max_sparse_column_statistics_size - variant_stats->sparse_column_non_null_size.size(); for (int i = 0; i < limit; ++i) { std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); variant_stats->sparse_column_non_null_size[key] = 10000; } EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() == - config::variant_max_sparse_column_statistics_size); - EXPECT_TRUE(variant_column_reader->is_exceeded_sparse_column_limit()); + max_sparse_column_statistics_size); + EXPECT_TRUE(variant_column_reader->is_exceeded_sparse_column_limit( + max_sparse_column_statistics_size)); ColumnIteratorUPtr it2; st = variant_column_reader->new_iterator(&it2, &subcolumn, &storage_read_opts); diff --git a/be/test/vec/common/schema_util_test.cpp b/be/test/vec/common/schema_util_test.cpp index 3988ed1bb9a624..fe15adb1190cc8 100644 --- a/be/test/vec/common/schema_util_test.cpp +++ b/be/test/vec/common/schema_util_test.cpp @@ -1514,7 +1514,9 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns) { output_schema = std::make_shared(); sparse_paths.clear(); - for (int i = 0; i < config::variant_max_sparse_column_statistics_size + 1; ++i) { + // default 10000 + size_t max_sparse_column_statistics_size = 10000; + for (int i = 0; i < max_sparse_column_statistics_size + 1; ++i) { sparse_paths.insert("dummy" + std::to_string(i)); } schema_util::VariantCompactionUtil::get_compaction_subcolumns( @@ -1606,7 +1608,9 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns_advanced) { output_schema = std::make_shared(); sparse_paths.clear(); - for (int i = 0; i < config::variant_max_sparse_column_statistics_size + 1; ++i) { + // default 10000 + size_t max_sparse_column_statistics_size = 10000; + for (int i = 0; i < max_sparse_column_statistics_size + 1; ++i) { sparse_paths.insert("dummy" + std::to_string(i)); } schema_util::VariantCompactionUtil::get_compaction_subcolumns( diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java index 1ca0c121814234..ee92c57c1d0f63 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java @@ -1226,4 +1226,13 @@ public boolean getVariantEnableTypedPathsToSparse() { } return false; // The old variant type had a default value of false. } + + public int getVariantMaxSparseColumnStatisticsSize() { + // In the past, variant metadata used the ScalarType type. + // Now, we use VariantType, which inherits from ScalarType, as the new metadata storage. + if (this instanceof VariantType) { + return ((VariantType) this).getVariantMaxSparseColumnStatisticsSize(); + } + return 0; // The old variant type had a default value of 0. + } } diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java index 29342d73ca7c25..827b5403a5b112 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java @@ -46,6 +46,9 @@ public class VariantType extends ScalarType { @SerializedName(value = "enableTypedPathsToSparse") private boolean enableTypedPathsToSparse = false; + @SerializedName(value = "variantMaxSparseColumnStatisticsSize") + private int variantMaxSparseColumnStatisticsSize = 0; + private Map properties = Maps.newHashMap(); public VariantType() { @@ -53,6 +56,7 @@ public VariantType() { this.predefinedFields = Lists.newArrayList(); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; + this.variantMaxSparseColumnStatisticsSize = 0; } public VariantType(ArrayList fields) { @@ -81,7 +85,8 @@ public VariantType(ArrayList fields, Map propertie } public VariantType(ArrayList fields, int variantMaxSubcolumnsCount, - boolean enableTypedPathsToSparse) { + boolean enableTypedPathsToSparse, + int variantMaxSparseColumnStatisticsSize) { super(PrimitiveType.VARIANT); Preconditions.checkNotNull(fields); this.predefinedFields = fields; @@ -90,6 +95,7 @@ public VariantType(ArrayList fields, int variantMaxSubcolumnsCount } this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.enableTypedPathsToSparse = enableTypedPathsToSparse; + this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; } @Override @@ -123,6 +129,11 @@ public String toSql(int depth) { sb.append("\"variant_enable_typed_paths_to_sparse\" = \"") .append(String.valueOf(enableTypedPathsToSparse)).append("\""); } + if (variantMaxSparseColumnStatisticsSize != 0) { + sb.append(","); + sb.append("\"variant_max_sparse_column_statistics_size\" = \"") + .append(String.valueOf(variantMaxSparseColumnStatisticsSize)).append("\""); + } sb.append(")>"); return sb.toString(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java index 2b324d2ecd5534..bd4dfb43546f0d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java @@ -656,6 +656,7 @@ public TColumn toThrift() { } tColumn.setClusterKeyId(this.clusterKeyId); tColumn.setVariantEnableTypedPathsToSparse(this.getVariantEnableTypedPathsToSparse()); + tColumn.setVariantMaxSparseColumnStatisticsSize(this.getVariantMaxSparseColumnStatisticsSize()); // ATTN: // Currently, this `toThrift()` method is only used from CreateReplicaTask. // And CreateReplicaTask does not need `defineExpr` field. @@ -883,6 +884,7 @@ public OlapFile.ColumnPB toPb(Set bfColumns, List indexes) throws } else if (this.type.isVariantType()) { builder.setVariantMaxSubcolumnsCount(this.getVariantMaxSubcolumnsCount()); builder.setVariantEnableTypedPathsToSparse(this.getVariantEnableTypedPathsToSparse()); + builder.setVariantMaxSparseColumnStatisticsSize(this.getVariantMaxSparseColumnStatisticsSize()); // variant may contain predefined structured fields addChildren(builder); } @@ -959,6 +961,9 @@ public void checkSchemaChangeAllowed(Column other) throws DdlException { if (this.getVariantEnableTypedPathsToSparse() != other.getVariantEnableTypedPathsToSparse()) { throw new DdlException("Can not change variant enable typed paths to sparse"); } + if (this.getVariantMaxSparseColumnStatisticsSize() != other.getVariantMaxSparseColumnStatisticsSize()) { + throw new DdlException("Can not change variant max sparse column statistics size"); + } if (!this.getChildren().isEmpty() || !other.getChildren().isEmpty()) { throw new DdlException("Can not change variant schema templates"); } @@ -1294,6 +1299,10 @@ public boolean getVariantEnableTypedPathsToSparse() { return type.isVariantType() ? ((ScalarType) type).getVariantEnableTypedPathsToSparse() : false; } + public int getVariantMaxSparseColumnStatisticsSize() { + return type.isVariantType() ? ((ScalarType) type).getVariantMaxSparseColumnStatisticsSize() : -1; + } + public void setFieldPatternType(TPatternType type) { fieldPatternType = type; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java index 5ee0598e8e75f0..a274d8b77d3969 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java @@ -251,6 +251,9 @@ public class PropertyAnalyzer { public static final String PROPERTIES_VARIANT_ENABLE_TYPED_PATHS_TO_SPARSE = "variant_enable_typed_paths_to_sparse"; + public static final String PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE = + "variant_max_sparse_column_statistics_size"; + public enum RewriteType { PUT, // always put property REPLACE, // replace if exists property @@ -371,6 +374,12 @@ public static DataProperty analyzeDataProperty(Map properties, f newStoragePolicy = value; } else if (key.equalsIgnoreCase(PROPERTIES_IS_BEING_SYNCED)) { isBeingSynced = Boolean.parseBoolean(value); + } else if (key.equalsIgnoreCase(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE)) { + int variantMaxSparseColumnStatisticsSize = Integer.parseInt(value); + if (variantMaxSparseColumnStatisticsSize < 0) { + throw new AnalysisException("variant_max_sparse_column_statistics_size should be >= 0"); + } + properties.put(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE, value); } } // end for properties @@ -1868,4 +1877,24 @@ public static boolean analyzeEnableTypedPathsToSparse(Map proper } return enableTypedPathsToSparse; } + + public static int analyzeVariantMaxSparseColumnStatisticsSize(Map properties, int defuatValue) + throws AnalysisException { + int maxSparseColumnStatisticsSize = defuatValue; + if (properties != null && properties.containsKey(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE)) { + String maxSparseColumnStatisticsSizeStr = + properties.get(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE); + try { + maxSparseColumnStatisticsSize = Integer.parseInt(maxSparseColumnStatisticsSizeStr); + if (maxSparseColumnStatisticsSize < 0 || maxSparseColumnStatisticsSize > 10000) { + throw new AnalysisException("variant_max_sparse_column_statistics_size must between 0 and 10000 "); + } + } catch (Exception e) { + throw new AnalysisException("variant_max_sparse_column_statistics_size format error"); + } + + properties.remove(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE); + } + return maxSparseColumnStatisticsSize; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index e5eecb1ea4cf3e..af751c39f2da25 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -4670,12 +4670,16 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) ConnectContext.get().getSessionVariable().getDefaultVariantMaxSubcolumnsCount(); boolean enableTypedPathsToSparse = ConnectContext.get() == null ? false : ConnectContext.get().getSessionVariable().getDefaultEnableTypedPathsToSparse(); + int variantMaxSparseColumnStatisticsSize = ConnectContext.get() == null ? 0 : + ConnectContext.get().getSessionVariable().getDefaultVariantMaxSparseColumnStatisticsSize(); try { variantMaxSubcolumnsCount = PropertyAnalyzer .analyzeVariantMaxSubcolumnsCount(properties, variantMaxSubcolumnsCount); enableTypedPathsToSparse = PropertyAnalyzer .analyzeEnableTypedPathsToSparse(properties, enableTypedPathsToSparse); + variantMaxSparseColumnStatisticsSize = PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize( + properties, variantMaxSparseColumnStatisticsSize); } catch (org.apache.doris.common.AnalysisException e) { throw new NotSupportedException(e.getMessage()); } @@ -4683,10 +4687,12 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) if (!properties.isEmpty()) { throw new NotSupportedException("only support for " + PropertyAnalyzer.PROPERTIES_VARIANT_ENABLE_TYPED_PATHS_TO_SPARSE - + " and " + PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SUBCOLUMNS_COUNT); + + " and " + PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SUBCOLUMNS_COUNT + + " and " + PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE); } - return new VariantType(fields, variantMaxSubcolumnsCount, enableTypedPathsToSparse); + return new VariantType(fields, variantMaxSubcolumnsCount, enableTypedPathsToSparse, + variantMaxSparseColumnStatisticsSize); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/GetVariantType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/GetVariantType.java index b3e4c4e6f41ea3..381ee47dfa8db8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/GetVariantType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/GetVariantType.java @@ -38,7 +38,7 @@ public class GetVariantType extends ScalarFunction implements BinaryExpression, ExplicitlyCastableSignature, AlwaysNullable { public static final List SIGNATURES = ImmutableList.of( - FunctionSignature.ret(StringType.INSTANCE).args(new VariantType(0)) + FunctionSignature.ret(StringType.INSTANCE).args(new VariantType(0, 0)) ); /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java index 8844278947398f..c15153cb28efbc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java @@ -451,7 +451,8 @@ public static DataType fromCatalogType(Type type) { .collect(ImmutableList.toImmutableList()); return new VariantType(variantFields, ((org.apache.doris.catalog.VariantType) type).getVariantMaxSubcolumnsCount(), - ((org.apache.doris.catalog.VariantType) type).getEnableTypedPathsToSparse()); + ((org.apache.doris.catalog.VariantType) type).getEnableTypedPathsToSparse(), + ((org.apache.doris.catalog.VariantType) type).getVariantMaxSparseColumnStatisticsSize()); } return VariantType.INSTANCE; } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java index aa17c3b4292506..974becb9ea3622 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java @@ -38,7 +38,7 @@ */ public class VariantType extends PrimitiveType { - public static final VariantType INSTANCE = new VariantType(0); + public static final VariantType INSTANCE = new VariantType(0, 0); public static final int WIDTH = 24; @@ -46,13 +46,16 @@ public class VariantType extends PrimitiveType { private final boolean enableTypedPathsToSparse; + private final int variantMaxSparseColumnStatisticsSize; + private final List predefinedFields; // No predefined fields - public VariantType(int variantMaxSubcolumnsCount) { + public VariantType(int variantMaxSubcolumnsCount, int variantMaxSparseColumnStatisticsSize) { this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.predefinedFields = Lists.newArrayList(); this.enableTypedPathsToSparse = false; + this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; } /** @@ -62,25 +65,30 @@ public VariantType(List fields) { this.predefinedFields = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; + this.variantMaxSparseColumnStatisticsSize = 0; } - public VariantType(List fields, int variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse) { + public VariantType(List fields, int variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse, + int variantMaxSparseColumnStatisticsSize) { this.predefinedFields = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.enableTypedPathsToSparse = enableTypedPathsToSparse; + this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; } @Override public DataType conversion() { return new VariantType(predefinedFields.stream().map(VariantField::conversion) - .collect(Collectors.toList()), variantMaxSubcolumnsCount, enableTypedPathsToSparse); + .collect(Collectors.toList()), variantMaxSubcolumnsCount, enableTypedPathsToSparse, + variantMaxSparseColumnStatisticsSize); } @Override public Type toCatalogDataType() { org.apache.doris.catalog.VariantType type = new org.apache.doris.catalog.VariantType(predefinedFields.stream() .map(VariantField::toCatalogDataType) - .collect(Collectors.toCollection(ArrayList::new)), variantMaxSubcolumnsCount, enableTypedPathsToSparse); + .collect(Collectors.toCollection(ArrayList::new)), variantMaxSubcolumnsCount, enableTypedPathsToSparse, + variantMaxSparseColumnStatisticsSize); return type; } @@ -119,6 +127,12 @@ public String toSql() { sb.append("\"variant_enable_typed_paths_to_sparse\" = \"") .append(String.valueOf(enableTypedPathsToSparse)).append("\""); } + if (variantMaxSparseColumnStatisticsSize != 0) { + sb.append(","); + sb.append("\"variant_max_sparse_column_statistics_size\" = \"") + .append(String.valueOf(variantMaxSparseColumnStatisticsSize)) + .append("\""); + } sb.append(")>"); return sb.toString(); } @@ -134,12 +148,14 @@ public boolean equals(Object o) { VariantType other = (VariantType) o; return this.variantMaxSubcolumnsCount == other.variantMaxSubcolumnsCount && this.enableTypedPathsToSparse == other.enableTypedPathsToSparse + && this.variantMaxSparseColumnStatisticsSize == other.variantMaxSparseColumnStatisticsSize && Objects.equals(predefinedFields, other.predefinedFields); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), variantMaxSubcolumnsCount, enableTypedPathsToSparse, predefinedFields); + return Objects.hash(super.hashCode(), variantMaxSubcolumnsCount, enableTypedPathsToSparse, + variantMaxSparseColumnStatisticsSize, predefinedFields); } @Override @@ -159,4 +175,8 @@ public List getPredefinedFields() { public int getVariantMaxSubcolumnsCount() { return variantMaxSubcolumnsCount; } + + public int getVariantMaxSparseColumnStatisticsSize() { + return variantMaxSparseColumnStatisticsSize; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 9e402ef1d5ed54..607a6be0bb2c25 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -792,6 +792,9 @@ public static double getHotValueThreshold() { public static final String DEFAULT_VARIANT_ENABLE_TYPED_PATHS_TO_SPARSE = "default_variant_enable_typed_paths_to_sparse"; + public static final String DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE = + "default_variant_max_sparse_column_statistics_size"; + /** * If set false, user couldn't submit analyze SQL and FE won't allocate any related resources. */ @@ -2805,6 +2808,13 @@ public boolean isEnableESParallelScroll() { ) public boolean defaultEnableTypedPathsToSparse = false; + @VariableMgr.VarAttr( + name = DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE, + needForward = true, + fuzzy = true + ) + public int defaultVariantMaxSparseColumnStatisticsSize = 10000; + // If this fe is in fuzzy mode, then will use initFuzzyModeVariables to generate some variables, // not the default value set in the code. @SuppressWarnings("checkstyle:Indentation") @@ -5153,6 +5163,10 @@ public int getDefaultVariantMaxSubcolumnsCount() { return defaultVariantMaxSubcolumnsCount; } + public int getDefaultVariantMaxSparseColumnStatisticsSize() { + return defaultVariantMaxSparseColumnStatisticsSize; + } + public static boolean isFeDebug() { if (ConnectContext.get() != null) { return ConnectContext.get().getSessionVariable().feDebug; diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelperTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelperTest.java index 17fcd1a21a4439..01f6bfbdcbc962 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelperTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelperTest.java @@ -532,7 +532,7 @@ void testNoDynamicComputeVariantArgs() { @Test void testDynamicComputeVariantArgsSingleVariant() { - VariantType variantType = new VariantType(100); + VariantType variantType = new VariantType(100, 100); FunctionSignature signature = FunctionSignature.ret(VariantType.INSTANCE) .args(VariantType.INSTANCE, IntegerType.INSTANCE); @@ -544,17 +544,19 @@ void testDynamicComputeVariantArgsSingleVariant() { Assertions.assertTrue(signature.returnType instanceof VariantType); Assertions.assertEquals(100, ((VariantType) signature.returnType).getVariantMaxSubcolumnsCount()); + Assertions.assertEquals(100, ((VariantType) signature.returnType).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); + Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(1) instanceof IntegerType); } @Test void testDynamicComputeVariantArgsMultipleVariants() { - VariantType variantType1 = new VariantType(150); - VariantType variantType2 = new VariantType(250); + VariantType variantType1 = new VariantType(150, 100); + VariantType variantType2 = new VariantType(250, 100); FunctionSignature signature = FunctionSignature.ret(IntegerType.INSTANCE) .args(VariantType.INSTANCE, VariantType.INSTANCE); @@ -566,14 +568,16 @@ void testDynamicComputeVariantArgsMultipleVariants() { Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(150, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); + Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(1) instanceof VariantType); Assertions.assertEquals(250, ((VariantType) signature.getArgType(1)).getVariantMaxSubcolumnsCount()); + Assertions.assertEquals(100, ((VariantType) signature.getArgType(1)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.returnType instanceof IntegerType); } @Test void testDynamicComputeVariantArgsMixedTypesWithSingleVariant() { - VariantType variantType = new VariantType(75); + VariantType variantType = new VariantType(75, 100); FunctionSignature signature = FunctionSignature.ret(BooleanType.INSTANCE) .args(VariantType.INSTANCE, IntegerType.INSTANCE, DoubleType.INSTANCE); @@ -586,7 +590,7 @@ void testDynamicComputeVariantArgsMixedTypesWithSingleVariant() { Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(75, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); - + Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(1) instanceof IntegerType); Assertions.assertTrue(signature.getArgType(2) instanceof DoubleType); @@ -606,12 +610,13 @@ void testDynamicComputeVariantArgsWithNullLiteral() { Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(0, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); + Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(1) instanceof IntegerType); } @Test void testDynamicComputeVariantArgsNoVariantReturnType() { - VariantType variantType = new VariantType(300); + VariantType variantType = new VariantType(300, 100); FunctionSignature signature = FunctionSignature.ret(IntegerType.INSTANCE) .args(VariantType.INSTANCE); @@ -624,12 +629,13 @@ void testDynamicComputeVariantArgsNoVariantReturnType() { Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(300, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); + Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); } @Test void testDynamicComputeVariantArgsWithVarArgsThrowsException() { - VariantType variantType1 = new VariantType(150); - VariantType variantType2 = new VariantType(250); + VariantType variantType1 = new VariantType(150, 100); + VariantType variantType2 = new VariantType(250, 100); FunctionSignature signature = FunctionSignature.ret(VariantType.INSTANCE) .args(VariantType.INSTANCE, VariantType.INSTANCE); @@ -646,7 +652,7 @@ void testDynamicComputeVariantArgsWithVarArgsThrowsException() { @Test void testDynamicComputeVariantArgsWithComputeSignature() { - VariantType variantType = new VariantType(200); + VariantType variantType = new VariantType(200, 100); FunctionSignature signature = FunctionSignature.ret(VariantType.INSTANCE) .args(VariantType.INSTANCE); @@ -657,8 +663,10 @@ void testDynamicComputeVariantArgsWithComputeSignature() { Assertions.assertTrue(signature.returnType instanceof VariantType); Assertions.assertEquals(200, ((VariantType) signature.returnType).getVariantMaxSubcolumnsCount()); + Assertions.assertEquals(100, ((VariantType) signature.returnType).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(200, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); + Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); } /** diff --git a/fe/fe-core/src/test/java/org/apache/doris/persist/ScalarTypeTest.java b/fe/fe-core/src/test/java/org/apache/doris/persist/ScalarTypeTest.java index b1f2039e356348..3fac71bfc33d2d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/persist/ScalarTypeTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/persist/ScalarTypeTest.java @@ -36,5 +36,6 @@ public void testScalarType() { Assert.assertEquals(scalarType.getPrimitiveType(), scalarType2.getPrimitiveType()); Assert.assertEquals(scalarType.getVariantMaxSubcolumnsCount(), 0); Assert.assertEquals(scalarType.getVariantEnableTypedPathsToSparse(), false); + Assert.assertEquals(scalarType.getVariantMaxSparseColumnStatisticsSize(), 0); } } diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index 165ab898a22bb9..1d37105bf67ccf 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -354,6 +354,8 @@ message ColumnPB { optional bool variant_enable_typed_paths_to_sparse = 27 [default = false]; // this field is only used during flexible partial update load optional bool is_on_update_current_timestamp = 28 [default = false]; + // variant_max_sparse_column_statistics_size + optional int32 variant_max_sparse_column_statistics_size = 29 [default = 10000]; } // Dictionary of Schema info, to reduce TabletSchemaCloudPB fdb kv size diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift index 84a2a70a666e14..d7a9fafc431805 100644 --- a/gensrc/thrift/Descriptors.thrift +++ b/gensrc/thrift/Descriptors.thrift @@ -51,6 +51,7 @@ struct TColumn { 21: optional TPatternType pattern_type 22: optional bool variant_enable_typed_paths_to_sparse = false 23: optional bool is_on_update_current_timestamp = false + 24: optional i32 variant_max_sparse_column_statistics_size = -1 } struct TSlotDescriptor { diff --git a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy index 5b32eb67a02c7e..48f4279162ba5c 100644 --- a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy @@ -45,17 +45,17 @@ suite("test_compaction_variant_with_sparse_limit", "nonConcurrent") { } } - set_be_config("variant_max_sparse_column_statistics_size", "2") - int max_subcolumns_count = Math.floor(Math.random() * 5) + int max_subcolumns_count = Math.floor(Math.random() * 5) + int max_sparse_column_statistics_size = 2 if (max_subcolumns_count == 1) { max_subcolumns_count = 0 } def create_table = { tableName, buckets="auto", key_type="DUPLICATE" -> sql "DROP TABLE IF EXISTS ${tableName}" - def var_def = "variant " + def var_def = "variant " if (key_type == "AGGREGATE") { - var_def = "variant replace" + var_def = "variant replace" } sql """ CREATE TABLE IF NOT EXISTS ${tableName} ( @@ -134,7 +134,5 @@ suite("test_compaction_variant_with_sparse_limit", "nonConcurrent") { } } finally { - // set back to default - set_be_config("variant_max_sparse_column_statistics_size", "10000") } } diff --git a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy index 0ab363d5671c72..485774a698f7c9 100644 --- a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy @@ -45,12 +45,12 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { } } - set_be_config("variant_max_sparse_column_statistics_size", "2") + int max_sparse_column_statistics_size = 2 def create_table = { tableName, buckets="auto", key_type="DUPLICATE" -> sql "DROP TABLE IF EXISTS ${tableName}" - def var_def = "variant <'sala' : int, 'ddd' : double, 'z' : double>" + def var_def = "variant " if (key_type == "AGGREGATE") { - var_def = "variant <'sala' : int, 'ddd' : double, 'z' : double> replace" + var_def = "variant 'sala' : int, 'ddd' : double, 'z' : double> replace" } sql """ CREATE TABLE IF NOT EXISTS ${tableName} ( From 4eb037fcb17ae74eea3db371d8f09679f088f1bf Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Thu, 21 Aug 2025 18:52:18 +0800 Subject: [PATCH 02/16] update variantType --- .../main/java/org/apache/doris/catalog/VariantType.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java index 827b5403a5b112..ce809a1efdf8a2 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java @@ -199,4 +199,12 @@ public Map getProperties() { public void setEnableTypedPathsToSparse(boolean enableTypedPathsToSparse) { this.enableTypedPathsToSparse = enableTypedPathsToSparse; } + + public int getVariantMaxSparseColumnStatisticsSize() { + return variantMaxSparseColumnStatisticsSize; + } + + public void setVariantMaxSparseColumnStatisticsSize(int variantMaxSparseColumnStatisticsSize) { + this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; + } } From b239209e6c6a40225bcb7d9111a33f5814eb1b52 Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Mon, 25 Aug 2025 14:32:33 +0800 Subject: [PATCH 03/16] fix comment --- .../doris/common/util/PropertyAnalyzer.java | 10 ++---- .../functions/scalar/GetVariantType.java | 2 +- .../doris/nereids/types/VariantType.java | 6 ++-- .../functions/ComputeSignatureHelperTest.java | 34 +++++++++---------- ...ariant_compaction_with_sparse_limit.groovy | 6 ---- ...ariant_compaction_with_sparse_limit.groovy | 13 +++---- 6 files changed, 27 insertions(+), 44 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java index a274d8b77d3969..5599f0c2fa74dc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java @@ -374,12 +374,6 @@ public static DataProperty analyzeDataProperty(Map properties, f newStoragePolicy = value; } else if (key.equalsIgnoreCase(PROPERTIES_IS_BEING_SYNCED)) { isBeingSynced = Boolean.parseBoolean(value); - } else if (key.equalsIgnoreCase(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE)) { - int variantMaxSparseColumnStatisticsSize = Integer.parseInt(value); - if (variantMaxSparseColumnStatisticsSize < 0) { - throw new AnalysisException("variant_max_sparse_column_statistics_size should be >= 0"); - } - properties.put(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE, value); } } // end for properties @@ -1886,8 +1880,8 @@ public static int analyzeVariantMaxSparseColumnStatisticsSize(Map 10000) { - throw new AnalysisException("variant_max_sparse_column_statistics_size must between 0 and 10000 "); + if (maxSparseColumnStatisticsSize < 0 || maxSparseColumnStatisticsSize > 50000) { + throw new AnalysisException("variant_max_sparse_column_statistics_size must between 0 and 50000 "); } } catch (Exception e) { throw new AnalysisException("variant_max_sparse_column_statistics_size format error"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/GetVariantType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/GetVariantType.java index 381ee47dfa8db8..b3e4c4e6f41ea3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/GetVariantType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/GetVariantType.java @@ -38,7 +38,7 @@ public class GetVariantType extends ScalarFunction implements BinaryExpression, ExplicitlyCastableSignature, AlwaysNullable { public static final List SIGNATURES = ImmutableList.of( - FunctionSignature.ret(StringType.INSTANCE).args(new VariantType(0, 0)) + FunctionSignature.ret(StringType.INSTANCE).args(new VariantType(0)) ); /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java index 974becb9ea3622..13133887df07ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java @@ -38,7 +38,7 @@ */ public class VariantType extends PrimitiveType { - public static final VariantType INSTANCE = new VariantType(0, 0); + public static final VariantType INSTANCE = new VariantType(0); public static final int WIDTH = 24; @@ -51,11 +51,11 @@ public class VariantType extends PrimitiveType { private final List predefinedFields; // No predefined fields - public VariantType(int variantMaxSubcolumnsCount, int variantMaxSparseColumnStatisticsSize) { + public VariantType(int variantMaxSubcolumnsCount) { this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.predefinedFields = Lists.newArrayList(); this.enableTypedPathsToSparse = false; - this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; + this.variantMaxSparseColumnStatisticsSize = 0; } /** diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelperTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelperTest.java index 01f6bfbdcbc962..2dc2447c97d79e 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelperTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/ComputeSignatureHelperTest.java @@ -532,7 +532,7 @@ void testNoDynamicComputeVariantArgs() { @Test void testDynamicComputeVariantArgsSingleVariant() { - VariantType variantType = new VariantType(100, 100); + VariantType variantType = new VariantType(100); FunctionSignature signature = FunctionSignature.ret(VariantType.INSTANCE) .args(VariantType.INSTANCE, IntegerType.INSTANCE); @@ -544,19 +544,19 @@ void testDynamicComputeVariantArgsSingleVariant() { Assertions.assertTrue(signature.returnType instanceof VariantType); Assertions.assertEquals(100, ((VariantType) signature.returnType).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.returnType).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.returnType).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(1) instanceof IntegerType); } @Test void testDynamicComputeVariantArgsMultipleVariants() { - VariantType variantType1 = new VariantType(150, 100); - VariantType variantType2 = new VariantType(250, 100); + VariantType variantType1 = new VariantType(150); + VariantType variantType2 = new VariantType(250); FunctionSignature signature = FunctionSignature.ret(IntegerType.INSTANCE) .args(VariantType.INSTANCE, VariantType.INSTANCE); @@ -568,16 +568,16 @@ void testDynamicComputeVariantArgsMultipleVariants() { Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(150, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(1) instanceof VariantType); Assertions.assertEquals(250, ((VariantType) signature.getArgType(1)).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.getArgType(1)).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.getArgType(1)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.returnType instanceof IntegerType); } @Test void testDynamicComputeVariantArgsMixedTypesWithSingleVariant() { - VariantType variantType = new VariantType(75, 100); + VariantType variantType = new VariantType(75); FunctionSignature signature = FunctionSignature.ret(BooleanType.INSTANCE) .args(VariantType.INSTANCE, IntegerType.INSTANCE, DoubleType.INSTANCE); @@ -590,7 +590,7 @@ void testDynamicComputeVariantArgsMixedTypesWithSingleVariant() { Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(75, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(1) instanceof IntegerType); Assertions.assertTrue(signature.getArgType(2) instanceof DoubleType); @@ -610,13 +610,13 @@ void testDynamicComputeVariantArgsWithNullLiteral() { Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(0, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(1) instanceof IntegerType); } @Test void testDynamicComputeVariantArgsNoVariantReturnType() { - VariantType variantType = new VariantType(300, 100); + VariantType variantType = new VariantType(300); FunctionSignature signature = FunctionSignature.ret(IntegerType.INSTANCE) .args(VariantType.INSTANCE); @@ -629,13 +629,13 @@ void testDynamicComputeVariantArgsNoVariantReturnType() { Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(300, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); } @Test void testDynamicComputeVariantArgsWithVarArgsThrowsException() { - VariantType variantType1 = new VariantType(150, 100); - VariantType variantType2 = new VariantType(250, 100); + VariantType variantType1 = new VariantType(150); + VariantType variantType2 = new VariantType(250); FunctionSignature signature = FunctionSignature.ret(VariantType.INSTANCE) .args(VariantType.INSTANCE, VariantType.INSTANCE); @@ -652,7 +652,7 @@ void testDynamicComputeVariantArgsWithVarArgsThrowsException() { @Test void testDynamicComputeVariantArgsWithComputeSignature() { - VariantType variantType = new VariantType(200, 100); + VariantType variantType = new VariantType(200); FunctionSignature signature = FunctionSignature.ret(VariantType.INSTANCE) .args(VariantType.INSTANCE); @@ -663,10 +663,10 @@ void testDynamicComputeVariantArgsWithComputeSignature() { Assertions.assertTrue(signature.returnType instanceof VariantType); Assertions.assertEquals(200, ((VariantType) signature.returnType).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.returnType).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.returnType).getVariantMaxSparseColumnStatisticsSize()); Assertions.assertTrue(signature.getArgType(0) instanceof VariantType); Assertions.assertEquals(200, ((VariantType) signature.getArgType(0)).getVariantMaxSubcolumnsCount()); - Assertions.assertEquals(100, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); + Assertions.assertEquals(0, ((VariantType) signature.getArgType(0)).getVariantMaxSparseColumnStatisticsSize()); } /** diff --git a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy index 48f4279162ba5c..d1a4add7ba9e07 100644 --- a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy @@ -23,12 +23,6 @@ suite("test_compaction_variant_with_sparse_limit", "nonConcurrent") { def backendId_to_backendHttpPort = [:] getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort); - def set_be_config = { key, value -> - for (String backend_id: backendId_to_backendIP.keySet()) { - def (code, out, err) = update_be_config(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), key, value) - logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) - } - } try { String backend_id = backendId_to_backendIP.keySet()[0] def (code, out, err) = show_be_config(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id)) diff --git a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy index 485774a698f7c9..a6eed431cd6ec7 100644 --- a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy @@ -23,12 +23,6 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { def backendId_to_backendHttpPort = [:] getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort); - def set_be_config = { key, value -> - for (String backend_id: backendId_to_backendIP.keySet()) { - def (code, out, err) = update_be_config(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), key, value) - logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) - } - } try { String backend_id = backendId_to_backendIP.keySet()[0] def (code, out, err) = show_be_config(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id)) @@ -52,6 +46,10 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { if (key_type == "AGGREGATE") { var_def = "variant 'sala' : int, 'ddd' : double, 'z' : double> replace" } + def create_tbl_res = sql """ show create table ${tableName} """ + logger.info("${create_tbl_res}") + assertTrue(create_tbl_res.toString().contains("variant_max_sparse_column_statistics_size")) + sql """ CREATE TABLE IF NOT EXISTS ${tableName} ( k bigint, @@ -131,8 +129,5 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { sql "set topn_opt_limit_threshold = 10" order_qt_select "select * from ${tableName} order by k, cast(v as string) limit 5;" } - } finally { - // set back to default - set_be_config("variant_max_sparse_column_statistics_size", "10000") } } From 96ba2ec80545d90472fcccce3633bf8a3386e610 Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Mon, 25 Aug 2025 16:39:31 +0800 Subject: [PATCH 04/16] fix be cofig usage --- be/src/olap/rowset/segment_v2/segment.cpp | 3 +-- .../variant/variant_column_reader.cpp | 26 ++++++++++++++----- .../variant/variant_column_reader.h | 4 ++- .../segment_v2/variant_stats_calculator.cpp | 11 +++++--- .../variant_column_writer_reader_test.cpp | 14 +++++----- be/test/vec/common/schema_util_test.cpp | 11 +++----- 6 files changed, 40 insertions(+), 29 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index b7d6a9493829f1..bff825780129db 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -629,8 +629,7 @@ vectorized::DataTypePtr Segment::get_data_type_of(const TabletColumn& column, // 2. OR It's a leaf in the physical column structure AND it doesn't *also* exist // in the sparse column (meaning it's purely a materialized leaf). if (read_flat_leaves || (is_physical_leaf && !exist_in_sparse && - !variant_reader->is_exceeded_sparse_column_limit( - column.variant_max_sparse_column_statistics_size()))) { + !variant_reader->is_exceeded_sparse_column_limit())) { return node->data.file_column_type; } return column.is_nullable() diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp index 5eec534a702c76..699db7e7324de9 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp @@ -74,10 +74,10 @@ bool VariantColumnReader::exist_in_sparse_column( return existed_in_sparse_column || prefix_existed_in_sparse_column; } -bool VariantColumnReader::is_exceeded_sparse_column_limit( - size_t max_sparse_column_statistics_size) const { +bool VariantColumnReader::is_exceeded_sparse_column_limit() const { return !_statistics->sparse_column_non_null_size.empty() && - _statistics->sparse_column_non_null_size.size() >= max_sparse_column_statistics_size; + _statistics->sparse_column_non_null_size.size() >= + _variant_sparse_column_statistics_size; } int64_t VariantColumnReader::get_metadata_size() const { @@ -276,10 +276,10 @@ Status VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator, // Otherwise the prefix is not exist and the sparse column size is reached limit // which means the path maybe exist in sparse_column - bool exceeded_sparse_column_limit = - !_statistics->sparse_column_non_null_size.empty() && - _statistics->sparse_column_non_null_size.size() >= - target_col->variant_max_sparse_column_statistics_size(); + + bool exceeded_sparse_column_limit = !_statistics->sparse_column_non_null_size.empty() && + _statistics->sparse_column_non_null_size.size() >= + _variant_sparse_column_statistics_size; // If the variant column has extracted columns and is a compaction reader, then read flat leaves // Otherwise read hierarchical data, since the variant subcolumns are flattened in schema_util::VariantCompactionUtil::get_extended_compaction_schema @@ -379,6 +379,18 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, const SegmentF vectorized::PathInData path; path.from_protobuf(column_pb.column_path_info()); + // record variant_sparse_column_statistics_size from parent column + if (column_pb.column_path_info().parrent_column_unique_id() != -1) { + _variant_sparse_column_statistics_size = + opts.tablet_schema + ->column_by_uid(column_pb.column_path_info().parrent_column_unique_id()) + .variant_max_sparse_column_statistics_size(); + } else { + _variant_sparse_column_statistics_size = + opts.tablet_schema->column_by_uid(column_pb.unique_id()) + .variant_max_sparse_column_statistics_size(); + } + // init sparse column if (path.copy_pop_front().get_path() == SPARSE_COLUMN_PATH) { DCHECK(column_pb.has_variant_statistics()) << column_pb.DebugString(); diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h index ee7abba7c4b707..e7d820dc42f283 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h @@ -72,7 +72,7 @@ class VariantColumnReader : public ColumnReader { bool exist_in_sparse_column(const vectorized::PathInData& path) const; - bool is_exceeded_sparse_column_limit(size_t max_sparse_column_statistics_size) const; + bool is_exceeded_sparse_column_limit() const; const SubcolumnColumnReaders* get_subcolumn_readers() const { return _subcolumn_readers.get(); } @@ -105,6 +105,8 @@ class VariantColumnReader : public ColumnReader { std::unique_ptr _statistics; // key: subcolumn path, value: subcolumn indexes std::unordered_map _variant_subcolumns_indexes; + // variant_sparse_column_statistics_size + size_t _variant_sparse_column_statistics_size = 0; }; class VariantRootColumnIterator : public ColumnIterator { diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp index 7567d0ff588d1c..baa57a08c0444c 100644 --- a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp +++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp @@ -65,10 +65,13 @@ Status VariantStatsCaculator::calculate_variant_stats(const vectorized::Block* b // Check if this is a sparse column or sub column if (column_path.ends_with("__DORIS_VARIANT_SPARSE__")) { // This is a sparse column from variant column - _calculate_sparse_column_stats( - *column, column_meta, - tablet_column.variant_max_sparse_column_statistics_size(), row_pos, - num_rows); + // get variant_max_sparse_column_statistics_size from tablet_schema + size_t variant_max_sparse_column_statistics_size = + _tablet_schema->column_by_uid(tablet_column.parent_unique_id()) + .variant_max_sparse_column_statistics_size(); + _calculate_sparse_column_stats(*column, column_meta, + variant_max_sparse_column_statistics_size, row_pos, + num_rows); } else { // This is a sub column from variant column _calculate_sub_column_stats(*column, column_meta, row_pos, num_rows); diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp index d4f3bb84dd73c3..910f166d633a29 100644 --- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp @@ -44,6 +44,7 @@ static void construct_column(ColumnPB* column_pb, int32_t col_unique_id, column_pb->set_is_nullable(is_nullable); if (column_type == "VARIANT") { column_pb->set_variant_max_subcolumns_count(variant_max_subcolumns_count); + column_pb->set_variant_max_sparse_column_statistics_size(10000); } } @@ -450,8 +451,6 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { subcolumn.set_parent_unique_id(parent_column.unique_id()); subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); subcolumn.set_is_nullable(true); - // default 10000 - size_t max_sparse_column_statistics_size = 10000; ColumnIteratorUPtr it1; st = variant_column_reader->new_iterator(&it1, &subcolumn, &storage_read_opts); EXPECT_TRUE(st.ok()) << st.msg(); @@ -460,17 +459,16 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { // 13. check statistics size == limit auto& variant_stats = variant_column_reader->_statistics; EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() < - max_sparse_column_statistics_size); - auto limit = - max_sparse_column_statistics_size - variant_stats->sparse_column_non_null_size.size(); + variant_column_reader->_variant_sparse_column_statistics_size); + auto limit = variant_column_reader->_variant_sparse_column_statistics_size - + variant_stats->sparse_column_non_null_size.size(); for (int i = 0; i < limit; ++i) { std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); variant_stats->sparse_column_non_null_size[key] = 10000; } EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() == - max_sparse_column_statistics_size); - EXPECT_TRUE(variant_column_reader->is_exceeded_sparse_column_limit( - max_sparse_column_statistics_size)); + variant_column_reader->_variant_sparse_column_statistics_size); + EXPECT_TRUE(variant_column_reader->is_exceeded_sparse_column_limit()); ColumnIteratorUPtr it2; st = variant_column_reader->new_iterator(&it2, &subcolumn, &storage_read_opts); diff --git a/be/test/vec/common/schema_util_test.cpp b/be/test/vec/common/schema_util_test.cpp index fe15adb1190cc8..ae85f819fabebb 100644 --- a/be/test/vec/common/schema_util_test.cpp +++ b/be/test/vec/common/schema_util_test.cpp @@ -1457,7 +1457,7 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns) { variant.set_unique_id(30); variant.set_variant_max_subcolumns_count(3); variant.set_aggregation_method(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE); - + variant.set_variant_max_sparse_column_statistics_size(10000); TabletSchemaSPtr schema = std::make_shared(); schema->append_column(variant); @@ -1514,9 +1514,7 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns) { output_schema = std::make_shared(); sparse_paths.clear(); - // default 10000 - size_t max_sparse_column_statistics_size = 10000; - for (int i = 0; i < max_sparse_column_statistics_size + 1; ++i) { + for (int i = 0; i < variant.variant_max_sparse_column_statistics_size() + 1; ++i) { sparse_paths.insert("dummy" + std::to_string(i)); } schema_util::VariantCompactionUtil::get_compaction_subcolumns( @@ -1533,6 +1531,7 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns_advanced) { variant.set_variant_max_subcolumns_count(3); variant.set_aggregation_method(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE); variant.set_variant_enable_typed_paths_to_sparse(true); + variant.set_variant_max_sparse_column_statistics_size(10000); TabletColumn subcolumn; subcolumn.set_name("c"); subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_DATEV2); @@ -1608,9 +1607,7 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns_advanced) { output_schema = std::make_shared(); sparse_paths.clear(); - // default 10000 - size_t max_sparse_column_statistics_size = 10000; - for (int i = 0; i < max_sparse_column_statistics_size + 1; ++i) { + for (int i = 0; i < variant.variant_max_sparse_column_statistics_size() + 1; ++i) { sparse_paths.insert("dummy" + std::to_string(i)); } schema_util::VariantCompactionUtil::get_compaction_subcolumns( From 81fbbdbbf4f2b7b1ded81df618e9edfdc9f113d5 Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Tue, 26 Aug 2025 09:45:07 +0800 Subject: [PATCH 05/16] fix default value --- .../variant/variant_column_reader.cpp | 24 +++++++++---------- be/src/olap/tablet_schema.h | 2 +- gensrc/thrift/Descriptors.thrift | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp index 699db7e7324de9..18d1efc89b1f01 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp @@ -354,6 +354,18 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, const SegmentF _statistics = std::make_unique(); const ColumnMetaPB& self_column_pb = footer.columns(column_id); const auto& parent_index = opts.tablet_schema->inverted_indexs(self_column_pb.unique_id()); + // record variant_sparse_column_statistics_size from parent column + if (self_column_pb.column_path_info().parrent_column_unique_id() != -1) { + _variant_sparse_column_statistics_size = + opts.tablet_schema + ->column_by_uid( + self_column_pb.column_path_info().parrent_column_unique_id()) + .variant_max_sparse_column_statistics_size(); + } else { + _variant_sparse_column_statistics_size = + opts.tablet_schema->column_by_uid(self_column_pb.unique_id()) + .variant_max_sparse_column_statistics_size(); + } for (const ColumnMetaPB& column_pb : footer.columns()) { // Find all columns belonging to the current variant column // 1. not the variant column @@ -379,18 +391,6 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, const SegmentF vectorized::PathInData path; path.from_protobuf(column_pb.column_path_info()); - // record variant_sparse_column_statistics_size from parent column - if (column_pb.column_path_info().parrent_column_unique_id() != -1) { - _variant_sparse_column_statistics_size = - opts.tablet_schema - ->column_by_uid(column_pb.column_path_info().parrent_column_unique_id()) - .variant_max_sparse_column_statistics_size(); - } else { - _variant_sparse_column_statistics_size = - opts.tablet_schema->column_by_uid(column_pb.unique_id()) - .variant_max_sparse_column_statistics_size(); - } - // init sparse column if (path.copy_pop_front().get_path() == SPARSE_COLUMN_PATH) { DCHECK(column_pb.has_variant_statistics()) << column_pb.DebugString(); diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 3e1c1e3f3eb2b3..f6eebf483318e7 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -285,7 +285,7 @@ class TabletColumn : public MetadataAdder { PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB; bool _variant_enable_typed_paths_to_sparse = false; // set variant_max_sparse_column_statistics_size - int32_t _variant_max_sparse_column_statistics_size = 0; + int32_t _variant_max_sparse_column_statistics_size = 10000; }; bool operator==(const TabletColumn& a, const TabletColumn& b); diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift index d7a9fafc431805..af15d190efe255 100644 --- a/gensrc/thrift/Descriptors.thrift +++ b/gensrc/thrift/Descriptors.thrift @@ -51,7 +51,7 @@ struct TColumn { 21: optional TPatternType pattern_type 22: optional bool variant_enable_typed_paths_to_sparse = false 23: optional bool is_on_update_current_timestamp = false - 24: optional i32 variant_max_sparse_column_statistics_size = -1 + 24: optional i32 variant_max_sparse_column_statistics_size = 10000 } struct TSlotDescriptor { From bbfdb345fbb32190f13ac47af604cf75a4080a1f Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Tue, 26 Aug 2025 20:36:30 +0800 Subject: [PATCH 06/16] add debug point --- .../variant/variant_column_reader.cpp | 36 ++++++++-------- ...ariant_compaction_with_sparse_limit.groovy | 42 +++++++++++++++++++ ...ariant_compaction_with_sparse_limit.groovy | 4 ++ 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp index 18d1efc89b1f01..1ef028ead9be8f 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp @@ -75,9 +75,19 @@ bool VariantColumnReader::exist_in_sparse_column( } bool VariantColumnReader::is_exceeded_sparse_column_limit() const { - return !_statistics->sparse_column_non_null_size.empty() && - _statistics->sparse_column_non_null_size.size() >= - _variant_sparse_column_statistics_size; + bool exceeded_sparse_column_limit = !_statistics->sparse_column_non_null_size.empty() && + _statistics->sparse_column_non_null_size.size() >= + _variant_sparse_column_statistics_size; + DBUG_EXECUTE_IF("exceeded_sparse_column_limit_must_be_false", { + if (exceeded_sparse_column_limit) { + return Status::Error( + "exceeded_sparse_column_limit_must_be_false, sparse_column_non_null_size: {} : " + " _variant_sparse_column_statistics_size: {}", + _statistics->sparse_column_non_null_size.size(), + _variant_sparse_column_statistics_size); + } + }) + return exceeded_sparse_column_limit; } int64_t VariantColumnReader::get_metadata_size() const { @@ -276,10 +286,7 @@ Status VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator, // Otherwise the prefix is not exist and the sparse column size is reached limit // which means the path maybe exist in sparse_column - - bool exceeded_sparse_column_limit = !_statistics->sparse_column_non_null_size.empty() && - _statistics->sparse_column_non_null_size.size() >= - _variant_sparse_column_statistics_size; + bool exceeded_sparse_column_limit = is_exceeded_sparse_column_limit(); // If the variant column has extracted columns and is a compaction reader, then read flat leaves // Otherwise read hierarchical data, since the variant subcolumns are flattened in schema_util::VariantCompactionUtil::get_extended_compaction_schema @@ -355,17 +362,10 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, const SegmentF const ColumnMetaPB& self_column_pb = footer.columns(column_id); const auto& parent_index = opts.tablet_schema->inverted_indexs(self_column_pb.unique_id()); // record variant_sparse_column_statistics_size from parent column - if (self_column_pb.column_path_info().parrent_column_unique_id() != -1) { - _variant_sparse_column_statistics_size = - opts.tablet_schema - ->column_by_uid( - self_column_pb.column_path_info().parrent_column_unique_id()) - .variant_max_sparse_column_statistics_size(); - } else { - _variant_sparse_column_statistics_size = - opts.tablet_schema->column_by_uid(self_column_pb.unique_id()) - .variant_max_sparse_column_statistics_size(); - } + _variant_sparse_column_statistics_size = + opts.tablet_schema->column_by_uid(self_column_pb.unique_id()) + .variant_max_sparse_column_statistics_size(); + for (const ColumnMetaPB& column_pb : footer.columns()) { // Find all columns belonging to the current variant column // 1. not the variant column diff --git a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy index d1a4add7ba9e07..c8aa11a14471e9 100644 --- a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy @@ -70,6 +70,14 @@ suite("test_compaction_variant_with_sparse_limit", "nonConcurrent") { } finally { GetDebugPoint().disableDebugPointForAllBEs("exist_in_sparse_column_must_be_false") } + } else if (max_subcolumns_count > 1) { + // here will aways false + try { + GetDebugPoint().enableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + sql """ select v['mmm'] from ${tableName} where k = 30""" + } finally { + GetDebugPoint().disableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + } } } def key_types = ["DUPLICATE", "UNIQUE", "AGGREGATE"] @@ -127,6 +135,40 @@ suite("test_compaction_variant_with_sparse_limit", "nonConcurrent") { qt_sql_55 "select cast(v['b'] as string), cast(v['b']['c'] as string) from ${tableName} where cast(v['b'] as string) != 'null' and cast(v['b'] as string) != '{}' order by k desc limit 10;" } + } catch (e) { + logger.info("catch exception: ${e}") + } finally { + sql "DROP TABLE IF EXISTS simple_variant_DUPLICATE" + sql "DROP TABLE IF EXISTS simple_variant_UNIQUE" + sql "DROP TABLE IF EXISTS simple_variant_AGGREGATE" + } + + // test variant_max_sparse_column_statistics_size debug error case + sql "DROP TABLE IF EXISTS tn_simple_variant_DUPLICATE" + sql """ + CREATE TABLE IF NOT EXISTS tn_simple_variant_DUPLICATE ( + k bigint, + v variant + ) + DUPLICATE KEY(`k`) + DISTRIBUTED BY HASH(k) BUCKETS 1 + properties("replication_num" = "1", "disable_auto_compaction" = "true"); + """ + sql """insert into tn_simple_variant_DUPLICATE values (1, '{"a" : 1, "b" : 2}');""" + sql """insert into tn_simple_variant_DUPLICATE values (2, '{"d" : "ddd", "s" : "fff", "m": 111}');""" + // here will aways false + try { + GetDebugPoint().enableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + sql """ select v['a'] from tn_simple_variant_DUPLICATE where k = 1""" + } finally { + GetDebugPoint().disableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + } + try { + GetDebugPoint().enableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + sql """ select v['m'] from tn_simple_variant_DUPLICATE where k = 2""" + } catch (e) { + logger.info("catch exception: ${e}") } finally { + GetDebugPoint().disableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") } } diff --git a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy index a6eed431cd6ec7..e29bbaa4fca334 100644 --- a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy @@ -129,5 +129,9 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { sql "set topn_opt_limit_threshold = 10" order_qt_select "select * from ${tableName} order by k, cast(v as string) limit 5;" } + } finally { + sql "DROP TABLE IF EXISTS simple_variant_DUPLICATE" + sql "DROP TABLE IF EXISTS simple_variant_UNIQUE" + sql "DROP TABLE IF EXISTS simple_variant_AGGREGATE" } } From 3f729d54cae08f8e319b741f73c51cac3a0d50b2 Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Wed, 27 Aug 2025 17:18:08 +0800 Subject: [PATCH 07/16] fix tablet_schema --- be/src/vec/common/schema_util.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index f8c7fdb1f13727..2ba4c09d8c8ccd 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -807,7 +807,7 @@ Status VariantCompactionUtil::check_path_stats(const std::vector - tablet->tablet_schema()->column(uid).variant_max_sparse_column_statistics_size()) { + tablet->tablet_schema()->column_by_uid(uid).variant_max_sparse_column_statistics_size()) { // When there is only one segment, we can ensure that the size of each path in output stats is accurate if (output->num_segments() == 1) { for (const auto& [path, size] : stats) { From e49ee895ec36e542a809bb5c51f6b06493ca385a Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Wed, 27 Aug 2025 17:21:10 +0800 Subject: [PATCH 08/16] fix format --- be/src/vec/common/schema_util.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 2ba4c09d8c8ccd..406bec7a351f5d 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -806,8 +806,9 @@ Status VariantCompactionUtil::check_path_stats(const std::vector - tablet->tablet_schema()->column_by_uid(uid).variant_max_sparse_column_statistics_size()) { + if (stats.size() > tablet->tablet_schema() + ->column_by_uid(uid) + .variant_max_sparse_column_statistics_size()) { // When there is only one segment, we can ensure that the size of each path in output stats is accurate if (output->num_segments() == 1) { for (const auto& [path, size] : stats) { From 3f8cd97d95efdc01b43722e439a1a666e3f21e74 Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Thu, 28 Aug 2025 17:20:36 +0800 Subject: [PATCH 09/16] fix ut case --- .../variant_stats_calculator_test.cpp | 41 ++++++++++++++----- .../org/apache/doris/catalog/VariantType.java | 2 +- .../conf/regression-conf-custom.groovy | 1 + .../pipeline/p0/conf/regression-conf.groovy | 1 + ...ariant_compaction_with_sparse_limit.groovy | 6 +-- 5 files changed, 36 insertions(+), 15 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp index 6591c799945c0e..31f694536ad68f 100644 --- a/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp @@ -71,8 +71,10 @@ class VariantStatsCalculatorTest : public ::testing::Test { } // Helper method to create a footer column with path info - void add_footer_column_with_path(int32_t parent_unique_id, const std::string& path) { + void add_footer_column_with_path(int32_t parent_unique_id, const std::string& path, + uint32_t column_id = 0) { auto* column_meta = _footer->add_columns(); + column_meta->set_column_id(column_id); column_meta->set_unique_id(100 + _footer->columns_size()); auto* path_info = column_meta->mutable_column_path_info(); @@ -202,19 +204,26 @@ TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSubColumn) { TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSparseColumn) { // Setup footer with sparse column - add_footer_column_with_path(1, "sparse_col.__DORIS_VARIANT_SPARSE__"); + add_footer_column_with_path(-1, "sparse_col"); + add_footer_column_with_path(1, "sparse_col.__DORIS_VARIANT_SPARSE__", 1); // Create variant sparse column + TabletColumn parent_column = create_variant_column(1, "variant_col", -1, "sparse_col"); TabletColumn sparse_column = create_variant_column(2, "variant_col.__DORIS_VARIANT_SPARSE__", 1, "sparse_col.__DORIS_VARIANT_SPARSE__"); + _tablet_schema->append_column(parent_column); _tablet_schema->append_column(sparse_column); - std::vector column_ids = {0}; + std::vector column_ids = {0, 1}; VariantStatsCaculator calculator(_footer.get(), _tablet_schema, column_ids); // Create block with map column (sparse column) vectorized::Block block; auto map_column = create_map_column(); + auto string_column = vectorized::ColumnString::create(); + // add parant column to block + block.insert({std::move(string_column), std::make_shared(), + "variant_column"}); block.insert({std::move(map_column), std::make_shared( std::make_shared(), @@ -225,7 +234,7 @@ TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSparseColumn) { EXPECT_TRUE(status.ok()); // Check that variant statistics were updated - auto& column_meta = _footer->columns(0); + auto& column_meta = _footer->columns(1); EXPECT_TRUE(column_meta.has_variant_statistics()); } @@ -275,10 +284,15 @@ TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMissingPathInFooter) } TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMultipleColumns) { + // parent column + add_footer_column_with_path(-1, "variant"); + TabletColumn parent_column = create_variant_column(1, "variant", -1, "variant"); + _tablet_schema->append_column(parent_column); + // Setup footer with multiple columns - add_footer_column_with_path(1, "sub1"); - add_footer_column_with_path(1, "sub2.__DORIS_VARIANT_SPARSE__"); - add_footer_column_with_path(2, "another_sub"); + add_footer_column_with_path(1, "sub1", 1); + add_footer_column_with_path(1, "sub2.__DORIS_VARIANT_SPARSE__", 2); + add_footer_column_with_path(2, "another_sub", 3); // Create multiple variant columns TabletColumn sub1 = create_variant_column(2, "variant.sub1", 1, "sub1"); @@ -290,12 +304,17 @@ TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMultipleColumns) { _tablet_schema->append_column(sparse); _tablet_schema->append_column(sub2); - std::vector column_ids = {0, 1, 2}; + std::vector column_ids = {0, 1, 2, 3}; VariantStatsCaculator calculator(_footer.get(), _tablet_schema, column_ids); // Create block with multiple columns vectorized::Block block; + // parent column + auto string_column = vectorized::ColumnString::create(); + string_column->insert_data("test", 4); + block.insert({std::move(string_column), std::make_shared(), + "variant_column"}); auto nullable_col1 = create_nullable_column({false, true, false}, {"a", "", "c"}); block.insert({std::move(nullable_col1), std::make_shared( @@ -320,9 +339,9 @@ TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMultipleColumns) { EXPECT_TRUE(status.ok()); // Check that statistics were updated for sub columns - EXPECT_EQ(_footer->columns(0).none_null_size(), 2); // sub1: 2 non-null - EXPECT_TRUE(_footer->columns(1).has_variant_statistics()); // sparse column - EXPECT_EQ(_footer->columns(2).none_null_size(), 1); // another_sub: 2 non-null + EXPECT_EQ(_footer->columns(1).none_null_size(), 2); // sub1: 2 non-null + EXPECT_TRUE(_footer->columns(2).has_variant_statistics()); // sparse column + EXPECT_EQ(_footer->columns(3).none_null_size(), 1); // another_sub: 2 non-null } TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithEmptyBlock) { diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java index ce809a1efdf8a2..28b1177e398715 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java @@ -129,7 +129,7 @@ public String toSql(int depth) { sb.append("\"variant_enable_typed_paths_to_sparse\" = \"") .append(String.valueOf(enableTypedPathsToSparse)).append("\""); } - if (variantMaxSparseColumnStatisticsSize != 0) { + if (variantMaxSparseColumnStatisticsSize != 10000) { sb.append(","); sb.append("\"variant_max_sparse_column_statistics_size\" = \"") .append(String.valueOf(variantMaxSparseColumnStatisticsSize)).append("\""); diff --git a/regression-test/pipeline/cloud_p0/conf/regression-conf-custom.groovy b/regression-test/pipeline/cloud_p0/conf/regression-conf-custom.groovy index 9f45ce32bb1fc0..357cdaab711454 100644 --- a/regression-test/pipeline/cloud_p0/conf/regression-conf-custom.groovy +++ b/regression-test/pipeline/cloud_p0/conf/regression-conf-custom.groovy @@ -82,6 +82,7 @@ excludeDirectories = "000_the_start_sentinel_do_not_touch," + // keep this line "ccr_mow_syncer_p0," + "hdfs_vault_p2," + "inject_hdfs_vault_p0," + + "variant_p0/nested," + "plsql_p0," + // plsql is not developped any more, add by sk. "zzz_the_end_sentinel_do_not_touch" // keep this line as the last line diff --git a/regression-test/pipeline/p0/conf/regression-conf.groovy b/regression-test/pipeline/p0/conf/regression-conf.groovy index 6dfc7d269c28b2..29f781b5465e00 100644 --- a/regression-test/pipeline/p0/conf/regression-conf.groovy +++ b/regression-test/pipeline/p0/conf/regression-conf.groovy @@ -80,6 +80,7 @@ excludeDirectories = "000_the_start_sentinel_do_not_touch," + // keep this line "cloud_p0," + "workload_manager_p1," + "plsql_p0," + // plsql is not developped any more, add by sk + "variant_p0/nested," + "zzz_the_end_sentinel_do_not_touch"// keep this line as the last line customConf1 = "test_custom_conf_value" diff --git a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy index e29bbaa4fca334..4a536f4189c721 100644 --- a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy @@ -46,9 +46,6 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { if (key_type == "AGGREGATE") { var_def = "variant 'sala' : int, 'ddd' : double, 'z' : double> replace" } - def create_tbl_res = sql """ show create table ${tableName} """ - logger.info("${create_tbl_res}") - assertTrue(create_tbl_res.toString().contains("variant_max_sparse_column_statistics_size")) sql """ CREATE TABLE IF NOT EXISTS ${tableName} ( @@ -59,6 +56,9 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { DISTRIBUTED BY HASH(k) BUCKETS ${buckets} properties("replication_num" = "1", "disable_auto_compaction" = "true"); """ + def create_tbl_res = sql """ show create table ${tableName} """ + logger.info("${create_tbl_res}") + assertTrue(create_tbl_res.toString().contains("variant_max_sparse_column_statistics_size")) } def key_types = ["DUPLICATE", "UNIQUE", "AGGREGATE"] // def key_types = ["AGGREGATE"] From 692463019620181da5f9e4628d95d4fdd1b1aa7a Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Thu, 28 Aug 2025 21:51:50 +0800 Subject: [PATCH 10/16] fix cases --- .../predefine/test_variant_compaction_with_sparse_limit.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy index 4a536f4189c721..e6449823d7901e 100644 --- a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy @@ -42,7 +42,7 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { int max_sparse_column_statistics_size = 2 def create_table = { tableName, buckets="auto", key_type="DUPLICATE" -> sql "DROP TABLE IF EXISTS ${tableName}" - def var_def = "variant " + def var_def = "variant 'sala' : int, 'ddd' : double, 'z' : double>" if (key_type == "AGGREGATE") { var_def = "variant 'sala' : int, 'ddd' : double, 'z' : double> replace" } From 9324531e3c8d0903c7c253a4d380f55b93856499 Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Mon, 1 Sep 2025 18:23:27 +0800 Subject: [PATCH 11/16] fix test_variant_compaction_with_sparse_limit --- .../test_variant_compaction_with_sparse_limit.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy index e6449823d7901e..d47c486047e042 100644 --- a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy @@ -42,9 +42,9 @@ suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") { int max_sparse_column_statistics_size = 2 def create_table = { tableName, buckets="auto", key_type="DUPLICATE" -> sql "DROP TABLE IF EXISTS ${tableName}" - def var_def = "variant 'sala' : int, 'ddd' : double, 'z' : double>" + def var_def = "variant " if (key_type == "AGGREGATE") { - var_def = "variant 'sala' : int, 'ddd' : double, 'z' : double> replace" + var_def = "variant replace" } sql """ From 6e6d2ce3823e6de11fc9b538b64bf63aff05b1ce Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Tue, 2 Sep 2025 16:13:13 +0800 Subject: [PATCH 12/16] fix comments --- be/src/common/consts.h | 1 + .../variant/variant_column_reader.h | 3 +- be/src/olap/tablet_schema.h | 3 +- .../rowset/segment_v2/mock/mock_segment.h | 3 ++ .../doris/common/PropertyAnalyzerTest.java | 29 +++++++++++++++++++ 5 files changed, 37 insertions(+), 2 deletions(-) diff --git a/be/src/common/consts.h b/be/src/common/consts.h index 1190ded8e46ad2..15b4c84a7b9898 100644 --- a/be/src/common/consts.h +++ b/be/src/common/consts.h @@ -49,5 +49,6 @@ static constexpr int MAX_DECIMALV2_SCALE = 9; static constexpr int MAX_DECIMALV3_PRECISION = MAX_DECIMAL256_PRECISION; static constexpr int MAX_DECIMALV3_SCALE = MAX_DECIMALV3_PRECISION; +static constexpr int DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE = 10000; } // namespace BeConsts } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h index 0e71237172d12a..1ba16881419e35 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h @@ -119,7 +119,8 @@ class VariantColumnReader : public ColumnReader { // key: subcolumn path, value: subcolumn indexes std::unordered_map _variant_subcolumns_indexes; // variant_sparse_column_statistics_size - size_t _variant_sparse_column_statistics_size = 0; + size_t _variant_sparse_column_statistics_size = + BeConsts::DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE; }; class VariantRootColumnIterator : public ColumnIterator { diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index e28021e1e3987f..9ef2aa8557fefc 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -290,7 +290,8 @@ class TabletColumn : public MetadataAdder { PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB; bool _variant_enable_typed_paths_to_sparse = false; // set variant_max_sparse_column_statistics_size - int32_t _variant_max_sparse_column_statistics_size = 10000; + int32_t _variant_max_sparse_column_statistics_size = + BeConsts::DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE; }; bool operator==(const TabletColumn& a, const TabletColumn& b); diff --git a/be/test/olap/rowset/segment_v2/mock/mock_segment.h b/be/test/olap/rowset/segment_v2/mock/mock_segment.h index 9cf443b2df0959..f4421c37f7d6f1 100644 --- a/be/test/olap/rowset/segment_v2/mock/mock_segment.h +++ b/be/test/olap/rowset/segment_v2/mock/mock_segment.h @@ -49,6 +49,9 @@ class MockSegment : public Segment { // Helper methods for test setup void add_column_uid_mapping(int32_t col_uid, int32_t footer_ordinal) { + _tablet_schema->_cols.push_back(std::make_shared()); + _tablet_schema->_cols.back()->set_unique_id(col_uid); + _tablet_schema->_field_uniqueid_to_index[col_uid] = footer_ordinal; _column_uid_to_footer_ordinal[col_uid] = footer_ordinal; } diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java index ab7291eaf161a0..2baaa0a10b78f0 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java @@ -31,6 +31,7 @@ import org.apache.doris.thrift.TInvertedIndexFileStorageFormat; import org.apache.doris.thrift.TStorageFormat; import org.apache.doris.thrift.TStorageMedium; +import org.apache.doris.thrift.TPaloBrokerService.AsyncProcessor.pread; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -334,4 +335,32 @@ public void testAnalyzeInvertedIndexFileStorageFormat() throws AnalysisException e.getMessage()); } } + + @Test + public void testAnalyzeVariantMaxSparseColumnStatisticsSize() throws AnalysisException { + Map properties = Maps.newHashMap(); + properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE, "-1"); + try { + PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0); + Assertions.fail("Expected AnalysisException was not thrown"); + } catch (AnalysisException e) { + Assertions.assertEquals("variant_max_sparse_column_statistics_size must between 0 and 50000 ", e.getMessage()); + } + properties.clear(); + properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE, "50001"); + try { + PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0); + Assertions.fail("Expected AnalysisException was not thrown"); + } catch (AnalysisException e) { + Assertions.assertEquals("variant_max_sparse_column_statistics_size must between 0 and 50000 ", e.getMessage()); + } + properties.clear(); + properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE, "invalid"); + try { + PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0); + Assertions.fail("Expected AnalysisException was not thrown"); + } catch (AnalysisException e) { + Assertions.assertEquals("variant_max_sparse_column_statistics_size format error", e.getMessage()); + } + } } From bd5436756ab06bf75c1c2465952b60ae2cd046f9 Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Tue, 2 Sep 2025 16:31:15 +0800 Subject: [PATCH 13/16] fix feut --- .../test/java/org/apache/doris/common/PropertyAnalyzerTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java index 2baaa0a10b78f0..fabd4136598819 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java @@ -31,7 +31,6 @@ import org.apache.doris.thrift.TInvertedIndexFileStorageFormat; import org.apache.doris.thrift.TStorageFormat; import org.apache.doris.thrift.TStorageMedium; -import org.apache.doris.thrift.TPaloBrokerService.AsyncProcessor.pread; import com.google.common.collect.Lists; import com.google.common.collect.Maps; From b644cb335567362f39a1293731bdc6d89a93b4bb Mon Sep 17 00:00:00 2001 From: wangqiannan Date: Tue, 2 Sep 2025 17:40:50 +0800 Subject: [PATCH 14/16] fix feut --- .../java/org/apache/doris/common/util/PropertyAnalyzer.java | 2 +- .../java/org/apache/doris/common/PropertyAnalyzerTest.java | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java index 9b6fab29657bdf..af3514c2f68d47 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java @@ -1890,7 +1890,7 @@ public static int analyzeVariantMaxSparseColumnStatisticsSize(Map Date: Tue, 2 Sep 2025 17:48:29 +0800 Subject: [PATCH 15/16] fix fe format --- .../java/org/apache/doris/common/util/PropertyAnalyzer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java index af3514c2f68d47..b1d4d2ac897ec7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java @@ -1890,7 +1890,7 @@ public static int analyzeVariantMaxSparseColumnStatisticsSize(Map Date: Wed, 3 Sep 2025 14:33:11 +0800 Subject: [PATCH 16/16] fix cloud schema and casez --- .../variant/variant_column_reader.cpp | 3 ++- be/src/vec/common/schema_util.cpp | 2 +- ...ariant_compaction_with_sparse_limit.groovy | 27 ++++++++++--------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp index 158ab719c03e13..2702e8f18804d3 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp @@ -81,7 +81,8 @@ bool VariantColumnReader::is_exceeded_sparse_column_limit() const { _variant_sparse_column_statistics_size; DBUG_EXECUTE_IF("exceeded_sparse_column_limit_must_be_false", { if (exceeded_sparse_column_limit) { - return Status::Error( + throw doris::Exception( + ErrorCode::INTERNAL_ERROR, "exceeded_sparse_column_limit_must_be_false, sparse_column_non_null_size: {} : " " _variant_sparse_column_statistics_size: {}", _statistics->sparse_column_non_null_size.size(), diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 6bef33686e33b9..8cbb9331823918 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -809,7 +809,7 @@ Status VariantCompactionUtil::check_path_stats(const std::vector tablet->tablet_schema() + if (stats.size() > output->tablet_schema() ->column_by_uid(uid) .variant_max_sparse_column_statistics_size()) { // When there is only one segment, we can ensure that the size of each path in output stats is accurate diff --git a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy index c8aa11a14471e9..88b228258306e6 100644 --- a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy +++ b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy @@ -154,21 +154,22 @@ suite("test_compaction_variant_with_sparse_limit", "nonConcurrent") { DISTRIBUTED BY HASH(k) BUCKETS 1 properties("replication_num" = "1", "disable_auto_compaction" = "true"); """ + // here will always true sql """insert into tn_simple_variant_DUPLICATE values (1, '{"a" : 1, "b" : 2}');""" - sql """insert into tn_simple_variant_DUPLICATE values (2, '{"d" : "ddd", "s" : "fff", "m": 111}');""" - // here will aways false - try { - GetDebugPoint().enableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + GetDebugPoint().enableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + test { sql """ select v['a'] from tn_simple_variant_DUPLICATE where k = 1""" - } finally { - GetDebugPoint().disableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + exception null } - try { - GetDebugPoint().enableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") - sql """ select v['m'] from tn_simple_variant_DUPLICATE where k = 2""" - } catch (e) { - logger.info("catch exception: ${e}") - } finally { - GetDebugPoint().disableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + + // here will always false + sql """ truncate table tn_simple_variant_DUPLICATE --force ; """ + sql """insert into tn_simple_variant_DUPLICATE values (1, '{"d" : "ddd", "s" : "fff", "da": "ddd", "m": 111}');""" + test { + sql """ select v['m'] from tn_simple_variant_DUPLICATE""" + exception "exceeded_sparse_column_limit_must_be_false" } + + GetDebugPoint().disableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false") + }