Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions be/src/common/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1352,8 +1352,6 @@ DEFINE_Bool(enable_snapshot_action, "false");

DEFINE_mInt32(variant_max_merged_tablet_schema_size, "2048");

DEFINE_mInt32(variant_max_sparse_column_statistics_size, "10000");

DEFINE_mBool(enable_column_type_check, "true");
// 128 MB
DEFINE_mInt64(local_exchange_buffer_mem_limit, "134217728");
Expand Down
3 changes: 0 additions & 3 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -1414,9 +1414,6 @@ DECLARE_Bool(enable_snapshot_action);
// The max columns size for a tablet schema
DECLARE_mInt32(variant_max_merged_tablet_schema_size);

// The max sparse column statistics size for a variant column
DECLARE_mInt32(variant_max_sparse_column_statistics_size);

DECLARE_mInt64(local_exchange_buffer_mem_limit);

DECLARE_mInt64(enable_debug_log_timeout_secs);
Expand Down
1 change: 1 addition & 0 deletions be/src/common/consts.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,6 @@ static constexpr int MAX_DECIMALV2_SCALE = 9;

static constexpr int MAX_DECIMALV3_PRECISION = MAX_DECIMAL256_PRECISION;
static constexpr int MAX_DECIMALV3_SCALE = MAX_DECIMALV3_PRECISION;
static constexpr int DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE = 10000;
} // namespace BeConsts
} // namespace doris
26 changes: 20 additions & 6 deletions be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,20 @@ bool VariantColumnReader::exist_in_sparse_column(
}

// Reports whether the sparse-column statistics recorded for this variant column
// have reached the column's own statistics-size limit.
//
// Returns true only when at least one path is recorded and the number of
// recorded paths is >= `_variant_sparse_column_statistics_size` (set in init()
// from the parent variant column). The global
// `config::variant_max_sparse_column_statistics_size` is intentionally no longer
// consulted: an early return based on it made the per-column check and the
// debug hook below unreachable.
//
// Throws doris::Exception(INTERNAL_ERROR) only when the debug point
// "exceeded_sparse_column_limit_must_be_false" is enabled and the limit has
// been reached.
bool VariantColumnReader::is_exceeded_sparse_column_limit() const {
    const bool exceeded_sparse_column_limit =
            !_statistics->sparse_column_non_null_size.empty() &&
            _statistics->sparse_column_non_null_size.size() >=
                    _variant_sparse_column_statistics_size;
    // Debug point for tests that must never observe an exhausted statistics map.
    DBUG_EXECUTE_IF("exceeded_sparse_column_limit_must_be_false", {
        if (exceeded_sparse_column_limit) {
            throw doris::Exception(
                    ErrorCode::INTERNAL_ERROR,
                    "exceeded_sparse_column_limit_must_be_false, sparse_column_non_null_size: {} : "
                    " _variant_sparse_column_statistics_size: {}",
                    _statistics->sparse_column_non_null_size.size(),
                    _variant_sparse_column_statistics_size);
        }
    })
    return exceeded_sparse_column_limit;
}

int64_t VariantColumnReader::get_metadata_size() const {
Expand Down Expand Up @@ -318,9 +329,7 @@ Status VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator,

// Otherwise the prefix does not exist and the sparse column statistics size has reached its limit,
// which means the path may still exist in the sparse column
bool exceeded_sparse_column_limit = !_statistics->sparse_column_non_null_size.empty() &&
_statistics->sparse_column_non_null_size.size() >=
config::variant_max_sparse_column_statistics_size;
bool exceeded_sparse_column_limit = is_exceeded_sparse_column_limit();

// If the variant column has extracted columns and is a compaction reader, then read flat leaves
// Otherwise read hierarchical data, since the variant subcolumns are flattened in schema_util::VariantCompactionUtil::get_extended_compaction_schema
Expand Down Expand Up @@ -402,6 +411,11 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, const SegmentF
_statistics = std::make_unique<VariantStatistics>();
const ColumnMetaPB& self_column_pb = footer.columns(column_id);
const auto& parent_index = opts.tablet_schema->inverted_indexs(self_column_pb.unique_id());
// record variant_sparse_column_statistics_size from parent column
_variant_sparse_column_statistics_size =
opts.tablet_schema->column_by_uid(self_column_pb.unique_id())
.variant_max_sparse_column_statistics_size();

for (int32_t ordinal = 0; ordinal < footer.columns_size(); ++ordinal) {
const ColumnMetaPB& column_pb = footer.columns(ordinal);
// Find all columns belonging to the current variant column
Expand Down
3 changes: 3 additions & 0 deletions be/src/olap/rowset/segment_v2/variant/variant_column_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ class VariantColumnReader : public ColumnReader {
std::unique_ptr<VariantStatistics> _statistics;
// key: subcolumn path, value: subcolumn indexes
std::unordered_map<std::string, TabletIndexes> _variant_subcolumns_indexes;
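// Max number of paths tracked in the sparse column statistics; init() overwrites this default
// with the parent variant column's variant_max_sparse_column_statistics_size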
size_t _variant_sparse_column_statistics_size =
BeConsts::DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE;
};

class VariantRootColumnIterator : public ColumnIterator {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ Status VariantColumnWriterImpl::_process_sparse_column(
it != sparse_data_paths_statistics.end()) {
++it->second;
} else if (sparse_data_paths_statistics.size() <
config::variant_max_sparse_column_statistics_size) {
_tablet_column->variant_max_sparse_column_statistics_size()) {
sparse_data_paths_statistics.emplace(path, 1);
}
}
Expand Down
15 changes: 12 additions & 3 deletions be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#include "olap/rowset/segment_v2/variant_stats_calculator.h"

#include <gen_cpp/segment_v2.pb.h>

#include "common/logging.h"
#include "util/simd/bits.h"
#include "vec/columns/column_nullable.h"
Expand Down Expand Up @@ -63,7 +65,13 @@ Status VariantStatsCaculator::calculate_variant_stats(const vectorized::Block* b
// Check if this is a sparse column or sub column
if (column_path.ends_with("__DORIS_VARIANT_SPARSE__")) {
// This is a sparse column from variant column
_calculate_sparse_column_stats(*column, column_meta, row_pos, num_rows);
// get variant_max_sparse_column_statistics_size from tablet_schema
size_t variant_max_sparse_column_statistics_size =
_tablet_schema->column_by_uid(tablet_column.parent_unique_id())
.variant_max_sparse_column_statistics_size();
_calculate_sparse_column_stats(*column, column_meta,
variant_max_sparse_column_statistics_size, row_pos,
num_rows);
} else {
// This is a sub column from variant column
_calculate_sub_column_stats(*column, column_meta, row_pos, num_rows);
Expand All @@ -75,13 +83,14 @@ Status VariantStatsCaculator::calculate_variant_stats(const vectorized::Block* b

void VariantStatsCaculator::_calculate_sparse_column_stats(const vectorized::IColumn& column,
ColumnMetaPB* column_meta,
size_t max_sparse_column_statistics_size,
size_t row_pos, size_t num_rows) {
// Get or create variant statistics
VariantStatisticsPB* stats = column_meta->mutable_variant_statistics();

// Use the same logic as the original calculate_variant_stats function
vectorized::schema_util::VariantCompactionUtil::calculate_variant_stats(column, stats, row_pos,
num_rows);
vectorized::schema_util::VariantCompactionUtil::calculate_variant_stats(
column, stats, max_sparse_column_statistics_size, row_pos, num_rows);

VLOG_DEBUG << "Sparse column stats updated, non-null size count: "
<< stats->sparse_column_non_null_size_size();
Expand Down
4 changes: 3 additions & 1 deletion be/src/olap/rowset/segment_v2/variant_stats_calculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ class VariantStatsCaculator {

// Helper method to calculate sparse column statistics
void _calculate_sparse_column_stats(const vectorized::IColumn& column,
ColumnMetaPB* column_meta, size_t row_pos, size_t num_rows);
ColumnMetaPB* column_meta,
size_t max_sparse_column_statistics_size, size_t row_pos,
size_t num_rows);

// Helper method to calculate sub column statistics
void _calculate_sub_column_stats(const vectorized::IColumn& column, ColumnMetaPB* column_meta,
Expand Down
4 changes: 4 additions & 0 deletions be/src/olap/tablet_meta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,10 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco
column->set_variant_enable_typed_paths_to_sparse(
tcolumn.variant_enable_typed_paths_to_sparse);
}
if (tcolumn.__isset.variant_max_sparse_column_statistics_size) {
column->set_variant_max_sparse_column_statistics_size(
tcolumn.variant_max_sparse_column_statistics_size);
}
}

void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version) {
Expand Down
6 changes: 6 additions & 0 deletions be/src/olap/tablet_schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,10 @@ void TabletColumn::init_from_pb(const ColumnPB& column) {
if (column.has_variant_enable_typed_paths_to_sparse()) {
_variant_enable_typed_paths_to_sparse = column.variant_enable_typed_paths_to_sparse();
}
if (column.has_variant_max_sparse_column_statistics_size()) {
_variant_max_sparse_column_statistics_size =
column.variant_max_sparse_column_statistics_size();
}
if (column.has_pattern_type()) {
_pattern_type = column.pattern_type();
}
Expand Down Expand Up @@ -765,6 +769,8 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const {
column->set_variant_max_subcolumns_count(_variant_max_subcolumns_count);
column->set_pattern_type(_pattern_type);
column->set_variant_enable_typed_paths_to_sparse(_variant_enable_typed_paths_to_sparse);
column->set_variant_max_sparse_column_statistics_size(
_variant_max_sparse_column_statistics_size);
}

void TabletColumn::add_sub_column(TabletColumn& sub_column) {
Expand Down
12 changes: 12 additions & 0 deletions be/src/olap/tablet_schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,11 @@ class TabletColumn : public MetadataAdder<TabletColumn> {
_variant_enable_typed_paths_to_sparse = enable;
}

// Sets the max sparse column statistics size for this variant column.
// The value is stored as given; no range validation is performed here.
void set_variant_max_sparse_column_statistics_size(
int32_t variant_max_sparse_column_statistics_size) {
_variant_max_sparse_column_statistics_size = variant_max_sparse_column_statistics_size;
}

// Returns the variant_max_subcolumns_count value stored on this column.
int32_t variant_max_subcolumns_count() const { return _variant_max_subcolumns_count; }

// Returns the pattern type stored on this column.
PatternTypePB pattern_type() const { return _pattern_type; }
Expand All @@ -234,6 +239,10 @@ class TabletColumn : public MetadataAdder<TabletColumn> {
return _variant_enable_typed_paths_to_sparse;
}

// Returns the max sparse column statistics size stored on this variant column.
// NOTE(review): the value is a signed int32_t, while visible callers compare it
// against size_t container sizes — confirm it can never be negative.
int32_t variant_max_sparse_column_statistics_size() const {
return _variant_max_sparse_column_statistics_size;
}

// Returns the stored _is_decimal flag for this column.
bool is_decimal() const { return _is_decimal; }

private:
Expand Down Expand Up @@ -280,6 +289,9 @@ class TabletColumn : public MetadataAdder<TabletColumn> {
int32_t _variant_max_subcolumns_count = 0;
PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB;
bool _variant_enable_typed_paths_to_sparse = false;
// Max sparse column statistics size for this variant column
int32_t _variant_max_sparse_column_statistics_size =
BeConsts::DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE;
};

bool operator==(const TabletColumn& a, const TabletColumn& b);
Expand Down
16 changes: 9 additions & 7 deletions be/src/vec/common/schema_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,9 @@ Status VariantCompactionUtil::check_path_stats(const std::vector<RowsetSharedPtr

// In input rowsets, some rowsets may have statistics values exceeding the maximum limit,
// which leads to inaccurate statistics
if (stats.size() > config::variant_max_sparse_column_statistics_size) {
if (stats.size() > output->tablet_schema()
->column_by_uid(uid)
.variant_max_sparse_column_statistics_size()) {
// When there is only one segment, we can ensure that the size of each path in output stats is accurate
if (output->num_segments() == 1) {
for (const auto& [path, size] : stats) {
Expand Down Expand Up @@ -933,7 +935,8 @@ void VariantCompactionUtil::get_compaction_subcolumns(
VLOG_DEBUG << "append typed column " << subpath;
} else if (find_data_types == path_to_data_types.end() || find_data_types->second.empty() ||
sparse_paths.find(std::string(subpath)) != sparse_paths.end() ||
sparse_paths.size() >= config::variant_max_sparse_column_statistics_size) {
sparse_paths.size() >=
parent_column->variant_max_sparse_column_statistics_size()) {
TabletColumn subcolumn;
subcolumn.set_name(column_name);
subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
Expand Down Expand Up @@ -1031,6 +1034,7 @@ Status VariantCompactionUtil::get_extended_compaction_schema(
// Calculate statistics about variant data paths from the encoded sparse column
void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_sparse_column,
segment_v2::VariantStatisticsPB* stats,
size_t max_sparse_column_statistics_size,
size_t row_pos, size_t num_rows) {
// Cast input column to ColumnMap type since sparse column is stored as a map
const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column);
Expand All @@ -1055,19 +1059,17 @@ void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_spars
}
// If path doesn't exist and we haven't hit the max statistics size limit,
// add it with count 1
else if (count_map.size() < config::variant_max_sparse_column_statistics_size) {
else if (count_map.size() < max_sparse_column_statistics_size) {
count_map.emplace(sparse_path, 1);
}
}
}

if (stats->sparse_column_non_null_size().size() >
config::variant_max_sparse_column_statistics_size) {
if (stats->sparse_column_non_null_size().size() > max_sparse_column_statistics_size) {
throw doris::Exception(
ErrorCode::INTERNAL_ERROR,
"Sparse column non null size: {} is greater than max statistics size: {}",
stats->sparse_column_non_null_size().size(),
config::variant_max_sparse_column_statistics_size);
stats->sparse_column_non_null_size().size(), max_sparse_column_statistics_size);
}
}

Expand Down
3 changes: 2 additions & 1 deletion be/src/vec/common/schema_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ class VariantCompactionUtil {

// Calculate statistics about variant data paths from the encoded sparse column
static void calculate_variant_stats(const IColumn& encoded_sparse_column,
segment_v2::VariantStatisticsPB* stats, size_t row_pos,
segment_v2::VariantStatisticsPB* stats,
size_t max_sparse_column_statistics_size, size_t row_pos,
size_t num_rows);

static void get_compaction_subcolumns(TabletSchema::PathsSetInfo& paths_set_info,
Expand Down
3 changes: 3 additions & 0 deletions be/test/olap/rowset/segment_v2/mock/mock_segment.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class MockSegment : public Segment {

// Helper methods for test setup
// Test helper: appends a fresh TabletColumn carrying `col_uid` to the mock tablet
// schema, then records `footer_ordinal` for that uid in both the schema's
// uid-to-index map and this segment's uid-to-footer-ordinal map.
// NOTE(review): `_field_uniqueid_to_index` receives `footer_ordinal`, not the
// position of the column just pushed into `_cols`; presumably callers register
// uids in footer order so the two coincide — confirm.
void add_column_uid_mapping(int32_t col_uid, int32_t footer_ordinal) {
_tablet_schema->_cols.push_back(std::make_shared<TabletColumn>());
_tablet_schema->_cols.back()->set_unique_id(col_uid);
_tablet_schema->_field_uniqueid_to_index[col_uid] = footer_ordinal;
_column_uid_to_footer_ordinal[col_uid] = footer_ordinal;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ static void construct_column(ColumnPB* column_pb, int32_t col_unique_id,
column_pb->set_is_nullable(is_nullable);
if (column_type == "VARIANT") {
column_pb->set_variant_max_subcolumns_count(variant_max_subcolumns_count);
column_pb->set_variant_max_sparse_column_statistics_size(10000);
}
}

Expand Down Expand Up @@ -505,15 +506,15 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) {
// 13. check statistics size == limit
auto& variant_stats = variant_column_reader->_statistics;
EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() <
config::variant_max_sparse_column_statistics_size);
auto limit = config::variant_max_sparse_column_statistics_size -
variant_column_reader->_variant_sparse_column_statistics_size);
auto limit = variant_column_reader->_variant_sparse_column_statistics_size -
variant_stats->sparse_column_non_null_size.size();
for (int i = 0; i < limit; ++i) {
std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i);
variant_stats->sparse_column_non_null_size[key] = 10000;
}
EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() ==
config::variant_max_sparse_column_statistics_size);
variant_column_reader->_variant_sparse_column_statistics_size);
EXPECT_TRUE(variant_column_reader->is_exceeded_sparse_column_limit());

ColumnIteratorUPtr it2;
Expand Down
Loading
Loading