Skip to content

Commit df86f27

Browse files
authored
[opt](olap) Optimize the performance of StructFileColumnIterator::read_by_rowids in scenarios where the rowids are continuous (#58851)
### What problem does this PR solve?

Avoid seeking and reading row by row.

Issue Number: close #xxx

Related PR: #xxx

Problem Summary:

### Release note

None

### Check List (For Author)

- Test <!-- At least one of them must be included. -->
    - [ ] Regression test
    - [ ] Unit Test
    - [ ] Manual test (add detailed scripts or steps below)
    - [ ] No need to test or manual test. Explain why:
        - [ ] This is a refactor/code format and no logic has been changed.
        - [ ] Previous test can cover this change.
        - [ ] No code files have been changed.
        - [ ] Other reason <!-- Add your reason? -->

- Behavior changed:
    - [ ] No.
    - [ ] Yes. <!-- Explain the behavior change -->

- Does this need documentation?
    - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg: apache/doris-website#1214 -->

### Check List (For Reviewer who merge this PR)

- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into -->
1 parent 6b04964 commit df86f27

File tree

3 files changed

+115
-18
lines changed

3 files changed

+115
-18
lines changed

be/src/olap/rowset/segment_v2/column_reader.cpp

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -966,7 +966,7 @@ Status MapFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr
966966
return Status::OK();
967967
}
968968

969-
auto& column_map = assert_cast<vectorized::ColumnMap&>(
969+
auto& column_map = assert_cast<vectorized::ColumnMap&, TypeCheckOnRelease::DISABLE>(
970970
dst->is_nullable() ? static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
971971
: *dst);
972972
auto column_offsets_ptr = column_map.get_offsets_column().assume_mutable();
@@ -1010,7 +1010,8 @@ Status MapFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr
10101010
RETURN_IF_ERROR(
10111011
_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null));
10121012
} else {
1013-
auto& null_map = assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
1013+
auto& null_map = assert_cast<vectorized::ColumnUInt8&, TypeCheckOnRelease::DISABLE>(
1014+
*null_map_ptr);
10141015
null_map.insert_many_vals(0, num_read);
10151016
}
10161017
DCHECK(num_read == *n);
@@ -1095,7 +1096,8 @@ Status MapFileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t
10951096
ordinal_t ns = 0;
10961097
RETURN_IF_ERROR(_offsets_iterator->_peek_one_offset(&ns));
10971098
// overwrite with sentinel
1098-
assert_cast<vectorized::ColumnOffset64&>(*next_starts_col).get_data()[i] = ns;
1099+
assert_cast<vectorized::ColumnOffset64&, TypeCheckOnRelease::DISABLE>(*next_starts_col)
1100+
.get_data()[i] = ns;
10991101
}
11001102
}
11011103

@@ -1260,7 +1262,7 @@ Status StructFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumn
12601262
return Status::OK();
12611263
}
12621264

1263-
auto& column_struct = assert_cast<vectorized::ColumnStruct&>(
1265+
auto& column_struct = assert_cast<vectorized::ColumnStruct&, TypeCheckOnRelease::DISABLE>(
12641266
dst->is_nullable() ? static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
12651267
: *dst);
12661268
for (size_t i = 0; i < column_struct.tuple_size(); i++) {
@@ -1286,7 +1288,8 @@ Status StructFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumn
12861288
RETURN_IF_ERROR(
12871289
_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null));
12881290
} else {
1289-
auto& null_map = assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
1291+
auto& null_map = assert_cast<vectorized::ColumnUInt8&, TypeCheckOnRelease::DISABLE>(
1292+
*null_map_ptr);
12901293
null_map.insert_many_vals(0, num_read);
12911294
}
12921295
DCHECK(num_read == *n);
@@ -1318,12 +1321,33 @@ Status StructFileColumnIterator::read_by_rowids(const rowid_t* rowids, const siz
13181321
return Status::OK();
13191322
}
13201323

1321-
for (size_t i = 0; i < count; ++i) {
1322-
RETURN_IF_ERROR(seek_to_ordinal(rowids[i]));
1323-
size_t num_read = 1;
1324+
if (count == 0) {
1325+
return Status::OK();
1326+
}
1327+
1328+
size_t this_run = 1;
1329+
auto start_idx = rowids[0];
1330+
auto last_idx = rowids[0];
1331+
for (size_t i = 1; i < count; ++i) {
1332+
if (last_idx == rowids[i] - 1) {
1333+
last_idx = rowids[i];
1334+
this_run++;
1335+
continue;
1336+
}
1337+
RETURN_IF_ERROR(seek_to_ordinal(start_idx));
1338+
size_t num_read = this_run;
13241339
RETURN_IF_ERROR(next_batch(&num_read, dst, nullptr));
1325-
DCHECK(num_read == 1);
1340+
DCHECK_EQ(num_read, this_run);
1341+
1342+
start_idx = rowids[i];
1343+
last_idx = rowids[i];
1344+
this_run = 1;
13261345
}
1346+
1347+
RETURN_IF_ERROR(seek_to_ordinal(start_idx));
1348+
size_t num_read = this_run;
1349+
RETURN_IF_ERROR(next_batch(&num_read, dst, nullptr));
1350+
DCHECK_EQ(num_read, this_run);
13271351
return Status::OK();
13281352
}
13291353

@@ -1425,8 +1449,9 @@ Status OffsetFileColumnIterator::_peek_one_offset(ordinal_t* offset) {
14251449
_peek_tmp_col->clear();
14261450
RETURN_IF_ERROR(offset_page_decoder->peek_next_batch(&n, _peek_tmp_col)); // not null
14271451
DCHECK(_peek_tmp_col->size() == 1);
1428-
*offset =
1429-
assert_cast<const vectorized::ColumnOffset64*>(_peek_tmp_col.get())->get_element(0);
1452+
*offset = assert_cast<const vectorized::ColumnOffset64*, TypeCheckOnRelease::DISABLE>(
1453+
_peek_tmp_col.get())
1454+
->get_element(0);
14301455
} else {
14311456
*offset = _offset_iterator->get_current_page()->next_array_item_ordinal;
14321457
}
@@ -1557,7 +1582,8 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnP
15571582
RETURN_IF_ERROR(
15581583
_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null));
15591584
} else {
1560-
auto& null_map = assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
1585+
auto& null_map = assert_cast<vectorized::ColumnUInt8&, TypeCheckOnRelease::DISABLE>(
1586+
*null_map_ptr);
15611587
null_map.insert_many_vals(0, num_read);
15621588
}
15631589
DCHECK(num_read == *n);
@@ -2137,7 +2163,8 @@ void DefaultValueColumnIterator::_insert_many_default(vectorized::MutableColumnP
21372163

21382164
Status RowIdColumnIteratorV2::next_batch(size_t* n, vectorized::MutableColumnPtr& dst,
21392165
bool* has_null) {
2140-
auto* string_column = assert_cast<vectorized::ColumnString*>(dst.get());
2166+
auto* string_column =
2167+
assert_cast<vectorized::ColumnString*, TypeCheckOnRelease::DISABLE>(dst.get());
21412168

21422169
for (uint32_t i = 0; i < *n; ++i) {
21432170
uint32_t row_id = _current_rowid + i;

regression-test/data/datatype_p0/complex_types/test_pruned_columns.out

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,77 @@
11
-- This file is automatically generated. You should know what you did if you want to edit this
22
-- !sql --
3-
1 {"city":"beijing", "data":[{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]}
4-
2 {"city":"shanghai", "data":[{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]}
3+
1 {"city":"beijing", "data":[{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}], "value":1}
4+
2 {"city":"shanghai", "data":[{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}], "value":2}
5+
3 {"city":"guangzhou", "data":[{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}], "value":3}
6+
4 {"city":"shenzhen", "data":[{2:{"a":130, "b":20}, 1:{"a":150, "b":40}}], "value":4}
7+
5 {"city":"hangzhou", "data":[{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}], "value":5}
8+
6 {"city":"nanjing", "data":[{2:{"a":210, "b":60}, 1:{"a":230, "b":40}}], "value":6}
9+
7 {"city":"tianjin", "data":[{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}], "value":7}
10+
8 {"city":"chongqing", "data":[{2:{"a":290, "b":80}, 1:{"a":310, "b":40}}], "value":8}
11+
9 {"city":"wuhan", "data":[{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}], "value":9}
12+
10 {"city":"xian", "data":[{2:{"a":370, "b":20}, 1:{"a":390, "b":40}}], "value":10}
13+
11 {"city":"changsha", "data":[{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}], "value":11}
14+
12 {"city":"qingdao", "data":[{2:{"a":450, "b":60}, 1:{"a":470, "b":40}}], "value":12}
15+
13 {"city":"dalian", "data":[{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}], "value":13}
516

617
-- !sql1 --
718
1 [10]
819

920
-- !sql2 --
1021
1 beijing
1122
2 shanghai
23+
3 guangzhou
24+
4 shenzhen
25+
5 hangzhou
26+
6 nanjing
27+
7 tianjin
28+
8 chongqing
29+
9 wuhan
30+
10 xian
31+
11 changsha
32+
12 qingdao
33+
13 dalian
1234

1335
-- !sql3 --
1436
1 [{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]
1537
2 [{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]
38+
3 [{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}]
39+
4 [{2:{"a":130, "b":20}, 1:{"a":150, "b":40}}]
40+
5 [{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}]
41+
6 [{2:{"a":210, "b":60}, 1:{"a":230, "b":40}}]
42+
7 [{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}]
43+
8 [{2:{"a":290, "b":80}, 1:{"a":310, "b":40}}]
44+
9 [{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}]
45+
10 [{2:{"a":370, "b":20}, 1:{"a":390, "b":40}}]
46+
11 [{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}]
47+
12 [{2:{"a":450, "b":60}, 1:{"a":470, "b":40}}]
48+
13 [{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}]
1649

1750
-- !sql4 --
1851
1 [{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]
1952
2 [{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]
53+
3 [{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}]
54+
5 [{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}]
55+
7 [{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}]
56+
9 [{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}]
57+
11 [{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}]
58+
13 [{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}]
2059

2160
-- !sql5 --
2261
1 beijing
2362
2 shanghai
63+
3 guangzhou
64+
5 hangzhou
65+
7 tianjin
66+
9 wuhan
67+
11 changsha
68+
13 dalian
69+
70+
-- !sql5_1 --
71+
61
72+
73+
-- !sql5_2 --
74+
61
2475

2576
-- !sql6 --
2677
2

regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ suite("test_pruned_columns") {
2020
sql """
2121
CREATE TABLE `tbl_test_pruned_columns` (
2222
`id` int NULL,
23-
`s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>> NULL
23+
`s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>, value:int> NULL
2424
) ENGINE=OLAP
2525
DUPLICATE KEY(`id`)
2626
DISTRIBUTED BY RANDOM BUCKETS AUTO
@@ -31,8 +31,19 @@ suite("test_pruned_columns") {
3131

3232
sql """
3333
insert into `tbl_test_pruned_columns` values
34-
(1, named_struct('city', 'beijing', 'data', array(map(1, named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))))),
35-
(2, named_struct('city', 'shanghai', 'data', array(map(2, named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80)))));
34+
(1, named_struct('city', 'beijing', 'data', array(map(1, named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))), 'value', 1)),
35+
(2, named_struct('city', 'shanghai', 'data', array(map(2, named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80))), 'value', 2)),
36+
(3, named_struct('city', 'guangzhou', 'data', array(map(1, named_struct('a', 90, 'b', 60.0), 2, named_struct('a', 110, 'b', 40))), 'value', 3)),
37+
(4, named_struct('city', 'shenzhen', 'data', array(map(2, named_struct('a', 130, 'b', 20.0), 1, named_struct('a', 150, 'b', 40))), 'value', 4)),
38+
(5, named_struct('city', 'hangzhou', 'data', array(map(1, named_struct('a', 170, 'b', 80.0), 2, named_struct('a', 190, 'b', 40))), 'value', 5)),
39+
(6, named_struct('city', 'nanjing', 'data', array(map(2, named_struct('a', 210, 'b', 60.0), 1, named_struct('a', 230, 'b', 40))), 'value', 6)),
40+
(7, named_struct('city', 'tianjin', 'data', array(map(1, named_struct('a', 250, 'b', 20.0), 2, named_struct('a', 270, 'b', 40))), 'value', 7)),
41+
(8, named_struct('city', 'chongqing', 'data', array(map(2, named_struct('a', 290, 'b', 80.0), 1, named_struct('a', 310, 'b', 40))), 'value', 8)),
42+
(9, named_struct('city', 'wuhan', 'data', array(map(1, named_struct('a', 330, 'b', 60.0), 2, named_struct('a', 350, 'b', 40))), 'value', 9)),
43+
(10, named_struct('city', 'xian', 'data', array(map(2, named_struct('a', 370, 'b', 20.0), 1, named_struct('a', 390, 'b', 40))), 'value', 10)),
44+
(11, named_struct('city', 'changsha', 'data', array(map(1, named_struct('a', 410, 'b', 80.0), 2, named_struct('a', 430, 'b', 40))), 'value', 11)),
45+
(12, named_struct('city', 'qingdao', 'data', array(map(2, named_struct('a', 450, 'b', 60.0), 1, named_struct('a', 470, 'b', 40))), 'value', 12)),
46+
(13, named_struct('city', 'dalian', 'data', array(map(1, named_struct('a', 490, 'b', 20.0), 2, named_struct('a', 510, 'b', 40))), 'value', 13));
3647
"""
3748

3849
qt_sql """
@@ -59,6 +70,14 @@ suite("test_pruned_columns") {
5970
select id, struct_element(s, 'city') from `tbl_test_pruned_columns` where struct_element(struct_element(s, 'data')[1][2], 'b') = 40 order by 1;
6071
"""
6172

73+
qt_sql5_1 """
74+
select /*+ set enable_prune_nested_column = 1; */ sum(s.value) from `tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);
75+
"""
76+
77+
qt_sql5_2 """
78+
select /*+ set enable_prune_nested_column = 0; */ sum(s.value) from `tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);
79+
"""
80+
6281
sql """DROP TABLE IF EXISTS `tbl_test_pruned_columns_map`"""
6382
sql """
6483
CREATE TABLE `tbl_test_pruned_columns_map` (

0 commit comments

Comments
 (0)