Skip to content

Commit

Permalink
fix dict page offset zero bug
Browse files Browse the repository at this point in the history
  • Loading branch information
suxiaogang223 committed Oct 6, 2024
1 parent 9b8e65b commit 00bbb4e
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,17 @@ ColumnChunkReader::ColumnChunkReader(io::BufferedStreamReader* reader,
_io_ctx(io_ctx) {}

Status ColumnChunkReader::init() {
size_t start_offset = _metadata.__isset.dictionary_page_offset
? _metadata.dictionary_page_offset
: _metadata.data_page_offset;
size_t start_offset =
_metadata.__isset.dictionary_page_offset && _metadata.dictionary_page_offset > 0
? _metadata.dictionary_page_offset
: _metadata.data_page_offset;
size_t chunk_size = _metadata.total_compressed_size;
// create page reader
_page_reader = create_page_reader(_stream_reader, _io_ctx, start_offset, chunk_size,
_metadata.num_values, _offset_index);
// get the block compression codec
RETURN_IF_ERROR(get_block_compression_codec(_metadata.codec, &_block_compress_codec));
if (_metadata.__isset.dictionary_page_offset) {
if (_metadata.__isset.dictionary_page_offset && _metadata.dictionary_page_offset > 0) {
// seek to the directory page
_page_reader->seek_to_page(_metadata.dictionary_page_offset);
// Parse dictionary data when reading
Expand Down
7 changes: 4 additions & 3 deletions be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,10 @@ void ParquetColumnReader::_generate_read_ranges(int64_t start_index, int64_t end
Status ScalarColumnReader::init(io::FileReaderSPtr file, FieldSchema* field, size_t max_buf_size) {
_field_schema = field;
auto& chunk_meta = _chunk_meta.meta_data;
int64_t chunk_start = chunk_meta.__isset.dictionary_page_offset
? chunk_meta.dictionary_page_offset
: chunk_meta.data_page_offset;
int64_t chunk_start =
chunk_meta.__isset.dictionary_page_offset && chunk_meta.dictionary_page_offset > 0
? chunk_meta.dictionary_page_offset
: chunk_meta.data_page_offset;
size_t chunk_len = chunk_meta.total_compressed_size;
size_t prefetch_buffer_size = std::min(chunk_len, max_buf_size);
if (typeid_cast<io::MergeRangeFileReader*>(file.get())) {
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -223,13 +223,10 @@ suite("test_hdfs_parquet_group0","external,hive,tvf,external_docker") {


uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/dict-page-offset-zero.parquet"
test {
sql """ select * from HDFS(
order_qt_test_27 """ select * from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "parquet") limit 10; """
exception "Failed to deserialize parquet page header. offset: 0, header size: 40, end offset: 40, real header size: 40"
}


uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/repeated_no_annotation.parquet"
Expand Down

0 comments on commit 00bbb4e

Please sign in to comment.