From d5133be4b8ebb1586b96c3e2de1a5fc23e606aa6 Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Wed, 18 Sep 2024 14:13:30 +0800 Subject: [PATCH] [Fix](orc-reader) Fix StringRef nullptr data in orc-reader. (#40857) ## Proposed changes ### Issue ``` /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:1046:9: runtime error: reference binding to null pointer of type 'doris::StringRef' #0 0x55ee63eb0418 in std::vector>::operator[](unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:1046:2 #1 0x55ee63eb0418 in doris::Status doris::vectorized::OrcReader::_decode_string_non_dict_encoded_column(std::__cxx11::basic_string, std::allocator> const&, COW::mutable_ptr const&, orc::TypeKind const&, orc::EncodedStringVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1172:39 #2 0x55ee63ea2685 in doris::Status doris::vectorized::OrcReader::_decode_string_column(std::__cxx11::basic_string, std::allocator> const&, COW::mutable_ptr const&, orc::TypeKind const&, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1124:16 #3 0x55ee63e97e7a in doris::Status doris::vectorized::OrcReader::_fill_doris_data_column(std::__cxx11::basic_string, std::allocator> const&, COW::mutable_ptr&, std::shared_ptr const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1365:16 #4 0x55ee63b0e450 in doris::Status doris::vectorized::OrcReader::_orc_column_to_doris_column(std::__cxx11::basic_string, std::allocator> const&, COW::immutable_ptr&, std::shared_ptr const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1532:5 #5 0x55ee63e99622 in doris::Status 
doris::vectorized::OrcReader::_fill_doris_data_column(std::__cxx11::basic_string, std::allocator> const&, COW::mutable_ptr&, std::shared_ptr const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1410:9 #6 0x55ee63b0e450 in doris::Status doris::vectorized::OrcReader::_orc_column_to_doris_column(std::__cxx11::basic_string, std::allocator> const&, COW::immutable_ptr&, std::shared_ptr const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1532:5 #7 0x55ee63ad4f86 in doris::vectorized::OrcReader::get_next_block_impl(doris::vectorized::Block*, unsigned long*, bool*) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1714:13 #8 0x55ee63ad093b in doris::vectorized::OrcReader::get_next_block(doris::vectorized::Block*, unsigned long*, bool*) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1547:5 ``` ### Solution [Fix] (orc-reader) Fix StringRef nullptr data in orc-reader. When a string is empty in an ORC row batch, its data pointer can point to anything, possibly nullptr, and StringRef has undefined behavior when its data is nullptr. The fix substitutes a valid empty-string pointer whenever the (trimmed) length is 0, so a StringRef is never built from the batch's pointer for an empty value. Related to #37845. 
--- be/src/vec/exec/format/orc/vorc_reader.cpp | 42 ++++++++++++++-------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index cffa934cc2c740..16a3c1254c62eb 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1140,8 +1140,9 @@ Status OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_ if (cvb->hasNulls) { for (int i = 0; i < num_values; ++i) { if (cvb->notNull[i]) { - string_values.emplace_back(cvb->data[i], - trim_right(cvb->data[i], cvb->length[i])); + size_t length = trim_right(cvb->data[i], cvb->length[i]); + string_values.emplace_back((length > 0) ? cvb->data[i] : empty_string.data(), + length); } else { // Orc doesn't fill null values in new batch, but the former batch has been release. // Other types like int/long/timestamp... are flat types without pointer in them, @@ -1151,21 +1152,26 @@ Status OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_ } } else { for (int i = 0; i < num_values; ++i) { - string_values.emplace_back(cvb->data[i], trim_right(cvb->data[i], cvb->length[i])); + size_t length = trim_right(cvb->data[i], cvb->length[i]); + string_values.emplace_back((length > 0) ? cvb->data[i] : empty_string.data(), + length); } } } else { if (cvb->hasNulls) { for (int i = 0; i < num_values; ++i) { if (cvb->notNull[i]) { - string_values.emplace_back(cvb->data[i], cvb->length[i]); + string_values.emplace_back( + (cvb->length[i] > 0) ? cvb->data[i] : empty_string.data(), + cvb->length[i]); } else { string_values.emplace_back(empty_string.data(), 0); } } } else { for (int i = 0; i < num_values; ++i) { - string_values.emplace_back(cvb->data[i], cvb->length[i]); + string_values.emplace_back( + (cvb->length[i] > 0) ? 
cvb->data[i] : empty_string.data(), cvb->length[i]); } } } @@ -1204,7 +1210,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } else { // Orc doesn't fill null values in new batch, but the former batch has been release. // Other types like int/long/timestamp... are flat types without pointer in them, @@ -1227,7 +1234,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } } } else { @@ -1246,7 +1254,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } else { string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); } @@ -1265,7 +1274,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } } } @@ -2068,7 +2078,7 @@ Status OrcReader::on_string_dicts_loaded( char* val_ptr; int64_t length; dict->getValueByIndex(i, val_ptr, length); - StringRef dict_value(val_ptr, length); + StringRef dict_value((length > 0) ? 
val_ptr : "", length); if (length > max_value_length) { max_value_length = length; } @@ -2328,7 +2338,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } else { // Orc doesn't fill null values in new batch, but the former batch has been release. // Other types like int/long/timestamp... are flat types without pointer in them, @@ -2346,7 +2357,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } } } else { @@ -2361,7 +2373,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } else { string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); } @@ -2375,7 +2388,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } } }