From 2654bd57659e36371b9d9081e093d99b2a333547 Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Thu, 20 Nov 2025 16:39:00 +0800 Subject: [PATCH] [fix](load) Fix undefined behavior in the JSON reader caused by multiple calls to the get_string method (#58107) image ```text doris_be: /root/doris/thirdparty/installed/include/simdjson/generic/ondemand/json_iterator-inl.h:359: simdjson_result simdjson::fallback::ondemand::json_iterator::unescape(raw_json_string, bool): Assertion `!parser->string_buffer_overflow(_string_buf_loc)' failed. *** Query id: 24468c5d0b372cf0-3e6c777580238e96 *** *** is nereids: 1 *** *** tablet id: 0 *** *** Aborted at 1763545679 (unix time) try "date -d @1763545679" if you are using GNU date *** *** Current BE git commitID: cbc3d21884 *** *** SIGABRT unknown detail explain (@0x3f8001618b8) received by PID 1448120 (TID 1450358 OR 0x7b0fb9f4e700) from PID 1448120; stack trace: *** 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /root/doris/be/src/common/signal_handler.h:420 1# 0x00007F183C476D10 in /lib64/libpthread.so.0 2# __GI_raise in /lib64/libc.so.6 3# __GI_abort in /lib64/libc.so.6 4# _nl_load_domain.cold.0 in /lib64/libc.so.6 5# 0x00007F183BCC8E86 in /lib64/libc.so.6 6# simdjson::fallback::ondemand::json_iterator::unescape(simdjson::fallback::ondemand::raw_json_string, bool) at /root/doris/thirdparty/installed/include/simdjson/generic/ondemand/json_iterator-inl.h:359 7# simdjson::fallback::ondemand::raw_json_string::unescape(simdjson::fallback::ondemand::json_iterator&, bool) const at /root/doris/thirdparty/installed/include/simdjson/generic/ondemand/raw_json_string-inl.h:160 8# simdjson::simdjson_result::unescape(simdjson::fallback::ondemand::json_iterator&, bool) const in /root/doris/be/output/lib/doris_be 9# simdjson::fallback::ondemand::value_iterator::get_string(bool) at /root/doris/thirdparty/installed/include/simdjson/generic/ondemand/value_iterator-inl.h:514 10# simdjson::fallback::ondemand::value::get_string(bool) at /root/doris/thirdparty/installed/include/simdjson/generic/ondemand/value-inl.h:48 11# doris::Status doris::vectorized::NewJsonReader::_simdjson_write_data_to_column(simdjson::fallback::ondemand::value&, std::shared_ptr const&, doris::vectorized::IColumn*, std::__cxx11::basic_string, std::allocator > const&, std::shared_ptr, bool*) at /root/doris/be/src/vec/exec/format/json/new_json_reader.cpp:1080 12# doris::vectorized::NewJsonReader::_simdjson_write_columns_by_jsonpath(simdjson::fallback::ondemand::object*, std::vector > const&, doris::vectorized::Block&, bool*) at /root/doris/be/src/vec/exec/format/json/new_json_reader.cpp:1460 13# doris::vectorized::NewJsonReader::_simdjson_handle_flat_array_complex_json_write_columns(doris::vectorized::Block&, std::vector > const&, bool*, bool*) at /root/doris/be/src/vec/exec/format/json/new_json_reader.cpp:806 14# doris::vectorized::NewJsonReader::_simdjson_handle_flat_array_complex_json(doris::RuntimeState*, doris::vectorized::Block&, std::vector > const&, bool*, bool*) at /root/doris/be/src/vec/exec/format/json/new_json_reader.cpp:753 15# doris::vectorized::NewJsonReader::_read_json_column(doris::RuntimeState*, doris::vectorized::Block&, std::vector > const&, bool*, bool*) at /root/doris/be/src/vec/exec/format/json/new_json_reader.cpp:485 16# doris::vectorized::NewJsonReader::get_next_block(doris::vectorized::Block*, unsigned long*, bool*) at /root/doris/be/src/vec/exec/format/json/new_json_reader.cpp:209 17# doris::vectorized::FileScanner::_get_block_wrapped(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /root/doris/be/src/vec/exec/scan/file_scanner.cpp:480 18# doris::vectorized::FileScanner::_get_block_impl(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /root/doris/be/src/vec/exec/scan/file_scanner.cpp:417 19# doris::vectorized::Scanner::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /root/doris/be/src/vec/exec/scan/scanner.cpp:113 20# doris::vectorized::Scanner::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /root/doris/be/src/vec/exec/scan/scanner.cpp:85 21# doris::vectorized::ScannerScheduler::_scanner_scan(std::shared_ptr, std::shared_ptr) at /root/doris/be/src/vec/exec/scan/scanner_scheduler.cpp:182 22# doris::vectorized::ScannerScheduler::submit(std::shared_ptr, std::shared_ptr)::$_0::operator()() const::{lambda()#1}::operator()() const::{lambda()#1}::operator()() const at /root/doris/be/src/vec/exec/scan/scanner_scheduler.cpp:96 23# doris::vectorized::ScannerScheduler::submit(std::shared_ptr, std::shared_ptr)::$_0::operator()() const::{lambda()#1}::operator()() const at /root/doris/be/src/vec/exec/scan/scanner_scheduler.cpp:95 ``` Issue Number: close #xxx Related PR: #xxx Problem Summary: None - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- .../vec/exec/format/json/new_json_reader.cpp | 36 ++++++-- be/src/vec/exec/format/json/new_json_reader.h | 20 +++++ .../stream_load/load_json_with_jsonpath.out | 10 +++ .../stream_load/test_load_with_jsonpath2.json | 20 +++++ .../load_json_with_jsonpath.groovy | 87 ++++++++++++++++++- 5 files changed, 162 insertions(+), 11 deletions(-) create mode 100644 regression-test/data/load_p0/stream_load/test_load_with_jsonpath2.json diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 4055d4cdec2fa2..0cc4b6f78ea287 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -1599,6 +1599,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val bool has_valid_value = false; // iterate through object, simdjson::ondemond will parsing on the fly size_t key_index = 0; + for (auto field : *value) { std::string_view key = field.unescaped_key(); StringRef name_ref(key.data(), key.size()); @@ -1627,7 +1628,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val } simdjson::ondemand::value val = field.value(); auto* column_ptr = block.get_by_position(column_index).column->assume_mutable().get(); - RETURN_IF_ERROR(_simdjson_write_data_to_column( + RETURN_IF_ERROR(_simdjson_write_data_to_column( val, slot_descs[column_index]->type(), column_ptr, slot_descs[column_index]->col_name(), _serdes[column_index], valid)); if (!(*valid)) { @@ -1718,6 +1719,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val return Status::OK(); } +template Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& value, const TypeDescriptor& type_desc, vectorized::IColumn* column_ptr, @@ -1753,7 +1755,20 @@ Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& if (_is_load || !type_desc.is_complex_type()) { if (value.type() == simdjson::ondemand::json_type::string) { - std::string_view value_string = value.get_string(); + std::string_view value_string; + if constexpr (use_string_cache) { + const auto cache_key = value.raw_json().value(); + if (_cached_string_values.contains(cache_key)) { + value_string = _cached_string_values[cache_key]; + } else { + value_string = value.get_string(); + _cached_string_values.emplace(cache_key, value_string); + } + } else { + DCHECK(_cached_string_values.empty()); + value_string = value.get_string(); + } + Slice slice {value_string.data(), value_string.size()}; RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _serde_options)); @@ -1812,7 +1827,7 @@ Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& has_value[sub_column_idx] = true; const auto& sub_col_type = type_desc.children[sub_column_idx]; - RETURN_IF_ERROR(_simdjson_write_data_to_column( + RETURN_IF_ERROR(_simdjson_write_data_to_column( sub.value(), sub_col_type, sub_column_ptr, column_name + "." + sub_key, sub_serdes[sub_column_idx], valid)); } @@ -1869,7 +1884,8 @@ Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& sub_serdes[0], _serde_options, valid)); simdjson::ondemand::value field_value = member_value.value(); - RETURN_IF_ERROR(_simdjson_write_data_to_column( + + RETURN_IF_ERROR(_simdjson_write_data_to_column( field_value, type_desc.children[1], map_column_ptr->get_values_ptr()->assume_mutable()->get_ptr(), column_name + ".value", sub_serdes[1], valid)); @@ -1892,9 +1908,10 @@ Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& int field_count = 0; for (simdjson::ondemand::value sub_value : array_value) { - RETURN_IF_ERROR(_simdjson_write_data_to_column( + RETURN_IF_ERROR(_simdjson_write_data_to_column( sub_value, type_desc.children[0], array_column_ptr->get_data().get_ptr(), column_name + ".element", sub_serdes[0], valid)); + field_count++; } auto& offsets = array_column_ptr->get_offsets(); @@ -2085,6 +2102,9 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( Block& block, bool* valid) { // write by jsonpath bool has_valid_value = false; + + Defer clear_defer([this]() { _cached_string_values.clear(); }); + for (size_t i = 0; i < slot_descs.size(); i++) { auto* slot_desc = slot_descs[i]; if (!slot_desc->is_materialized()) { @@ -2119,9 +2139,9 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( return Status::OK(); } } else { - RETURN_IF_ERROR(_simdjson_write_data_to_column(json_value, slot_desc->type(), - column_ptr, slot_desc->col_name(), - _serdes[i], valid)); + RETURN_IF_ERROR(_simdjson_write_data_to_column(json_value, slot_desc->type(), + column_ptr, slot_desc->col_name(), + _serdes[i], valid)); if (!(*valid)) { return Status::OK(); } diff --git a/be/src/vec/exec/format/json/new_json_reader.h b/be/src/vec/exec/format/json/new_json_reader.h index 9f0a39a98b7502..da8b5f1f3e8dda 100644 --- a/be/src/vec/exec/format/json/new_json_reader.h +++ b/be/src/vec/exec/format/json/new_json_reader.h @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -176,6 +177,7 @@ class NewJsonReader : public GenericReader { Status _simdjson_set_column_value(simdjson::ondemand::object* value, Block& block, const std::vector& slot_descs, bool* valid); + template Status _simdjson_write_data_to_column(simdjson::ondemand::value& value, const TypeDescriptor& type_desc, vectorized::IColumn* column_ptr, @@ -291,6 +293,24 @@ class NewJsonReader : public GenericReader { // column to default value string map std::unordered_map _col_default_value_map; + // From document of simdjson: + // ``` + // Important: a value should be consumed once. Calling get_string() twice on the same value is an error. + // ``` + // We should cache the string_views to avoid multiple get_string() calls. + struct StringViewHash { + size_t operator()(const std::string_view& str) const { + return std::hash()(reinterpret_cast(str.data())); + } + }; + struct StringViewEqual { + bool operator()(const std::string_view& lhs, const std::string_view& rhs) const { + return lhs.data() == rhs.data() && lhs.size() == rhs.size(); + } + }; + std::unordered_map + _cached_string_values; + int32_t skip_bitmap_col_idx {-1}; //Used to indicate whether it is a stream load. When loading, only data will be inserted into columnString. diff --git a/regression-test/data/load_p0/stream_load/load_json_with_jsonpath.out b/regression-test/data/load_p0/stream_load/load_json_with_jsonpath.out index 43037b624dd5a5..84e9bd05565620 100644 --- a/regression-test/data/load_p0/stream_load/load_json_with_jsonpath.out +++ b/regression-test/data/load_p0/stream_load/load_json_with_jsonpath.out @@ -9,3 +9,13 @@ 1000 7395.231067 2000 \N +-- !test_select2 -- +22 7291 7291 7291 \N I am a long string here. I am a long string here. I am a long string here. I am another long string here 4. I am another long string here 4. I am another long string here 4. I am another long string here 4. I am another long string here 4. I am another long string here 4. I am another long string here 4. I am another long string here 4. \N +7291.703724 2000 2000 2000 \N I am a long string here. I am a long string here. I am a long string here. I am another long string here 3. I am another long string here 3. I am another long string here 3. I am another long string here 3. I am another long string here 3. I am another long string here 3. I am another long string here 3. I am another long string here 3. \N +7395.231067 1000 1000 1000 \N I am a long string here. I am a long string here. I am a long string here. I am another long string here 2. I am another long string here 2. I am another long string here 2. I am another long string here 2. I am another long string here 2. I am another long string here 2. I am another long string here 2. I am another long string here 2. \N + +-- !test_select3 -- +22 7291 I am a long string here. I am another long string here 4. +7291.703724 2000 I am a long string here. I am another long string here 3. +7395.231067 1000 I am a long string here. I am another long string here 2. + diff --git a/regression-test/data/load_p0/stream_load/test_load_with_jsonpath2.json b/regression-test/data/load_p0/stream_load/test_load_with_jsonpath2.json new file mode 100644 index 00000000000000..0028701ca0efc8 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_load_with_jsonpath2.json @@ -0,0 +1,20 @@ +[ + { + "k1": "7395.231067", + "k2": "1000", + "k3": "I am a long string here.", + "k4": "I am another long string here 2." + }, + { + "k1": "7291.703724", + "k2": "2000", + "k3": "I am a long string here.", + "k4": "I am another long string here 3." + }, + { + "k1": "22", + "k2": "7291", + "k3": "I am a long string here.", + "k4": "I am another long string here 4." + } +] \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/load_json_with_jsonpath.groovy b/regression-test/suites/load_p0/stream_load/load_json_with_jsonpath.groovy index 963cfbfa9a6009..d28a17140b85a2 100644 --- a/regression-test/suites/load_p0/stream_load/load_json_with_jsonpath.groovy +++ b/regression-test/suites/load_p0/stream_load/load_json_with_jsonpath.groovy @@ -34,7 +34,7 @@ suite("test_load_json_with_jsonpath", "p0") { """ } - def load_array_data = {new_json_reader_flag, table_name, strip_flag, read_flag, format_flag, exprs, json_paths, + def load_array_data = { table_name, strip_flag, read_flag, format_flag, exprs, json_paths, json_root, where_expr, fuzzy_flag, column_sep, file_name -> // load the json data streamLoad { @@ -82,17 +82,98 @@ suite("test_load_json_with_jsonpath", "p0") { create_test_table.call() - load_array_data.call('false', testTable, 'true', '', 'json', '', '["$.k1", "$.v1"]', '', '', '', '', 'test_load_with_jsonpath.json') + load_array_data.call(testTable, 'true', '', 'json', '', '["$.k1", "$.v1"]', '', '', '', '', 'test_load_with_jsonpath.json') check_data_correct(testTable) // test new json load, should be deleted after new_load_scan ready sql "DROP TABLE IF EXISTS ${testTable}" create_test_table.call() - load_array_data.call('true', testTable, 'true', '', 'json', '', '["$.k1", "$.v1"]', '', '', '', '', 'test_load_with_jsonpath.json') + load_array_data.call(testTable, 'true', '', 'json', '', '["$.k1", "$.v1"]', '', '', '', '', 'test_load_with_jsonpath.json') check_data_correct(testTable) } finally { try_sql("DROP TABLE IF EXISTS ${testTable}") } + + // case2 with duplicate json paths + try { + sql "DROP TABLE IF EXISTS tbl_test_load_json_with_jsonpath2" + + sql """ + CREATE TABLE IF NOT EXISTS tbl_test_load_json_with_jsonpath2 ( + `k1` varchar(128) NULL COMMENT "", + `k2` int NULL COMMENT "", + `k22` int NULL COMMENT "", + `k222` int NULL COMMENT "", + `k2222` int NULL COMMENT "", + `k3` STRING NULL COMMENT "", + `k33` STRING NULL COMMENT "", + `k333` STRING NULL COMMENT "", + `k3333` STRING NULL COMMENT "", + `k4` STRING NULL COMMENT "", + `k44` STRING NULL COMMENT "", + `k444` STRING NULL COMMENT "", + `k4444` STRING NULL COMMENT "", + `k44444` STRING NULL COMMENT "", + `k444444` STRING NULL COMMENT "", + `k4444444` STRING NULL COMMENT "", + `k44444444` STRING NULL COMMENT "", + ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + + load_array_data.call( + 'tbl_test_load_json_with_jsonpath2', + 'true', + '', + 'json', + '', + '["$.k1", "$.k2", "$.k2", "$.k2", "$.k3", "$.k3", "$.k3","$.k3", "$.k4", "$.k4", "$.k4", "$.k4", "$.k4", "$.k4", "$.k4", "$.k4"]', + '', '', '', '', 'test_load_with_jsonpath2.json') + + qt_test_select2 "select * from tbl_test_load_json_with_jsonpath2 order by k1" + + } finally { + + } + + // case3 without json paths + try { + sql "DROP TABLE IF EXISTS tbl_test_load_json_with_jsonpath3" + + sql """ + CREATE TABLE IF NOT EXISTS tbl_test_load_json_with_jsonpath3 ( + `k1` varchar(128) NULL COMMENT "", + `k2` int NULL COMMENT "", + `k3` STRING NULL COMMENT "", + `k4` STRING NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + + load_array_data.call( + 'tbl_test_load_json_with_jsonpath3', + 'true', + '', + 'json', + '', + '[]', + '', '', '', '', 'test_load_with_jsonpath2.json') + + qt_test_select3 "select * from tbl_test_load_json_with_jsonpath3 order by k1" + + } finally { + + } }