diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 392d186c8fd0ae..1d2e1c092b0054 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -38,20 +38,30 @@ namespace doris::vectorized { template void ColumnStr::sanity_check() const { +#ifndef NDEBUG + sanity_check_simple(); auto count = offsets.size(); - if (chars.size() != offsets[count - 1]) { - throw Exception(Status::FatalError("row count: {}, chars.size(): {}, offset[{}]: ", count, - chars.size(), count - 1, offsets[count - 1])); - } - if (offsets[-1] != 0) { - throw Exception(Status::FatalError("wrong offsets[-1]: {}", offsets[-1])); - } for (size_t i = 0; i < count; ++i) { if (offsets[i] < offsets[i - 1]) { - throw Exception(Status::FatalError("row count: {}, offsets[{}]: {}, offsets[{}]: {}", - count, i, offsets[i], i - 1, offsets[i - 1])); + throw Exception(Status::InternalError("row count: {}, offsets[{}]: {}, offsets[{}]: {}", + count, i, offsets[i], i - 1, offsets[i - 1])); } } +#endif +} + +template +void ColumnStr::sanity_check_simple() const { +#ifndef NDEBUG + auto count = offsets.size(); + if (chars.size() != offsets[count - 1]) { + throw Exception(Status::InternalError("row count: {}, chars.size(): {}, offset[{}]: ", + count, chars.size(), offsets[count - 1])); + } + if (offsets[-1] != 0) { + throw Exception(Status::InternalError("wrong offsets[-1]: {}", offsets[-1])); + } +#endif } template @@ -147,6 +157,7 @@ void ColumnStr::insert_range_from_ignore_overflow(const doris::vectorized::IC src_concrete.offsets[start + i] - nested_offset + prev_max_offset; } } + sanity_check_simple(); } template @@ -201,6 +212,7 @@ void ColumnStr::insert_range_from(const IColumn& src, size_t start, size_t le } else { do_insert(assert_cast&>(src)); } + sanity_check_simple(); } template @@ -223,6 +235,7 @@ void ColumnStr::insert_many_from(const IColumn& src, size_t position, size_t offsets[start_pos] = static_cast(prev_pos + data_length); prev_pos = prev_pos + data_length; } + sanity_check_simple(); } template @@ -268,6 +281,7 @@ void ColumnStr::insert_indices_from(const IColumn& src, const uint32_t* indic } else { do_insert(assert_cast&>(src)); } + sanity_check_simple(); } template @@ -308,6 +322,7 @@ ColumnPtr ColumnStr::filter(const IColumn::Filter& filt, ssize_t result_size_ filter_arrays_impl(chars, offsets, res_chars, res_offsets, filt, result_size_hint); + sanity_check_simple(); return res; } else { throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, @@ -324,7 +339,9 @@ size_t ColumnStr::filter(const IColumn::Filter& filter) { } if constexpr (std::is_same_v) { - return filter_arrays_impl(chars, offsets, filter); + auto res = filter_arrays_impl(chars, offsets, filter); + sanity_check(); + return res; } else { throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "should not call filter in ColumnStr"); @@ -344,6 +361,7 @@ Status ColumnStr::filter_by_selector(const uint16_t* sel, size_t sel_size, IC } filter_arrays_impl(chars, offsets, res_chars, res_offsets, filter, sel_size); + sanity_check(); return Status::OK(); } else { return Status::InternalError("should not call filter_by_selector in ColumnStr"); @@ -400,7 +418,7 @@ ColumnPtr ColumnStr::permute(const IColumn::Permutation& perm, size_t limit) current_new_offset += string_size; res_offsets[i] = current_new_offset; } - + sanity_check(); return res; } @@ -432,6 +450,7 @@ const char* ColumnStr::deserialize_and_insert_from_arena(const char* pos) { memcpy(chars.data() + old_size, pos, string_size); offsets.push_back(new_size); + sanity_check_simple(); return pos + string_size; } @@ -569,28 +588,25 @@ ColumnPtr ColumnStr::replicate(const IColumn::Offsets& replicate_offsets) con res_chars.reserve(chars.size() / col_size * replicate_offsets.back()); res_offsets.reserve(replicate_offsets.back()); - T prev_replicate_offset = 0; - T prev_string_offset = 0; T current_new_offset = 0; - for (size_t i = 0; i < col_size; ++i) { - T size_to_replicate = replicate_offsets[i] - prev_replicate_offset; - T string_size = offsets[i] - prev_string_offset; + size_t size_to_replicate = replicate_offsets[i] - replicate_offsets[i - 1]; + T string_size = offsets[i] - offsets[i - 1]; + + check_chars_length(res_chars.size() + size_to_replicate * string_size, + res_offsets.size() + size_to_replicate, col_size); + res_chars.resize(res_chars.size() + size_to_replicate * string_size); for (size_t j = 0; j < size_to_replicate; ++j) { + memcpy_small_allow_read_write_overflow15(&res_chars[current_new_offset], + &chars[offsets[i - 1]], string_size); current_new_offset += string_size; res_offsets.push_back(current_new_offset); - - res_chars.resize(res_chars.size() + string_size); - memcpy_small_allow_read_write_overflow15(&res_chars[res_chars.size() - string_size], - &chars[prev_string_offset], string_size); } - - prev_replicate_offset = replicate_offsets[i]; - prev_string_offset = offsets[i]; } check_chars_length(res_chars.size(), res_offsets.size(), col_size); + sanity_check_simple(); return res; } @@ -605,9 +621,11 @@ void ColumnStr::resize(size_t n) { auto origin_size = size(); if (origin_size > n) { offsets.resize(n); + chars.resize(offsets[n - 1]); } else if (origin_size < n) { insert_many_defaults(n - origin_size); } + sanity_check_simple(); } template diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 663badc839dc44..287ef66a2d9f14 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -111,6 +111,8 @@ class ColumnStr final : public COWHelper> { bool is_variable_length() const override { return true; } // used in string ut testd void sanity_check() const; + void sanity_check_simple() const; + std::string get_name() const override { return "String"; } size_t size() const override { return offsets.size(); } @@ -166,6 +168,7 @@ class ColumnStr final : public COWHelper> { chars.resize(new_size); memcpy(chars.data() + old_size, s.data, size_to_append); offsets.push_back(new_size); + sanity_check_simple(); } void insert_many_from(const IColumn& src, size_t position, size_t length) override; @@ -192,6 +195,7 @@ class ColumnStr final : public COWHelper> { size_to_append); offsets.push_back(new_size); } + sanity_check_simple(); } void insert_data(const char* pos, size_t length) override { @@ -204,6 +208,7 @@ class ColumnStr final : public COWHelper> { memcpy(chars.data() + old_size, pos, length); } offsets.push_back(new_size); + sanity_check_simple(); } void insert_data_without_reserve(const char* pos, size_t length) { @@ -216,6 +221,7 @@ class ColumnStr final : public COWHelper> { memcpy(chars.data() + old_size, pos, length); } offsets.push_back_without_reserve(new_size); + sanity_check_simple(); } /// Before insert strings, the caller should calculate the total size of strings, @@ -249,6 +255,7 @@ class ColumnStr final : public COWHelper> { } check_chars_length(offset, offsets.size()); chars.resize(offset); + sanity_check_simple(); } void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_, @@ -274,6 +281,7 @@ class ColumnStr final : public COWHelper> { offsets_ptr[i] = tail_offset + offsets_[i + 1] - begin_offset; } DCHECK(chars.size() == offsets.back()); + sanity_check_simple(); } void insert_many_strings(const StringRef* strings, size_t num) override { @@ -296,6 +304,7 @@ class ColumnStr final : public COWHelper> { } offsets.push_back(offset); } + sanity_check_simple(); } template @@ -320,6 +329,7 @@ class ColumnStr final : public COWHelper> { offsets.push_back(offset); } chars.resize(old_size + new_size); + sanity_check_simple(); } void insert_many_strings_overflow(const StringRef* strings, size_t num, @@ -337,6 +347,7 @@ class ColumnStr final : public COWHelper> { } else { insert_many_strings(strings, num); } + sanity_check_simple(); } void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, @@ -367,12 +378,14 @@ class ColumnStr final : public COWHelper> { memcpy(chars.data() + old_size, src.data, src.size); old_size += src.size; } + sanity_check_simple(); } void pop_back(size_t n) override { size_t nested_n = offsets.back() - offset_at(offsets.size() - n); chars.resize(chars.size() - nested_n); offsets.resize_assume_reserved(offsets.size() - n); + sanity_check_simple(); } StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override; @@ -489,6 +502,7 @@ class ColumnStr final : public COWHelper> { void insert_many_defaults(size_t length) override { offsets.resize_fill(offsets.size() + length, static_cast(chars.size())); + sanity_check_simple(); } int compare_at(size_t n, size_t m, const IColumn& rhs_, diff --git a/be/src/vec/functions/function_decode_varchar.cpp b/be/src/vec/functions/function_decode_varchar.cpp index be41df0e08232b..65b95de6c6d912 100644 --- a/be/src/vec/functions/function_decode_varchar.cpp +++ b/be/src/vec/functions/function_decode_varchar.cpp @@ -106,6 +106,7 @@ class FunctionDecodeAsVarchar : public IFunction { simd::reverse_copy_bytes(col_res_data.data() + col_res_offset[i - 1], str_size, ui8_ptr + sizeof(IntegerType) - str_size, str_size); } + col_res_data.resize(col_res_offset[col_res_offset.size() - 1]); block.get_by_position(result).column = std::move(col_res); diff --git a/be/src/vec/functions/function_ip.h b/be/src/vec/functions/function_ip.h index 17f9e07e46a857..5fed82d6e10a44 100644 --- a/be/src/vec/functions/function_ip.h +++ b/be/src/vec/functions/function_ip.h @@ -335,6 +335,7 @@ class FunctionIPv6NumToString : public IFunction { process_ipv6_column(column, input_rows_count, vec_res, offsets_res, null_map, ipv6_address_data); } + vec_res.resize(offsets_res[offsets_res.size() - 1]); block.replace_by_position(result, ColumnNullable::create(std::move(col_res), std::move(null_map))); @@ -1319,6 +1320,8 @@ class FunctionCutIPv6 : public IFunction { offsets_res[i] = cast_set(pos - begin); } + chars_res.resize(offsets_res[offsets_res.size() - 1]); + block.replace_by_position(result, std::move(col_res)); return Status::OK(); } diff --git a/be/src/vec/functions/function_uuid.cpp b/be/src/vec/functions/function_uuid.cpp index 1fb2f855bbb6df..f74c95db708e94 100644 --- a/be/src/vec/functions/function_uuid.cpp +++ b/be/src/vec/functions/function_uuid.cpp @@ -180,6 +180,7 @@ class FunctionInttoUuid : public IFunction { col_offset[row] = col_offset[row - 1] + str_length; deserialize((char*)arg, col_data.data() + str_length * row); } + col_data.resize(str_length * input_rows_count); block.replace_by_position(result, std::move(result_column)); return Status::OK(); }