From ea1250325bd893781e6d9e64b68e71dff61afb54 Mon Sep 17 00:00:00 2001 From: zclllyybb Date: Thu, 8 May 2025 19:25:38 +0800 Subject: [PATCH] [Fix](field) Fix potential memory leak and wrong binary reading about JsonbField (#50174) --- be/src/runtime/jsonb_value.h | 10 +- be/src/vec/columns/column_string.h | 2 + be/src/vec/common/string_buffer.hpp | 3 + be/src/vec/core/field.cpp | 7 +- be/src/vec/core/field.h | 62 +++++------ be/src/vec/data_types/data_type_jsonb.h | 25 +++-- be/src/vec/io/io_helper.h | 12 ++- be/src/vec/io/var_int.h | 35 ++++-- be/test/vec/core/field_test.cpp | 138 +++++++++++++++++++++++- 9 files changed, 227 insertions(+), 67 deletions(-) diff --git a/be/src/runtime/jsonb_value.h b/be/src/runtime/jsonb_value.h index 1df9469e1720cd..951b91b2f34805 100644 --- a/be/src/runtime/jsonb_value.h +++ b/be/src/runtime/jsonb_value.h @@ -42,8 +42,8 @@ struct JsonBinaryValue { size_t len = 0; JsonbParser parser; - JsonBinaryValue() : ptr(nullptr), len(0) {} - JsonBinaryValue(char* ptr, int len) { + JsonBinaryValue() = default; + JsonBinaryValue(char* ptr, size_t len) { static_cast(from_json_string(const_cast(ptr), len)); } JsonBinaryValue(const std::string& s) { @@ -51,11 +51,11 @@ struct JsonBinaryValue { } JsonBinaryValue(const char* ptr, int len) { static_cast(from_json_string(ptr, len)); } - const char* value() { return ptr; } + const char* value() const { return ptr; } - size_t size() { return len; } + size_t size() const { return len; } - void replace(char* ptr, int len) { + void replace(const char* ptr, int len) { this->ptr = ptr; this->len = len; } diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 94c835f600b310..3c9c5829e35ba8 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -165,6 +165,8 @@ class ColumnStr final : public COWHelper> { check_chars_length(new_size, old_size + 1); chars.resize(new_size); + DCHECK(s.data != nullptr); + DCHECK(chars.data() != nullptr); memcpy(chars.data() + old_size, s.data, size_to_append); offsets.push_back(new_size); sanity_check_simple(); diff --git a/be/src/vec/common/string_buffer.hpp b/be/src/vec/common/string_buffer.hpp index 8dca6f057a26d3..6319f6fe5f2a64 100644 --- a/be/src/vec/common/string_buffer.hpp +++ b/be/src/vec/common/string_buffer.hpp @@ -25,6 +25,8 @@ namespace doris::vectorized { +// store and commit data. only after commit the data is effective on its' base(ColumnString) +// everytime commit, the _data add one row. class BufferWritable final { public: explicit BufferWritable(ColumnString& vector) @@ -64,6 +66,7 @@ class BufferWritable final { using VectorBufferWriter = BufferWritable; using BufferWriter = BufferWritable; +// There is consumption of the buffer in the read method. class BufferReadable { public: explicit BufferReadable(StringRef& ref) : _data(ref.data) {} diff --git a/be/src/vec/core/field.cpp b/be/src/vec/core/field.cpp index ac92605007d030..d45a1a719ac7fd 100644 --- a/be/src/vec/core/field.cpp +++ b/be/src/vec/core/field.cpp @@ -26,14 +26,9 @@ #include "vec/io/io_helper.h" #include "vec/io/var_int.h" -namespace doris { -namespace vectorized { +namespace doris::vectorized { class BufferReadable; class BufferWritable; -} // namespace vectorized -} // namespace doris - -namespace doris::vectorized { void read_binary(Array& x, BufferReadable& buf) { size_t size; diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h index 922f9abb13e03e..cceb71e53eebd3 100644 --- a/be/src/vec/core/field.h +++ b/be/src/vec/core/field.h @@ -156,59 +156,61 @@ DEFINE_FIELD_VECTOR(Map); using VariantMap = std::map; +//TODO: rethink if we really need this? it only save one pointer from std::string +// not POD type so could only use read/write_json_binary instead of read/write_binary class JsonbField { public: JsonbField() = default; + ~JsonbField() = default; // unique_ptr will handle cleanup automatically - JsonbField(const char* ptr, uint32_t len) : size(len) { - data = new char[size]; + JsonbField(const char* ptr, size_t len) : size(len) { + data = std::make_unique(size); if (!data) { LOG(FATAL) << "new data buffer failed, size: " << size; } - memcpy(data, ptr, size); + if (size > 0) { + memcpy(data.get(), ptr, size); + } } JsonbField(const JsonbField& x) : size(x.size) { - data = new char[size]; + data = std::make_unique(size); if (!data) { LOG(FATAL) << "new data buffer failed, size: " << size; } - memcpy(data, x.data, size); + if (size > 0) { + memcpy(data.get(), x.data.get(), size); + } } - JsonbField(JsonbField&& x) : data(x.data), size(x.size) { - x.data = nullptr; - x.size = 0; - } + JsonbField(JsonbField&& x) noexcept : data(std::move(x.data)), size(x.size) { x.size = 0; } + // dispatch for all type of storage. so need this. but not really used now. JsonbField& operator=(const JsonbField& x) { - data = new char[size]; - if (!data) { - LOG(FATAL) << "new data buffer failed, size: " << size; + if (this != &x) { + data = std::make_unique(x.size); + if (!data) { + LOG(FATAL) << "new data buffer failed, size: " << x.size; + } + if (x.size > 0) { + memcpy(data.get(), x.data.get(), x.size); + } + size = x.size; } - memcpy(data, x.data, size); return *this; } - JsonbField& operator=(JsonbField&& x) { - if (data) { - delete[] data; + JsonbField& operator=(JsonbField&& x) noexcept { + if (this != &x) { + data = std::move(x.data); + size = x.size; + x.size = 0; } - data = x.data; - size = x.size; - x.data = nullptr; - x.size = 0; return *this; } - ~JsonbField() { - if (data) { - delete[] data; - } - } - - const char* get_value() const { return data; } - uint32_t get_size() const { return size; } + const char* get_value() const { return data.get(); } + size_t get_size() const { return size; } bool operator<(const JsonbField& r) const { LOG(FATAL) << "comparing between JsonbField is not supported"; @@ -246,8 +248,8 @@ class JsonbField { } private: - char* data = nullptr; - uint32_t size = 0; + std::unique_ptr data = nullptr; + size_t size = 0; }; template diff --git a/be/src/vec/data_types/data_type_jsonb.h b/be/src/vec/data_types/data_type_jsonb.h index 3d681e3ce79754..12d7aafbca2d1e 100644 --- a/be/src/vec/data_types/data_type_jsonb.h +++ b/be/src/vec/data_types/data_type_jsonb.h @@ -18,9 +18,9 @@ #pragma once #include -#include -#include +#include +#include #include #include @@ -36,15 +36,13 @@ #include "vec/data_types/serde/data_type_serde.h" #include "vec/data_types/serde/data_type_string_serde.h" -namespace doris { -namespace vectorized { +namespace doris::vectorized { +#include "common/compile_check_begin.h" + class BufferWritable; class IColumn; class ReadBuffer; -} // namespace vectorized -} // namespace doris -namespace doris::vectorized { class DataTypeJsonb final : public IDataType { public: using ColumnType = ColumnString; @@ -68,10 +66,13 @@ class DataTypeJsonb final : public IDataType { MutableColumnPtr create_column() const override; - virtual Field get_default() const override { + Field get_default() const override { std::string default_json = "null"; - JsonBinaryValue binary_val(default_json.c_str(), default_json.size()); - return JsonbField(binary_val.value(), binary_val.size()); + // convert default_json to binary + JsonBinaryValue binary_val(default_json.c_str(), static_cast(default_json.size())); + // Throw exception if default_json.size() is large than INT32_MAX + // JsonbField keeps its own memory + return JsonbField(binary_val.value(), static_cast(binary_val.size())); } Field get_field(const TExprNode& node) const override { @@ -100,4 +101,6 @@ class DataTypeJsonb final : public IDataType { private: DataTypeString data_type_string; }; -} // namespace doris::vectorized \ No newline at end of file + +#include "common/compile_check_end.h" +} // namespace doris::vectorized diff --git a/be/src/vec/io/io_helper.h b/be/src/vec/io/io_helper.h index d5ca522146a1cb..f3dc3d1b131924 100644 --- a/be/src/vec/io/io_helper.h +++ b/be/src/vec/io/io_helper.h @@ -30,6 +30,7 @@ #include "vec/common/string_buffer.hpp" #include "vec/common/string_ref.h" #include "vec/common/uint128.h" +#include "vec/core/field.h" #include "vec/core/types.h" #include "vec/io/reader_buffer.h" #include "vec/io/var_int.h" @@ -126,7 +127,7 @@ inline void write_string_binary(const char* s, BufferWritable& buf) { write_string_binary(StringRef {std::string(s)}, buf); } -inline void write_json_binary(JsonbField s, BufferWritable& buf) { +inline void write_json_binary(const JsonbField& s, BufferWritable& buf) { write_string_binary(StringRef {s.get_value(), s.get_size()}, buf); } @@ -200,13 +201,14 @@ inline StringRef read_string_binary_into(Arena& arena, BufferReadable& buf) { char* data = arena.alloc(size); buf.read(data, size); - return StringRef(data, size); + return {data, size}; } -inline void read_json_binary(JsonbField val, BufferReadable& buf, +inline void read_json_binary(JsonbField& val, BufferReadable& buf, size_t MAX_JSON_SIZE = DEFAULT_MAX_JSON_SIZE) { - StringRef jrf = StringRef {val.get_value(), val.get_size()}; - read_string_binary(jrf, buf); + StringRef result; + read_string_binary(result, buf); + val = JsonbField(result.data, result.size); } template diff --git a/be/src/vec/io/var_int.h b/be/src/vec/io/var_int.h index eda4b6b3873aa8..eb06a1ccb1d147 100644 --- a/be/src/vec/io/var_int.h +++ b/be/src/vec/io/var_int.h @@ -93,35 +93,44 @@ inline void read_var_uint(UInt64& x, std::istream& istr) { for (size_t i = 0; i < 9; ++i) { UInt64 byte = istr.get(); x |= (byte & 0x7F) << (7 * i); - if (!(byte & 0x80)) return; + if (!(byte & 0x80)) { + return; + } } } inline void write_var_uint(UInt64 x, std::ostream& ostr) { for (size_t i = 0; i < 9; ++i) { uint8_t byte = x & 0x7F; - if (x > 0x7F) byte |= 0x80; + if (x > 0x7F) { + byte |= 0x80; + } ostr.put(byte); x >>= 7; - if (!x) return; + if (!x) { + return; + } } } // TODO: do real implement in the future inline void read_var_uint(UInt64& x, BufferReadable& buf) { x = 0; + // get length from first byte firstly uint8_t len = 0; buf.read((char*)&len, 1); auto ref = buf.read(len); - + // read data and set it to x per byte. char* bytes = const_cast(ref.data); for (size_t i = 0; i < 9; ++i) { UInt64 byte = bytes[i]; x |= (byte & 0x7F) << (7 * i); - if (!(byte & 0x80)) return; + if (!(byte & 0x80)) { + return; + } } } @@ -130,12 +139,16 @@ inline void write_var_uint(UInt64 x, BufferWritable& ostr) { uint8_t i = 0; while (i < 9) { uint8_t byte = x & 0x7F; - if (x > 0x7F) byte |= 0x80; + if (x > 0x7F) { + byte |= 0x80; + } bytes[i++] = byte; x >>= 7; - if (!x) break; + if (!x) { + break; + } } ostr.write((char*)&i, 1); ostr.write(bytes, i); @@ -144,13 +157,17 @@ inline void write_var_uint(UInt64 x, BufferWritable& ostr) { inline char* write_var_uint(UInt64 x, char* ostr) { for (size_t i = 0; i < 9; ++i) { uint8_t byte = x & 0x7F; - if (x > 0x7F) byte |= 0x80; + if (x > 0x7F) { + byte |= 0x80; + } *ostr = byte; ++ostr; x >>= 7; - if (!x) return ostr; + if (!x) { + return ostr; + } } return ostr; diff --git a/be/test/vec/core/field_test.cpp b/be/test/vec/core/field_test.cpp index 71d26ea4979bde..4509a4e6fe67b5 100644 --- a/be/test/vec/core/field_test.cpp +++ b/be/test/vec/core/field_test.cpp @@ -22,8 +22,13 @@ #include -#include "gtest/gtest_pred_impl.h" +#include "gtest/gtest_pred_impl.h" // IWYU pragma: keep +#include "runtime/define_primitive_type.h" +#include "vec/columns/column_string.h" +#include "vec/common/string_buffer.hpp" +#include "vec/common/string_ref.h" #include "vec/core/types.h" +#include "vec/io/io_helper.h" namespace doris::vectorized { TEST(VFieldTest, field_string) { @@ -43,4 +48,135 @@ TEST(VFieldTest, field_string) { ASSERT_EQ(f.get()[0].get(), "Hello, world (6)"); } +TEST(VFieldTest, jsonb_field_unique_ptr) { + // Test default constructor + JsonbField empty; + ASSERT_EQ(empty.get_value(), nullptr); + ASSERT_EQ(empty.get_size(), 0); + + // Test constructor with data + const char* test_data = R"({ "key": "value" })"; + size_t test_size = strlen(test_data); + JsonbField jf1(test_data, test_size); + ASSERT_NE(jf1.get_value(), nullptr); + ASSERT_EQ(jf1.get_size(), test_size); + ASSERT_EQ(std::string(jf1.get_value(), jf1.get_size()), std::string(test_data)); + + // Test copy constructor + JsonbField jf2(jf1); + ASSERT_NE(jf2.get_value(), nullptr); + ASSERT_NE(jf2.get_value(), jf1.get_value()); // Different memory locations + ASSERT_EQ(jf2.get_size(), jf1.get_size()); + ASSERT_EQ(std::string(jf2.get_value(), jf2.get_size()), + std::string(jf1.get_value(), jf1.get_size())); + + // Test move constructor + JsonbField jf3(std::move(jf2)); + ASSERT_NE(jf3.get_value(), nullptr); + ASSERT_EQ(jf2.get_value(), nullptr); // jf2 should be empty after move + ASSERT_EQ(jf2.get_size(), 0); // jf2 size should be 0 after move + ASSERT_EQ(jf3.get_size(), test_size); + ASSERT_EQ(std::string(jf3.get_value(), jf3.get_size()), std::string(test_data)); + + // Test copy assignment + JsonbField jf4; + jf4 = jf1; + ASSERT_NE(jf4.get_value(), nullptr); + ASSERT_NE(jf4.get_value(), jf1.get_value()); // Different memory locations + ASSERT_EQ(jf4.get_size(), jf1.get_size()); + ASSERT_EQ(std::string(jf4.get_value(), jf4.get_size()), + std::string(jf1.get_value(), jf1.get_size())); + + // Test move assignment + JsonbField jf5; + jf5 = std::move(jf4); + ASSERT_NE(jf5.get_value(), nullptr); + ASSERT_EQ(jf4.get_value(), nullptr); // jf4 should be empty after move + ASSERT_EQ(jf4.get_size(), 0); // jf4 size should be 0 after move + ASSERT_EQ(jf5.get_size(), test_size); + ASSERT_EQ(std::string(jf5.get_value(), jf5.get_size()), std::string(test_data)); + + // Test JsonbField with Field + Field field_jf = jf1; + ASSERT_EQ(field_jf.get_type(), Field::Types::JSONB); + ASSERT_NE(field_jf.get().get_value(), nullptr); + ASSERT_EQ(field_jf.get().get_size(), test_size); + ASSERT_EQ(std::string(field_jf.get().get_value(), + field_jf.get().get_size()), + std::string(test_data)); +} + +// Test for JsonbField I/O operations +TEST(VFieldTest, jsonb_field_io) { + // Prepare a JsonbField + const char* test_data = R"({ "key": "value" })"; + size_t test_size = strlen(test_data); + JsonbField original(test_data, test_size); + + // TEST 1: write_json_binary - From JsonbField to buffer + // Create a ColumnString to use with BufferWritable + ColumnString column_str; + + // Write the JsonbField to the buffer + { + BufferWritable buf(column_str); + write_json_binary(original, buf); + buf.commit(); // Important: commit the write operation + } + + // Verify data was written + ASSERT_GT(column_str.size(), 0); + + // Read the JsonbField back using BufferReadable + { + // Get the StringRef from ColumnString + StringRef str_ref = column_str.get_data_at(0); + + // Create a BufferReadable from StringRef + BufferReadable read_buf(str_ref); + + // Read the data back into a new JsonbField + JsonbField read_field; + read_json_binary(read_field, read_buf); + + // Verify the data + ASSERT_NE(read_field.get_value(), nullptr); + ASSERT_EQ(read_field.get_size(), original.get_size()); + ASSERT_EQ(std::string(read_field.get_value(), read_field.get_size()), + std::string(original.get_value(), original.get_size())); + } + + // Test with JsonbField as a Field and serde it + { + ColumnString field_column; + + // ser + { + BufferWritable field_buf(field_column); + write_json_binary(original, field_buf); + field_buf.commit(); + } + + // Verify field was written + ASSERT_GT(field_column.size(), 0); + + // de + { + StringRef field_str_ref = field_column.get_data_at(0); + BufferReadable read_field_buf(field_str_ref); + + // we can't use read_binary because of the JsonbField is not POD type + JsonbField jsonb_from_field; + read_json_binary(jsonb_from_field, read_field_buf); + Field f2 = jsonb_from_field; + + ASSERT_EQ(f2.get_type(), Field::Types::JSONB); + ASSERT_NE(f2.get().get_value(), nullptr); + ASSERT_EQ( + std::string(f2.get().get_value(), f2.get().get_size()), + std::string(test_data)); + } + } +} + } // namespace doris::vectorized