diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h index 5aaba4a52650b7..160c2946497379 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h @@ -45,6 +45,14 @@ namespace doris::segment_v2 { +struct PathWithColumnAndType { + vectorized::PathInData path; + vectorized::ColumnPtr column; + vectorized::DataTypePtr type; +}; + +using PathsWithColumnAndType = std::vector; + // Reader for hierarchical data for variant, merge with root(sparse encoded columns) class HierarchicalDataReader : public ColumnIterator { public: diff --git a/be/src/vec/columns/column_variant.cpp b/be/src/vec/columns/column_variant.cpp index 62a134887609c2..432b1327374225 100644 --- a/be/src/vec/columns/column_variant.cpp +++ b/be/src/vec/columns/column_variant.cpp @@ -861,11 +861,7 @@ void ColumnVariant::try_insert(const Field& field) { } const auto& object = field.get(); size_t old_size = size(); - for (const auto& [key_str, value] : object) { - PathInData key; - if (!key_str.empty()) { - key = PathInData(key_str); - } + for (const auto& [key, value] : object) { if (!has_subcolumn(key)) { bool succ = add_sub_column(key, old_size); if (!succ) { @@ -958,7 +954,7 @@ void ColumnVariant::get(size_t n, Field& res) const { entry->data.get(n, field); // Notice: we treat null as empty field, since we do not distinguish null and empty for Variant type. if (field.get_type() != PrimitiveType::TYPE_NULL) { - object.try_emplace(entry->path.get_path(), field); + object.try_emplace(entry->path, field); } } if (object.empty()) { diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h index 27ae9a7f430c53..bca1273cdc2f62 100644 --- a/be/src/vec/core/field.h +++ b/be/src/vec/core/field.h @@ -40,6 +40,7 @@ #include "util/quantile_state.h" #include "vec/common/uint128.h" #include "vec/core/types.h" +#include "vec/json/path_in_data.h" namespace doris { template @@ -82,7 +83,7 @@ struct Map : public FieldVector { using FieldVector::FieldVector; }; -using VariantMap = std::map; +using VariantMap = std::map; //TODO: rethink if we really need this? it only save one pointer from std::string // not POD type so could only use read/write_json_binary instead of read/write_binary diff --git a/be/src/vec/data_types/data_type_variant.cpp b/be/src/vec/data_types/data_type_variant.cpp index 3625f895c2808a..57e9e956b293b9 100644 --- a/be/src/vec/data_types/data_type_variant.cpp +++ b/be/src/vec/data_types/data_type_variant.cpp @@ -70,6 +70,7 @@ int64_t DataTypeVariant::get_uncompressed_serialized_bytes(const IColumn& column } PColumnMeta column_meta_pb; column_meta_pb.set_name(entry->path.get_path()); + entry->path.to_protobuf(column_meta_pb.mutable_column_path(), -1 /*not used here*/); type->to_pb_column_meta(&column_meta_pb); std::string meta_binary; column_meta_pb.SerializeToString(&meta_binary); @@ -113,6 +114,7 @@ char* DataTypeVariant::serialize(const IColumn& column, char* buf, int be_exec_v ++num_of_columns; PColumnMeta column_meta_pb; column_meta_pb.set_name(entry->path.get_path()); + entry->path.to_protobuf(column_meta_pb.mutable_column_path(), -1 /*not used here*/); type->to_pb_column_meta(&column_meta_pb); std::string meta_binary; column_meta_pb.SerializeToString(&meta_binary); @@ -173,11 +175,15 @@ const char* DataTypeVariant::deserialize(const char* buf, MutableColumnPtr* colu MutableColumnPtr sub_column = type->create_column(); buf = type->deserialize(buf, &sub_column, be_exec_version); - // add subcolumn to column_object PathInData key; - if (!column_meta_pb.name().empty()) { + if (column_meta_pb.has_column_path()) { + // init from path pb + key.from_protobuf(column_meta_pb.column_path()); + } else if (!column_meta_pb.name().empty()) { + // init from name for compatible key = PathInData {column_meta_pb.name()}; } + // add subcolumn to column_object column_object->add_sub_column(key, std::move(sub_column), type); } size_t num_rows = 0; diff --git a/be/src/vec/data_types/serde/data_type_serde.cpp b/be/src/vec/data_types/serde/data_type_serde.cpp index dfc81c03fa2edc..12b9e4e2c472e7 100644 --- a/be/src/vec/data_types/serde/data_type_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_serde.cpp @@ -57,7 +57,8 @@ void DataTypeSerDe::convert_variant_map_to_rapidjson( continue; } rapidjson::Value key; - key.SetString(item.first.data(), cast_set(item.first.size())); + key.SetString(item.first.get_path().data(), + cast_set(item.first.get_path().size())); rapidjson::Value val; convert_field_to_rapidjson(item.second, val, allocator); if (val.IsNull() && item.first.empty()) { diff --git a/be/src/vec/json/json_parser.h b/be/src/vec/json/json_parser.h index 897ba043b05e92..845aaea178de2d 100644 --- a/be/src/vec/json/json_parser.h +++ b/be/src/vec/json/json_parser.h @@ -28,6 +28,7 @@ #include #include +#include "runtime/primitive_type.h" #include "util/jsonb_writer.h" #include "vec/columns/column.h" #include "vec/common/string_ref.h" @@ -124,6 +125,13 @@ enum class ExtractType { struct ParseConfig { bool enable_flatten_nested = false; }; +/// Result of parsing of a document. +/// Contains all paths extracted from document +/// and values which are related to them. +struct ParseResult { + std::vector paths; + std::vector values; +}; template class JSONDataParser { public: diff --git a/be/src/vec/json/path_in_data.h b/be/src/vec/json/path_in_data.h index 8d94b02f37ac2b..769a4b186c7777 100644 --- a/be/src/vec/json/path_in_data.h +++ b/be/src/vec/json/path_in_data.h @@ -29,11 +29,7 @@ #include #include "gen_cpp/segment_v2.pb.h" -#include "vec/columns/column.h" #include "vec/common/uint128.h" -#include "vec/core/field.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type.h" namespace doris::vectorized { @@ -129,13 +125,6 @@ class PathInDataBuilder { size_t current_anonymous_array_level = 0; }; using PathsInData = std::vector; -/// Result of parsing of a document. -/// Contains all paths extracted from document -/// and values which are related to them. -struct ParseResult { - std::vector paths; - std::vector values; -}; struct PathInDataRef { const PathInData* ref; @@ -148,12 +137,4 @@ struct PathInDataRef { bool operator==(const PathInDataRef& other) const { return *this->ref == *other.ref; } }; -struct PathWithColumnAndType { - PathInData path; - ColumnPtr column; - DataTypePtr type; -}; - -using PathsWithColumnAndType = std::vector; - } // namespace doris::vectorized diff --git a/be/test/vec/columns/column_object_test.cpp b/be/test/vec/columns/column_object_test.cpp index 056e1ae4a2f5ff..8939fe75983632 100644 --- a/be/test/vec/columns/column_object_test.cpp +++ b/be/test/vec/columns/column_object_test.cpp @@ -21,6 +21,7 @@ #include "vec/columns/column_variant.h" #include "vec/columns/common_column_test.h" +#include "vec/json/path_in_data.h" namespace doris::vectorized { @@ -196,11 +197,11 @@ TEST_F(ColumnObjectTest, test_insert_indices_from) { Field result1; dst_column->get(0, result1); - EXPECT_EQ(result1.get().at("").get(), 123); + EXPECT_EQ(result1.get().at({}).get(), 123); Field result2; dst_column->get(1, result2); - EXPECT_EQ(result2.get().at("").get(), 456); + EXPECT_EQ(result2.get().at({}).get(), 456); } // Test case 2: Insert from scalar variant source to non-empty destination of same type @@ -237,9 +238,9 @@ TEST_F(ColumnObjectTest, test_insert_indices_from) { dst_column->get(1, result2); dst_column->get(2, result3); - EXPECT_EQ(result1.get().at("").get(), 789); - EXPECT_EQ(result2.get().at("").get(), 456); - EXPECT_EQ(result3.get().at("").get(), 123); + EXPECT_EQ(result1.get().at({}).get(), 789); + EXPECT_EQ(result2.get().at({}).get(), 456); + EXPECT_EQ(result3.get().at({}).get(), 123); } // Test case 3: Insert from non-scalar or different type source (fallback to try_insert) @@ -250,13 +251,13 @@ TEST_F(ColumnObjectTest, test_insert_indices_from) { // Create a map with {"a": 123} Field field_map = Field::create_field(VariantMap()); auto& map1 = field_map.get(); - map1["a"] = Field::create_field(123); + map1[PathInData("a")] = Field::create_field(123); src_column->try_insert(field_map); // Create another map with {"b": "hello"} field_map = Field::create_field(VariantMap()); auto& map2 = field_map.get(); - map2["b"] = Field::create_field(String("hello")); + map2[PathInData("b")] = Field::create_field(String("hello")); src_column->try_insert(field_map); src_column->finalize(); @@ -285,8 +286,8 @@ TEST_F(ColumnObjectTest, test_insert_indices_from) { const auto& result1_map = result1.get(); const auto& result2_map = result2.get(); - EXPECT_EQ(result1_map.at("b").get(), "hello"); - EXPECT_EQ(result2_map.at("a").get(), 123); + EXPECT_EQ(result1_map.at(PathInData("b")).get(), "hello"); + EXPECT_EQ(result2_map.at(PathInData("a")).get(), 123); } } diff --git a/gensrc/proto/data.proto b/gensrc/proto/data.proto index 95fb522289ee5c..54cbed7f4270f5 100644 --- a/gensrc/proto/data.proto +++ b/gensrc/proto/data.proto @@ -65,6 +65,7 @@ message PColumnMeta { optional bool result_is_nullable = 6; optional string function_name = 7; optional int32 be_exec_version = 8; + optional segment_v2.ColumnPathInfo column_path = 9; } message PBlock { diff --git a/regression-test/data/variant_p0/column_name.out b/regression-test/data/variant_p0/column_name.out index 6ac882d29225d2..0f54df05d91076 100644 --- a/regression-test/data/variant_p0/column_name.out +++ b/regression-test/data/variant_p0/column_name.out @@ -37,13 +37,25 @@ UPPER CASE lower case \N \N \N -"" -"" + + 1234566 16 8888888 -"UPPER CASE" -"dkdkdkdkdkd" -"ooaoaaaaaaa" -"xmxxmmmmmm" +UPPER CASE +dkdkdkdkdkd +ooaoaaaaaaa +xmxxmmmmmm + +-- !sql_cnt_1 -- +128 + +-- !sql_cnt_2 -- +128 + +-- !sql_cnt_3 -- +128 + +-- !sql_cnt_4 -- +128 diff --git a/regression-test/suites/variant_p0/column_name.groovy b/regression-test/suites/variant_p0/column_name.groovy index 7962112ff75f4d..7cf7fe198b154e 100644 --- a/regression-test/suites/variant_p0/column_name.groovy +++ b/regression-test/suites/variant_p0/column_name.groovy @@ -25,7 +25,7 @@ suite("regression_test_variant_column_name", "variant_type"){ ) DUPLICATE KEY(`k`) DISTRIBUTED BY HASH(k) BUCKETS 1 - properties("replication_num" = "1", "disable_auto_compaction" = "true"); + properties("replication_num" = "1", "disable_auto_compaction" = "false"); """ sql """insert into ${table_name} values (1, '{"中文" : "中文", "\\\u4E2C\\\u6587": "unicode"}')""" @@ -61,7 +61,18 @@ suite("regression_test_variant_column_name", "variant_type"){ sql """insert into var_column_name values (7, '{"": 1234566}')""" sql """insert into var_column_name values (7, '{"": 8888888}')""" - qt_sql "select Tags[''] from var_column_name order by cast(Tags[''] as string)" + qt_sql "select cast(Tags[''] as text) from var_column_name order by cast(Tags[''] as string)" + + // name with `.` + sql "truncate table var_column_name" + sql """insert into var_column_name values (7, '{"a.b": "UPPER CASE", "a.c": "lower case", "a" : {"b" : 123}, "a" : {"c" : 456}}')""" + for (int i = 0; i < 7; i++) { + sql """insert into var_column_name select * from var_column_name""" + } + qt_sql_cnt_1 "select count(Tags['a.b']) from var_column_name" + qt_sql_cnt_2 "select count(Tags['a.c']) from var_column_name" + qt_sql_cnt_3 "select count(Tags['a']['b']) from var_column_name" + qt_sql_cnt_4 "select count(Tags['a']['c']) from var_column_name" try { sql """insert into var_column_name values (7, '{"": "UPPER CASE", "": "lower case"}')"""