Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 40 additions & 15 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,6 @@ namespace doris::vectorized {
namespace {

DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool is_nullable) {
if (type == ColumnObject::MOST_COMMON_TYPE_ID) {
// JSONB type MUST NOT wrapped in ARRAY column, it should be top level.
// So we ignored num_dimensions.
return is_nullable ? make_nullable(std::make_shared<ColumnObject::MostCommonType>())
: std::make_shared<ColumnObject::MostCommonType>();
}
DataTypePtr result = DataTypeFactory::instance().create_data_type(type, is_nullable);
for (size_t i = 0; i < num_dimensions; ++i) {
result = std::make_shared<DataTypeArray>(result);
Expand Down Expand Up @@ -945,11 +939,12 @@ void ColumnObject::Subcolumn::get(size_t n, Field& res) const {
res = Null();
return;
}
if (is_finalized()) {
if (least_common_type.get_base_type_id() == TypeIndex::JSONB) {
// JsonbFiled is special case
res = JsonbField();
}

// JSONB is a special type, it's not a scalar type, we need to handle it specially
// 1. we try to get the JSONB Field from ColumnString which has no JSONB type info
// 2. Array of JSONB is a special type, we get from ColumnArray of ColumnString, should convert from string Field to JSONB Field
if (is_finalized() && least_common_type.get_base_type_id() != TypeIndex::JSONB) {
// common type to get the field value
get_finalized_column().get(n, res);
return;
}
Expand All @@ -965,11 +960,20 @@ void ColumnObject::Subcolumn::get(size_t n, Field& res) const {
const auto& part = data[i];
const auto& part_type = data_types[i];
if (ind < part->size()) {
res = vectorized::remove_nullable(part_type)->get_default();
auto non_nullable_type = vectorized::remove_nullable(part_type);
bool is_nested_array_of_jsonb =
non_nullable_type->equals(*NESTED_TYPE_AS_ARRAY_OF_JSONB);

res = non_nullable_type->get_default();
part->get(ind, res);
Field new_field;
convert_field_to_type(res, *least_common_type.get(), &new_field);
res = new_field;

if (is_nested_array_of_jsonb) {
convert_array_string_to_array_jsonb(res);
} else {
Field new_field;
convert_field_to_type(res, *least_common_type.get(), &new_field);
res = new_field;
}
return;
}

Expand Down Expand Up @@ -1861,6 +1865,10 @@ const DataTypePtr ColumnObject::NESTED_TYPE = std::make_shared<vectorized::DataT
std::make_shared<vectorized::DataTypeArray>(std::make_shared<vectorized::DataTypeNullable>(
std::make_shared<vectorized::DataTypeObject>())));

const DataTypePtr ColumnObject::NESTED_TYPE_AS_ARRAY_OF_JSONB =
std::make_shared<vectorized::DataTypeArray>(std::make_shared<vectorized::DataTypeNullable>(
std::make_shared<vectorized::DataTypeJsonb>()));

DataTypePtr ColumnObject::get_root_type() const {
return subcolumns.get_root()->data.get_least_common_type();
}
Expand Down Expand Up @@ -2055,4 +2063,21 @@ bool ColumnObject::try_insert_default_from_nested(const Subcolumns::NodePtr& ent
return true;
}

void ColumnObject::Subcolumn::convert_array_string_to_array_jsonb(Field& array_field) {
if (array_field.is_null()) {
return;
}
if (array_field.get_type() != Field::Types::Array) {
return;
}
Field converted_res = Array();
for (auto& item : array_field.get<Array&>()) {
DCHECK(item.get_type() == Field::Types::String);
auto& string_item = item.get<String&>();
Field jsonb_item = JsonbField(string_item.c_str(), string_item.size());
converted_res.get<Array&>().emplace_back(std::move(jsonb_item));
}
array_field = std::move(converted_res);
}

} // namespace doris::vectorized
6 changes: 6 additions & 0 deletions be/src/vec/columns/column_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {
constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB;
// Nullable(Array(Nullable(Object)))
const static DataTypePtr NESTED_TYPE;
// Array(Nullable(Jsonb))
const static DataTypePtr NESTED_TYPE_AS_ARRAY_OF_JSONB;

// Finlize mode for subcolumns, write mode will estimate which subcolumns are sparse columns(too many null values inside column),
// merge and encode them into a shared column in root column. Only affects in flush block to segments.
// Otherwise read mode should be as default mode.
Expand Down Expand Up @@ -177,6 +180,9 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {

void add_new_column_part(DataTypePtr type);

/// Converts Array<String> to Array<JsonbField> for special case handling
static void convert_array_string_to_array_jsonb(Field& array_field);

friend class ColumnObject;

private:
Expand Down
11 changes: 7 additions & 4 deletions be/src/vec/data_types/convert_field_to_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "common/exception.h"
#include "common/status.h"
#include "util/bitmap_value.h"
#include "util/jsonb_document.h"
#include "util/jsonb_writer.h"
#include "vec/common/field_visitors.h"
#include "vec/common/typeid_cast.h"
Expand Down Expand Up @@ -111,6 +112,11 @@ class FieldVisitorToJsonb : public StaticVisitor<void> {
writer->writeString(x);
writer->writeEndString();
}
void operator()(const JsonbField& x, JsonbWriter* writer) const {
JsonbDocument* doc;
THROW_IF_ERROR(JsonbDocument::checkAndCreateDocument(x.get_value(), x.get_size(), &doc));
writer->writeValue(doc->getValue());
}
void operator()(const Array& x, JsonbWriter* writer) const;

void operator()(const Tuple& x, JsonbWriter* writer) const {
Expand Down Expand Up @@ -146,9 +152,6 @@ class FieldVisitorToJsonb : public StaticVisitor<void> {
void operator()(const Map& x, JsonbWriter* writer) const {
throw doris::Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, "Not implemeted");
}
void operator()(const JsonbField& x, JsonbWriter* writer) const {
throw doris::Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, "Not implemeted");
}
};

void FieldVisitorToJsonb::operator()(const Array& x, JsonbWriter* writer) const {
Expand Down Expand Up @@ -316,4 +319,4 @@ void convert_field_to_type(const Field& from_value, const IDataType& to_type, Fi
return convert_field_to_typeImpl(from_value, to_type, from_type_hint, to);
}
}
} // namespace doris::vectorized
} // namespace doris::vectorized
30 changes: 21 additions & 9 deletions be/src/vec/functions/function_cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,7 @@ struct ConvertNothingToJsonb {
}
};

template <TypeIndex type_index, typename ColumnType>
template <TypeIndex type_index, typename ColumnType, typename ToDataType>
struct ConvertImplFromJsonb {
static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
const size_t result, size_t input_rows_count) {
Expand Down Expand Up @@ -897,6 +897,18 @@ struct ConvertImplFromJsonb {
res[i] = 0;
continue;
}

// if value is string, convert by parse, otherwise the result is null if ToDataType is not string
if (value->isString()) {
const auto* blob = static_cast<const JsonbBlobVal*>(value);
const auto& data = blob->getBlob();
size_t len = blob->getBlobLen();
ReadBuffer rb((char*)(data), len);
bool parsed = try_parse_impl<ToDataType>(res[i], rb, context);
null_map[i] = !parsed;
continue;
}

if constexpr (type_index == TypeIndex::UInt8) {
// cast from json value to boolean type
if (value->isTrue()) {
Expand Down Expand Up @@ -1991,22 +2003,22 @@ class FunctionCast final : public IFunctionBase {
bool jsonb_string_as_string) const {
switch (to_type->get_type_id()) {
case TypeIndex::UInt8:
return &ConvertImplFromJsonb<TypeIndex::UInt8, ColumnUInt8>::execute;
return &ConvertImplFromJsonb<TypeIndex::UInt8, ColumnUInt8, DataTypeUInt8>::execute;
case TypeIndex::Int8:
return &ConvertImplFromJsonb<TypeIndex::Int8, ColumnInt8>::execute;
return &ConvertImplFromJsonb<TypeIndex::Int8, ColumnInt8, DataTypeInt8>::execute;
case TypeIndex::Int16:
return &ConvertImplFromJsonb<TypeIndex::Int16, ColumnInt16>::execute;
return &ConvertImplFromJsonb<TypeIndex::Int16, ColumnInt16, DataTypeInt16>::execute;
case TypeIndex::Int32:
return &ConvertImplFromJsonb<TypeIndex::Int32, ColumnInt32>::execute;
return &ConvertImplFromJsonb<TypeIndex::Int32, ColumnInt32, DataTypeInt32>::execute;
case TypeIndex::Int64:
return &ConvertImplFromJsonb<TypeIndex::Int64, ColumnInt64>::execute;
return &ConvertImplFromJsonb<TypeIndex::Int64, ColumnInt64, DataTypeInt64>::execute;
case TypeIndex::Int128:
return &ConvertImplFromJsonb<TypeIndex::Int128, ColumnInt128>::execute;
return &ConvertImplFromJsonb<TypeIndex::Int128, ColumnInt128, DataTypeInt128>::execute;
case TypeIndex::Float64:
return &ConvertImplFromJsonb<TypeIndex::Float64, ColumnFloat64>::execute;
return &ConvertImplFromJsonb<TypeIndex::Float64, ColumnFloat64,
DataTypeFloat64>::execute;
case TypeIndex::String:
if (!jsonb_string_as_string) {
// Conversion from String through parsing.
return &ConvertImplGenericToString::execute2;
} else {
return ConvertImplGenericFromJsonb::execute;
Expand Down
12 changes: 11 additions & 1 deletion be/src/vec/json/json_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,14 @@ void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext&
if (element.isObject()) {
traverseObject(element.getObject(), ctx);
} else if (element.isArray()) {
if (ctx.has_nested_in_flatten) {
throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
"Nesting of array in Nested array within variant subcolumns is "
"currently not supported.");
}
has_nested = false;
check_has_nested_object(element);
ctx.has_nested_in_flatten = has_nested && ctx.enable_flatten_nested;
if (has_nested && !ctx.enable_flatten_nested) {
// Parse nested arrays to JsonbField
JsonbWriter writer;
Expand All @@ -71,6 +77,8 @@ void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext&
} else {
traverseArray(element.getArray(), ctx);
}
// we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array
ctx.has_nested_in_flatten = false;
} else {
ctx.paths.push_back(ctx.builder.get_parts());
ctx.values.push_back(getValueAsField(element));
Expand Down Expand Up @@ -137,6 +145,7 @@ template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArray(const JSONArray& array, ParseContext& ctx) {
/// Traverse elements of array and collect an array of fields by each path.
ParseArrayContext array_ctx;
array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
array_ctx.total_size = array.size();
for (auto it = array.begin(); it != array.end(); ++it) {
traverseArrayElement(*it, array_ctx);
Expand All @@ -162,8 +171,9 @@ template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
ParseArrayContext& ctx) {
ParseContext element_ctx;
element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
traverse(element, element_ctx);
auto& [_, paths, values, flatten_nested] = element_ctx;
auto& [_, paths, values, flatten_nested, has_nested] = element_ctx;
size_t size = paths.size();
size_t keys_to_update = ctx.arrays_by_path.size();
for (size_t i = 0; i < size; ++i) {
Expand Down
2 changes: 2 additions & 0 deletions be/src/vec/json/json_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ class JSONDataParser {
std::vector<PathInData::Parts> paths;
std::vector<Field> values;
bool enable_flatten_nested = false;
bool has_nested_in_flatten = false;
};
using PathPartsWithArray = std::pair<PathInData::Parts, Array>;
using PathToArray = phmap::flat_hash_map<UInt128, PathPartsWithArray, UInt128TrivialHash>;
Expand All @@ -157,6 +158,7 @@ class JSONDataParser {
size_t total_size = 0;
PathToArray arrays_by_path;
KeyToSizes nested_sizes_by_key;
bool has_nested_in_flatten = false;
};
void traverse(const Element& element, ParseContext& ctx);
void traverseObject(const JSONObject& object, ParseContext& ctx);
Expand Down
130 changes: 130 additions & 0 deletions be/test/vec/columns/column_object_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,134 @@ TEST_F(ColumnObjectTest, test_pop_back_multiple_types) {
EXPECT_EQ(subcolumn.get_least_common_type()->get_name(), "Nothing");
}

TEST_F(ColumnObjectTest, test_nested_array_of_jsonb_get) {
// Test case: Create a ColumnObject with subcolumn type Array<JSONB>

// Create a ColumnObject with subcolumns
auto variant_column = ColumnObject::create(true);

// Add subcolumn with path "nested.array"
variant_column->add_sub_column(PathInData("nested.array"), 0);

// Get the subcolumn and manually set its type to Array<JSONB>
auto* subcolumn = variant_column->get_subcolumn(PathInData("nested.array"));
ASSERT_NE(subcolumn, nullptr);

// Create test data: Array of strings
Field array_of_strings = Array();

// Add string elements to the array
std::string test_data1 = R"("a")";
std::string test_data2 = R"(b)";

array_of_strings.get<Array&>().emplace_back(test_data1);
array_of_strings.get<Array&>().emplace_back(test_data2);

// Insert the array field into the subcolumn
subcolumn->insert(array_of_strings);

// Test 1: the column and test get method
{
EXPECT_TRUE(variant_column->is_finalized());
// check the subcolumn get method
Field result;
EXPECT_NO_THROW(subcolumn->get(0, result));

// Verify the result is still an array
EXPECT_EQ(result.get_type(), doris::vectorized::Field::Types::Array);

const auto& result_array = result.get<const Array&>();
EXPECT_EQ(result_array.size(), 2);

// Check that all elements are JSONB fields
for (const auto& item : result_array) {
EXPECT_EQ(item.get_type(), doris::vectorized::Field::Types::String);
}

// Verify string content is preserved
const auto& string1 = result_array[0].get<const String&>();
const auto& string2 = result_array[1].get<const String&>();

EXPECT_EQ(string1, R"("a")"); // "\"a\""
EXPECT_EQ(string2, R"(b)"); // "b"
}

// Test 2: Test with a row of different type of array to test the subcolumn get method
{
// Add another row with different int array
Field int_array = Array();
int_array.get<Array&>().push_back(1);
int_array.get<Array&>().push_back(2);
int_array.get<Array&>().push_back(3);

// and we should add more data to the subcolumn column
subcolumn->insert(int_array);

EXPECT_FALSE(variant_column->is_finalized());
// check the subcolumn get method
Field result;
EXPECT_NO_THROW(subcolumn->get(1, result));
EXPECT_EQ(result.get_type(), doris::vectorized::Field::Types::Array);
const auto& result_array = result.get<const Array&>();
EXPECT_EQ(result_array.size(), 3);
EXPECT_EQ(result_array[0].get_type(), doris::vectorized::Field::Types::JSONB);
EXPECT_EQ(result_array[1].get_type(), doris::vectorized::Field::Types::JSONB);
EXPECT_EQ(result_array[2].get_type(), doris::vectorized::Field::Types::JSONB);

// check the first row Field is a string
Field result_string;
EXPECT_NO_THROW(subcolumn->get(0, result_string));
EXPECT_EQ(result_string.get_type(), doris::vectorized::Field::Types::Array);
const auto& result_string_array = result_string.get<const Array&>();
EXPECT_EQ(result_string_array.size(), 2);
EXPECT_EQ(result_string_array[0].get_type(), doris::vectorized::Field::Types::JSONB);
EXPECT_EQ(result_string_array[1].get_type(), doris::vectorized::Field::Types::JSONB);

// Finalize -> we should get the least common type of the subcolumn
variant_column->finalize();
EXPECT_TRUE(variant_column->is_finalized());
// we should get another subcolumn from the variant column
auto* subcolumn_finalized = variant_column->get_subcolumn(PathInData("nested.array"));
ASSERT_NE(subcolumn_finalized, nullptr);
// check the subcolumn_finalized get method
Field result1, result2;
EXPECT_NO_THROW(subcolumn_finalized->get(0, result1));
EXPECT_NO_THROW(subcolumn_finalized->get(1, result2));

// Verify both results are arrays
EXPECT_EQ(result1.get_type(), doris::vectorized::Field::Types::Array);
EXPECT_EQ(result2.get_type(), doris::vectorized::Field::Types::Array);

const auto& array1 = result1.get<const Array&>();
const auto& array2 = result2.get<const Array&>();

EXPECT_EQ(array1.size(), 2);
EXPECT_EQ(array2.size(), 3);

// Verify all elements are JSONB
for (const auto& item : array1) {
EXPECT_EQ(item.get_type(), doris::vectorized::Field::Types::JSONB);
}
for (const auto& item : array2) {
EXPECT_EQ(item.get_type(), doris::vectorized::Field::Types::JSONB);
}
}

// Test 4: Test with empty array
{
auto* subcolumn = variant_column->get_subcolumn(PathInData("nested.array"));
ASSERT_NE(subcolumn, nullptr);
Field empty_array_field = Array();
subcolumn->insert(empty_array_field);

EXPECT_TRUE(variant_column->is_finalized());
// check the subcolumn get method
Field result;
EXPECT_NO_THROW(subcolumn->get(2, result));
EXPECT_EQ(result.get_type(), doris::vectorized::Field::Types::Array);
const auto& result_array = result.get<const Array&>();
EXPECT_EQ(result_array.size(), 0);
}
}

} // namespace doris::vectorized
Loading
Loading