Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 38 additions & 6 deletions be/src/vec/data_types/serde/complex_type_deserialize_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include "common/status.h"
#include "vec/common/string_ref.h"
#include "vec/data_types/serde/data_type_serde.h"

Expand All @@ -32,23 +33,44 @@ struct ComplexTypeDeserializeUtil {
char delimiter = 0;
};

// Enhanced version with error handling
template <typename Func>
static std::vector<SplitResult> split_by_delimiter(StringRef& str, Func func) {
static Status split_by_delimiter(StringRef& str, char escape_char, Func func,
std::vector<SplitResult>& elements) {
char quote_char = 0;
int last_pos = 0;
int nested_level = 0;
bool has_quote = false;
char delimiter = 0;
std::vector<SplitResult> elements;
elements.clear(); //
for (int pos = 0; pos < str.size; ++pos) {
char c = str.data[pos];
// Idea from simdjson to handle escape characters
// Handle escape characters first
if (c == '\\') {
// count the number of consecutive backslashes
int backslash_count = 0;
while (pos < str.size && str.data[pos] == '\\') {
backslash_count++;
pos++;
}

// if the number of backslashes is odd, the next character is escaped
if (backslash_count % 2 == 1 && pos < str.size) {
pos++; // skip the escaped character
}
pos--; // backtrack, because the for loop will ++pos
continue;
}

// Handle quotes
if (c == '"' || c == '\'') {
if (!has_quote) {
quote_char = c;
has_quote = !has_quote;
has_quote = true;
} else if (has_quote && quote_char == c) {
quote_char = 0;
has_quote = !has_quote;
has_quote = false;
}
} else if (!has_quote && (c == '[' || c == '{')) {
++nested_level;
Expand All @@ -66,12 +88,22 @@ struct ComplexTypeDeserializeUtil {
}
}

elements.push_back({StringRef(str.data + last_pos, str.size - last_pos), delimiter});
// Validate final state
if (has_quote) {
return Status::InvalidArgument("Unclosed quote detected in string");
}

if (nested_level != 0) {
return Status::InvalidArgument("Unmatched brackets detected in string");
}

// Add the last element with no delimiter (or empty delimiter)
elements.push_back({StringRef(str.data + last_pos, str.size - last_pos), 0});

for (auto& e : elements) {
e.element = e.element.trim_whitespace();
}
return elements;
return Status::OK();
}

static bool is_null_string(const StringRef& str) {
Expand Down
10 changes: 8 additions & 2 deletions be/src/vec/data_types/serde/data_type_array_serde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ Status DataTypeArraySerDe::deserialize_one_cell_from_json(IColumn& column, Slice
quote_char = c;
has_quote = !has_quote;
} else if (has_quote && quote_char == c) {
// skip the quote character if it is escaped
if (idx > 0 && slice[idx - 1] == options.escape_char) {
continue;
}
quote_char = 0;
has_quote = !has_quote;
}
Expand Down Expand Up @@ -485,8 +489,10 @@ Status DataTypeArraySerDe::_from_string(StringRef& str, IColumn& column,
}
str = str.substring(1, str.size - 2); // remove '[' and ']'

auto split_result = ComplexTypeDeserializeUtil::split_by_delimiter(
str, [&](char c) { return c == options.collection_delim; });
std::vector<ComplexTypeDeserializeUtil::SplitResult> split_result;
RETURN_IF_ERROR(ComplexTypeDeserializeUtil::split_by_delimiter(
str, options.escape_char, [&](char c) { return c == options.collection_delim; },
split_result));

for (auto& e : split_result) {
RETURN_IF_ERROR(ComplexTypeDeserializeUtil::process_column<is_strict_mode>(
Expand Down
14 changes: 9 additions & 5 deletions be/src/vec/data_types/serde/data_type_map_serde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,11 +239,13 @@ Status DataTypeMapSerDe::deserialize_one_cell_from_json(IColumn& column, Slice&
quote_char = c;
has_quote = !has_quote;
} else if (has_quote && quote_char == c) {
// skip the quote character if it is escaped
if (idx > 0 && slice[idx - 1] == options.escape_char) {
continue;
}
quote_char = 0;
has_quote = !has_quote;
}
} else if (c == '\\' && idx + 1 < slice_size) { //escaped
++idx;
} else if (!has_quote && (c == '[' || c == '{')) {
++nested_level;
} else if (!has_quote && (c == ']' || c == '}')) {
Expand Down Expand Up @@ -592,9 +594,11 @@ Status DataTypeMapSerDe::_from_string(StringRef& str, IColumn& column,
}
str = str.substring(1, str.size - 2); // remove '{' '}'

auto split_result = ComplexTypeDeserializeUtil::split_by_delimiter(str, [&](char c) {
return c == options.map_key_delim || c == options.collection_delim;
});
std::vector<ComplexTypeDeserializeUtil::SplitResult> split_result;
RETURN_IF_ERROR(ComplexTypeDeserializeUtil::split_by_delimiter(
str, options.escape_char,
[&](char c) { return c == options.map_key_delim || c == options.collection_delim; },
split_result));

// check syntax error
if (split_result.size() % 2 != 0) {
Expand Down
15 changes: 9 additions & 6 deletions be/src/vec/data_types/serde/data_type_struct_serde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ Status DataTypeStructSerDe::deserialize_one_cell_from_json(IColumn& column, Slic
bool key_added = false;
int idx = 0;
char quote_char = 0;

auto elem_size = elem_serdes_ptrs.size();
DCHECK_EQ(elem_size, elem_names.size());
int field_pos = 0;
Expand All @@ -131,11 +130,13 @@ Status DataTypeStructSerDe::deserialize_one_cell_from_json(IColumn& column, Slic
quote_char = c;
has_quote = !has_quote;
} else if (has_quote && quote_char == c) {
// skip the quote character if it is escaped
if (idx > 0 && slice[idx - 1] == options.escape_char) {
continue;
}
quote_char = 0;
has_quote = !has_quote;
}
} else if (c == '\\' && idx + 1 < slice_size) { //escaped
++idx;
} else if (!has_quote && (c == '[' || c == '{')) {
++nested_level;
} else if (!has_quote && (c == ']' || c == '}')) {
Expand Down Expand Up @@ -582,9 +583,11 @@ Status DataTypeStructSerDe::_from_string(StringRef& str, IColumn& column,
}
str = str.substring(1, str.size - 2); // remove '{' '}'

auto split_result = ComplexTypeDeserializeUtil::split_by_delimiter(str, [&](char c) {
return c == options.map_key_delim || c == options.collection_delim;
});
std::vector<ComplexTypeDeserializeUtil::SplitResult> split_result;
RETURN_IF_ERROR(ComplexTypeDeserializeUtil::split_by_delimiter(
str, options.escape_char,
[&](char c) { return c == options.map_key_delim || c == options.collection_delim; },
split_result));

const auto elem_size = elem_serdes_ptrs.size();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{"id": 1, "test_name": "double_quote_test", "arr": ["normal","with\"quotes\"","end\"quote"], "map_col": {"key1":"value1","key2":"with\"quotes\"","key3":"end\"quote"}, "struct_col": {"name":"test","des":"with\"quotes\"and\"more\"quotes"}}
{"id": 2, "test_name": "double_quote_map", "arr": null, "map_col": {"key1":"value1","key2":"with\"quotes\"","key3":"end\"quote"}, "struct_col": null}
{"id": 3, "test_name": "double_quote_struct", "arr": null, "map_col": null, "struct_col": {"name":"test","des":"with\"quotes\"and\"more\"quotes"}}
{"id": 4, "test_name": "backslash_test", "arr": ["normal","with\\backslash","end\\backslash"], "map_col": {"key1":"value1","key2":"with\\backslash","key3":"end\\backslash"}, "struct_col": {"name":"test","des":"with\\backslash\\and\\more\\backslashes"}}
{"id": 5, "test_name": "backslash_map", "arr": null, "map_col": {"key1":"value1","key2":"with\\backslash","key3":"end\\backslash"}, "struct_col": null}
{"id": 6, "test_name": "backslash_struct", "arr": null, "map_col": null, "struct_col": {"name":"test","des":"with\\backslash\\and\\more\\backslashes"}}
{"id": 7, "test_name": "newline_test", "arr": ["normal","with\nnewline","end\nnewline"], "map_col": {"key1":"value1","key2":"with\nnewline","key3":"end\nnewline"}, "struct_col": {"name":"test","des":"with\nnewline\nand\nmore\nnewlines"}}
{"id": 8, "test_name": "newline_map", "arr": null, "map_col": {"key1":"value1","key2":"with\nnewline","key3":"end\nnewline"}, "struct_col": null}
{"id": 9, "test_name": "newline_struct", "arr": null, "map_col": null, "struct_col": {"name":"test","des":"with\nnewline\nand\nmore\nnewlines"}}
{"id": 10, "test_name": "tab_test", "arr": ["normal","with\ttab","end\ttab"], "map_col": {"key1":"value1","key2":"with\ttab","key3":"end\ttab"}, "struct_col": {"name":"test","des":"with\ttab\tand\tmore\ttabs"}}
{"id": 11, "test_name": "tab_map", "arr": null, "map_col": {"key1":"value1","key2":"with\ttab","key3":"end\ttab"}, "struct_col": null}
{"id": 12, "test_name": "tab_struct", "arr": null, "map_col": null, "struct_col": {"name":"test","des":"with\ttab\tand\tmore\ttabs"}}
{"id": 13, "test_name": "carriage_return_test", "arr": ["normal","with\rcarriage","end\rcarriage"], "map_col": {"key1":"value1","key2":"with\rcarriage","key3":"end\rcarriage"}, "struct_col": {"name":"test","des":"with\rcarriage\rand\rmore\rcarriages"}}
{"id": 14, "test_name": "carriage_return_map", "arr": null, "map_col": {"key1":"value1","key2":"with\rcarriage","key3":"end\rcarriage"}, "struct_col": null}
{"id": 15, "test_name": "carriage_return_struct", "arr": null, "map_col": null, "struct_col": {"name":"test","des":"with\rcarriage\rand\rmore\rcarriages"}}
{"id": 16, "test_name": "backspace_test", "arr": ["normal","with\bbackspace","end\bbackspace"], "map_col": {"key1":"value1","key2":"with\bbackspace","key3":"end\bbackspace"}, "struct_col": {"name":"test","des":"with\bbackspace\band\bmore\bbackspaces"}}
{"id": 17, "test_name": "backspace_map", "arr": null, "map_col": {"key1":"value1","key2":"with\bbackspace","key3":"end\bbackspace"}, "struct_col": null}
{"id": 18, "test_name": "backspace_struct", "arr": null, "map_col": null, "struct_col": {"name":"test","des":"with\bbackspace\band\bmore\bbackspaces"}}
{"id": 19, "test_name": "form_feed_test", "arr": ["normal","with\fform","end\fform"], "map_col": {"key1":"value1","key2":"with\fform","key3":"end\fform"}, "struct_col": {"name":"test","des":"with\fform\fand\fmore\fforms"}}
{"id": 20, "test_name": "form_feed_map", "arr": null, "map_col": {"key1":"value1","key2":"with\fform","key3":"end\fform"}, "struct_col": null}
{"id": 21, "test_name": "form_feed_struct", "arr": null, "map_col": null, "struct_col": {"name":"test","des":"with\fform\fand\fmore\fforms"}}
{"id": 22, "test_name": "forward_slash_test", "arr": ["normal","with\/slash","end\/slash"], "map_col": {"key1":"value1","key2":"with\/slash","key3":"end\/slash"}, "struct_col": {"name":"test","des":"with\/slash\/and\/more\/slashes"}}
{"id": 23, "test_name": "forward_slash_map", "arr": null, "map_col": {"key1":"value1","key2":"with\/slash","key3":"end\/slash"}, "struct_col": null}
{"id": 24, "test_name": "forward_slash_struct", "arr": null, "map_col": null, "struct_col": {"name":"test","des":"with\/slash\/and\/more\/slashes"}}
{"id": 25, "test_name": "mixed_all_test", "arr": ["all\"escape\"chars\\here\nwith\tnewlines\rand\btabs\fand\/slashes"], "map_col": {"all":"\"escape\"chars\\here\nwith\tnewlines\rand\btabs\fand\/slashes"}, "struct_col": {"name":"mixed","des":"all\"escape\"chars\\here\nwith\tnewlines\rand\btabs\fand\/slashes"}}
{"id": 26, "test_name": "mixed_all_map", "arr": null, "map_col": {"all":"\"escape\"chars\\here\nwith\tnewlines\rand\btabs\fand\/slashes"}, "struct_col": null}
{"id": 27, "test_name": "mixed_all_struct", "arr": null, "map_col": null, "struct_col": {"name":"mixed","des":"all\"escape\"chars\\here\nwith\tnewlines\rand\btabs\fand\/slashes"}}
{"id": 28, "test_name": "empty_test", "arr": ["","\"","\\","\n","\t","\r","\b","\f","\/"], "map_col": {"empty":"","quote":"\"","backslash":"\\","newline":"\n","tab":"\t","carriage":"\r","backspace":"\b","form":"\f","slash":"\/"}, "struct_col": {"name":"empty","des":""}}
{"id": 29, "test_name": "empty_map", "arr": null, "map_col": {"empty":"","quote":"\"","backslash":"\\","newline":"\n","tab":"\t","carriage":"\r","backspace":"\b","form":"\f","slash":"\/"}, "struct_col": null}
{"id": 30, "test_name": "empty_struct", "arr": null, "map_col": null, "struct_col": {"name":"empty","des":""}}
{"id": 31, "test_name": "usecase_test", "arr": ["标准单人间","特惠房</div><div class=\"hotel_","标准双人房","特价房"], "map_col": {"特惠房</div><div class=\"hotel_":"特惠房</div><div class=\"hotel_"}, "struct_col": {"name":"特惠房</div><div class=\"hotel_","des":"标准单人间"}}
Loading
Loading