Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 51 additions & 2 deletions be/src/vec/functions/function_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <ctype.h>
#include <math.h>
#include <re2/stringpiece.h>
#include <unicode/locid.h>
#include <unicode/schriter.h>
#include <unicode/uchar.h>
#include <unicode/unistr.h>
#include <unicode/ustream.h>

Expand Down Expand Up @@ -512,8 +514,22 @@ struct NameToInitcap {
struct InitcapImpl {
// Entry point for initcap over a whole string column.
//
// `data`/`offsets` describe the input strings; `res_data`/`res_offsets`
// receive the converted strings. The output offsets array is sized to one
// entry per input row here, then one of two helpers fills the result:
// impl_vectors_ascii when is_ascii() reports true for the entire input
// byte buffer, impl_vectors_utf8 otherwise. Always returns Status::OK().
static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
                     ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
    res_offsets.resize(offsets.size());

    // The check covers the full chars buffer, so one non-ASCII row sends
    // every row down the UTF-8 path.
    if (simd::VStringFunctions::is_ascii({data.data(), data.size()})) {
        impl_vectors_ascii(data, offsets, res_data, res_offsets);
        return Status::OK();
    }

    impl_vectors_utf8(data, offsets, res_data, res_offsets);
    return Status::OK();
}

static void impl_vectors_ascii(const ColumnString::Chars& data,
const ColumnString::Offsets& offsets,
ColumnString::Chars& res_data,
ColumnString::Offsets& res_offsets) {
size_t offset_size = offsets.size();
memcpy_small_allow_read_write_overflow15(
res_offsets.data(), offsets.data(),
offset_size * sizeof(ColumnString::Offsets::value_type));
Expand All @@ -538,7 +554,40 @@ struct InitcapImpl {

start_index = end_index;
}
return Status::OK();
}

// Row-by-row initcap for input that is not pure ASCII.
//
// Each row's bytes are converted with to_initcap_utf8 into a scratch string,
// which is then stored via StringOP::push_value_string at the row's index in
// `res_data`/`res_offsets`.
static void impl_vectors_utf8(const ColumnString::Chars& data,
                              const ColumnString::Offsets& offsets,
                              ColumnString::Chars& res_data,
                              ColumnString::Offsets& res_offsets) {
    // One scratch buffer for all rows; it is cleared before each conversion.
    std::string converted;
    const auto row_count = static_cast<int64_t>(offsets.size());
    for (int64_t row = 0; row < row_count; ++row) {
        // NOTE(review): for row == 0 this reads offsets[-1]; presumably the
        // offsets container guarantees a readable zero there -- confirm.
        const auto row_begin = offsets[row - 1];
        const char* begin = reinterpret_cast<const char*>(&data[row_begin]);
        uint32_t size = offsets[row] - row_begin;

        converted.clear();
        to_initcap_utf8(begin, size, converted);
        StringOP::push_value_string(converted, row, res_data, res_offsets);
    }
}

// Converts one UTF-8 string to "initcap" form and appends it to `result`.
//
// The bytes [data, data + size) are decoded into an ICU UnicodeString and
// lower-cased. Every alphanumeric code point (per u_isalnum) that starts the
// string or follows a non-alphanumeric code point is then upper-cased with
// u_toupper. All other code points are copied through unchanged.
//
// @param data    pointer to the UTF-8 bytes of the input string
// @param size    number of bytes at `data`
// @param result  output string; the converted UTF-8 text is appended to it
static void to_initcap_utf8(const char* data, uint32_t size, std::string& result) {
    icu::StringPiece sp;
    sp.set(data, size);
    icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp);
    // Use the root locale explicitly: the no-argument toLower() follows the
    // process default locale, which would make the result depend on the
    // server environment (e.g. Turkish dotless-i mappings).
    unicode_str.toLower(icu::Locale::getRoot());

    icu::UnicodeString output_str;
    bool need_capitalize = true;
    icu::StringCharacterIterator iter(unicode_str);
    // Iterate with hasNext() instead of comparing against
    // CharacterIterator::DONE: DONE is 0xFFFF, so a well-formed U+FFFF in
    // the input would otherwise end the loop early and truncate the output.
    for (iter.setToStart(); iter.hasNext();) {
        UChar32 ch = iter.next32PostInc();
        if (!u_isalnum(ch)) {
            // A separator: the next alphanumeric code point starts a new word.
            need_capitalize = true;
        } else if (need_capitalize) {
            ch = u_toupper(ch);
            need_capitalize = false;
        }
        output_str.append(ch);
    }
    output_str.toUTF8String(result);
}
};

Expand Down
2 changes: 2 additions & 0 deletions be/test/vec/function/function_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3103,6 +3103,8 @@ TEST(function_string_test, function_initcap) {
{{std::string("BC'S aaaaA'' 'S")}, std::string("Bc'S Aaaaa'' 'S")},
{{std::string("NULL")}, std::string("Null")},
{{Null()}, Null()},
{{std::string("GROSSE àstanbul , ÀÇAC123 ΣΟΦΟΣ")},
std::string("Grosse Àstanbul , Àçac123 Σοφος")},
{{std::string("HELLO, WORLD!")}, std::string("Hello, World!")},
{{std::string("HHHH+-1; asAAss__!")}, std::string("Hhhh+-1; Asaass__!")},
{{std::string("a,B,C,D")}, std::string("A,B,C,D")}};
Expand Down
3 changes: 3 additions & 0 deletions regression-test/data/function_p0/test_function_string.out
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@ a
www.facebook.com
www.google.com /test?name=abc&age=20

-- !sql --
Grosse Àstanbul , Àçac123 Σοφος

11 changes: 11 additions & 0 deletions regression-test/suites/function_p0/test_function_string.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,15 @@ suite("test_function_string") {
qt_sql """
select parse_url(url, 'HOST') as host, parse_url(url, 'FILE') as file from test_parse_url order by id;
"""


sql """
set DEBUG_SKIP_FOLD_CONSTANT = true;
"""
qt_sql """
select initcap('GROSSE àstanbul , ÀÇAC123 ΣΟΦΟΣ');
"""
sql """
set DEBUG_SKIP_FOLD_CONSTANT = false;
"""
}
Loading
Loading