diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 9a877f3726ca..4c7fb3264c33 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -1,5 +1,6 @@ #include "fulltext.h" -#include +#include +#include namespace NKikimr::NFulltext { @@ -45,35 +46,65 @@ namespace { return result; } - // Note: written by llm, can be optimized a lot later - TVector Tokenize(const TString& text, const Ydb::Table::FulltextIndexSettings::Tokenizer& tokenizer) { - TVector tokens; - switch (tokenizer) { - case Ydb::Table::FulltextIndexSettings::WHITESPACE: { - std::istringstream stream(text); - TString token; - while (stream >> token) { - tokens.push_back(token); + inline bool IsNonStandard(wchar32 c) { + return !IsAlphabetic(c) && !IsDecdigit(c); + } + + void Tokenize(const TString& text, TVector& tokens, auto isDelimiter) { + const unsigned char* ptr = (const unsigned char*)text.data(); + const unsigned char* end = ptr + text.size(); + + while (ptr < end) { + wchar32 symbol; + size_t symbolBytes = 0; + + while (ptr < end) { // skip delimiters + if (SafeReadUTF8Char(symbol, symbolBytes, ptr, end) != RECODE_OK) { + tokens.clear(); + return; + } + if (!isDelimiter(symbol)) { + break; } + ptr += symbolBytes; + } + if (ptr >= end) { break; } - case Ydb::Table::FulltextIndexSettings::STANDARD: { - std::regex word_regex(R"(\b\w+\b)"); // match alphanumeric words - std::sregex_iterator it(text.begin(), text.end(), word_regex); - std::sregex_iterator end; - while (it != end) { - tokens.push_back(it->str()); - ++it; + + const unsigned char* tokenPtr = ptr; + while (ptr < end) { // read token + if (SafeReadUTF8Char(symbol, symbolBytes, ptr, end) != RECODE_OK) { + tokens.clear(); + return; } - break; + if (isDelimiter(symbol)) { + break; + } + ptr += symbolBytes; } + tokens.emplace_back((const char*)tokenPtr, ptr - tokenPtr); + } + } + + + TVector Tokenize(const TString& text, const Ydb::Table::FulltextIndexSettings::Tokenizer& tokenizer) { + TVector tokens; + switch (tokenizer) { + case Ydb::Table::FulltextIndexSettings::WHITESPACE: + Tokenize(text, tokens, IsWhitespace); + break; + case Ydb::Table::FulltextIndexSettings::STANDARD: + Tokenize(text, tokens, IsNonStandard); + break; case Ydb::Table::FulltextIndexSettings::KEYWORD: - tokens.push_back(text); + if (UTF8Detect(text) != NotUTF8) { + tokens.push_back(text); + } break; default: Y_ENSURE(TStringBuilder() << "Invalid tokenizer: " << static_cast(tokenizer)); } - return tokens; } @@ -131,8 +162,8 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet TVector tokens = Tokenize(text, settings.tokenizer()); if (settings.use_filter_lowercase()) { - for (auto& token : tokens) { - token.to_lower(); + for (auto i : xrange(tokens.size())) { + tokens[i] = ToLowerUTF8(tokens[i]); } } diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index e7b2fff76499..2450d74e33a9 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -1,6 +1,7 @@ #include "fulltext.h" #include +#include namespace NKikimr::NFulltext { @@ -100,20 +101,68 @@ Y_UNIT_TEST_SUITE(NFulltext) { Y_UNIT_TEST(Analyze) { Ydb::Table::FulltextIndexSettings::Analyzers analyzers; - TString text = "apple WaLLet spaced-dog"; + TString text = "apple WaLLet spaced-dog_cat 0123,456@"; analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); - UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "WaLLet", "spaced-dog"})); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "WaLLet", "spaced-dog_cat", "0123,456@"})); analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD); - UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "WaLLet", "spaced", "dog"})); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "WaLLet", "spaced", "dog", "cat", "0123", "456"})); analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::KEYWORD); UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{text})); analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); analyzers.set_use_filter_lowercase(true); - UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "wallet", "spaced-dog"})); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "wallet", "spaced-dog_cat", "0123,456@"})); + } + + Y_UNIT_TEST(AnalyzeRu) { + Ydb::Table::FulltextIndexSettings::Analyzers analyzers; + TString text = "Привет, это test123 и слово Ёлка ёль!"; + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"Привет,", "это", "test123", "и", "слово", "Ёлка", "ёль!"})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"Привет", "это", "test123", "и", "слово", "Ёлка", "ёль"})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::KEYWORD); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{text})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD); + analyzers.set_use_filter_lowercase(true); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"привет", "это", "test123", "и", "слово", "ёлка", "ёль"})); + } + + Y_UNIT_TEST(AnalyzeInvalid) { + Ydb::Table::FulltextIndexSettings::Analyzers analyzers; + + TVector texts = { + "\xC2\x41", // Invalid continuation byte + "\xC0\x81", // Overlong encoding + "\x80", // Lone continuation byte + "\xF4\x90\x80\x80", // Outside Unicode range + "\xE3\x81", // Truncated (incomplete) + }; + + for (auto i : xrange(texts.size())) { + TString testCase = TStringBuilder() << "case #" << i; + auto& text = texts[i]; + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + UNIT_ASSERT_VALUES_EQUAL_C(Analyze(text, analyzers), (TVector{}), testCase); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD); + UNIT_ASSERT_VALUES_EQUAL_C(Analyze(text, analyzers), (TVector{}), testCase); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::KEYWORD); + UNIT_ASSERT_VALUES_EQUAL_C(Analyze(text, analyzers), (TVector{}), testCase); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::KEYWORD); + analyzers.set_use_filter_lowercase(true); + UNIT_ASSERT_VALUES_EQUAL_C(Analyze(text, analyzers), (TVector{}), testCase); + } } }