Skip to content

Commit b899103

Browse files
committed
Add base fulltext helpers (#24589)
1 parent 8a60628 commit b899103

File tree

11 files changed

+423
-29
lines changed

11 files changed

+423
-29
lines changed

ydb/core/base/fulltext.cpp

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
#include "fulltext.h"
2+
#include <regex>
3+
4+
namespace NKikimr::NFulltext {
5+
6+
namespace {
7+
8+
// Translates the textual "layout" index setting into its protobuf enum value.
// Only "flat" is recognized. For any other text a message is written to `error`
// and LAYOUT_UNSPECIFIED is returned; `error` is not touched on success.
Ydb::Table::FulltextIndexSettings::Layout ParseLayout(const TString& layout, TString& error) {
    using TSettings = Ydb::Table::FulltextIndexSettings;

    if (layout != "flat") {
        error = TStringBuilder() << "Invalid layout: " << layout;
        return TSettings::LAYOUT_UNSPECIFIED;
    }

    return TSettings::FLAT;
}
16+
17+
// Translates the textual "tokenizer" index setting into its protobuf enum value.
// Recognized names are "whitespace", "standard" and "keyword". For any other text
// a message is written to `error` and TOKENIZER_UNSPECIFIED is returned;
// `error` is not touched on success.
Ydb::Table::FulltextIndexSettings::Tokenizer ParseTokenizer(const TString& tokenizer, TString& error) {
    using TSettings = Ydb::Table::FulltextIndexSettings;

    if (tokenizer == "whitespace") {
        return TSettings::WHITESPACE;
    }
    if (tokenizer == "standard") {
        return TSettings::STANDARD;
    }
    if (tokenizer == "keyword") {
        return TSettings::KEYWORD;
    }

    error = TStringBuilder() << "Invalid tokenizer: " << tokenizer;
    return TSettings::TOKENIZER_UNSPECIFIED;
}
29+
30+
// Parses the value of the setting called `name` as a non-negative i32.
// If the text cannot be converted, or the converted number is negative,
// an "Invalid <name>: <value>" message is written to `error`; callers are
// expected to look at `error` before using the returned number.
i32 ParseInt32(const TString& name, const TString& value, TString& error) {
    i32 parsed = 0;
    const bool converted = TryFromString(value, parsed);

    // proto int32 fields with [(Ydb.value) = ">= 0"] annotation
    if (!converted || parsed < 0) {
        error = TStringBuilder() << "Invalid " << name << ": " << value;
    }

    return parsed;
}
37+
38+
bool ParseBool(const TString& name, const TString& value, TString& error) {
39+
bool result = false;
40+
if (!TryFromString(value, result)) {
41+
error = TStringBuilder() << "Invalid " << name << ": " << value;
42+
}
43+
return result;
44+
}
45+
46+
// Note: written by llm, can be optimized a lot later
47+
TVector<TString> Tokenize(const TString& text, const Ydb::Table::FulltextIndexSettings::Tokenizer& tokenizer) {
48+
TVector<TString> tokens;
49+
switch (tokenizer) {
50+
case Ydb::Table::FulltextIndexSettings::WHITESPACE: {
51+
std::istringstream stream(text);
52+
TString token;
53+
while (stream >> token) {
54+
tokens.push_back(token);
55+
}
56+
break;
57+
}
58+
case Ydb::Table::FulltextIndexSettings::STANDARD: {
59+
std::regex word_regex(R"(\b\w+\b)"); // match alphanumeric words
60+
std::sregex_iterator it(text.begin(), text.end(), word_regex);
61+
std::sregex_iterator end;
62+
while (it != end) {
63+
tokens.push_back(it->str());
64+
++it;
65+
}
66+
break;
67+
}
68+
case Ydb::Table::FulltextIndexSettings::KEYWORD:
69+
tokens.push_back(text);
70+
break;
71+
default:
72+
Y_ENSURE(TStringBuilder() << "Invalid tokenizer: " << static_cast<int>(tokenizer));
73+
}
74+
75+
return tokens;
76+
}
77+
78+
// Validates the analyzers of one indexed column.
// A tokenizer must be set to something other than TOKENIZER_UNSPECIFIED. The
// language, stopwords, ngram, edge-ngram and length-filter settings are all
// rejected as unsupported. The first failing check writes its message to
// `error` and the function returns false; on success it returns true and
// leaves `error` as it was.
bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) {
    // Records the failure reason and yields the "invalid" result in one step.
    const auto reject = [&error](const char* message) {
        error = message;
        return false;
    };

    const bool tokenizerChosen = settings.has_tokenizer()
        && settings.tokenizer() != Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED;
    if (!tokenizerChosen) {
        return reject("tokenizer should be set");
    }

    if (settings.has_language()) {
        return reject("Unsupported language setting");
    }

    if (settings.use_filter_stopwords()) {
        return reject("Unsupported use_filter_stopwords setting");
    }

    // ngram / edge-ngram filters
    if (settings.use_filter_ngram()) {
        return reject("Unsupported use_filter_ngram setting");
    }
    if (settings.use_filter_edge_ngram()) {
        return reject("Unsupported use_filter_edge_ngram setting");
    }
    if (settings.has_filter_ngram_min_length()) {
        return reject("Unsupported filter_ngram_min_length setting");
    }
    if (settings.has_filter_ngram_max_length()) {
        return reject("Unsupported filter_ngram_max_length setting");
    }

    // token length filter
    if (settings.use_filter_length()) {
        return reject("Unsupported use_filter_length setting");
    }
    if (settings.has_filter_length_min()) {
        return reject("Unsupported filter_length_min setting");
    }
    if (settings.has_filter_length_max()) {
        return reject("Unsupported filter_length_max setting");
    }

    return true;
}
126+
}
127+
128+
// Produces the index tokens for `text`: the text is split by the tokenizer
// selected in `settings`, and every token is lower-cased with
// TString::to_lower() when use_filter_lowercase is enabled.
TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings) {
    TVector<TString> result = Tokenize(text, settings.tokenizer());

    if (!settings.use_filter_lowercase()) {
        return result;
    }

    for (TString& token : result) {
        token.to_lower();
    }
    return result;
}
139+
140+
// Validates complete fulltext index settings.
// Requirements checked, in order:
//   - layout is set and is not LAYOUT_UNSPECIFIED;
//   - exactly one column settings entry is present;
//   - that entry has both a column name and analyzers, and the analyzers pass
//     the per-column validation.
// Returns false and writes the reason to `error` on the first violated
// requirement; returns true and resets `error` to an empty string otherwise.
bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error) {
    if (!settings.has_layout() || settings.layout() == Ydb::Table::FulltextIndexSettings::LAYOUT_UNSPECIFIED) {
        error = "layout should be set";
        return false;
    }

    if (settings.columns().size() != 1) {
        error = TStringBuilder() << "fulltext index should have single column settings"
            << " but have " << settings.columns().size() << " of them";
        return false;
    }

    // Bind by const reference: iterating by value would copy each column
    // settings message just to read a few fields from it.
    for (const auto& column : settings.columns()) {
        if (!column.has_column()) {
            error = "column should be set";
            return false;
        }
        if (!column.has_analyzers()) {
            error = "column analyzers should be set";
            return false;
        }
        if (!ValidateSettings(column.analyzers(), error)) {
            return false;
        }
    }

    error = "";
    return true;
}
169+
170+
// Builds fulltext index settings for the single indexed `column` from a list
// of textual (name, value) pairs.
// "layout" is stored on the index settings; every other recognized name is
// stored on the column's analyzers. Processing stops at the first unknown name
// or unparsable value, in which case `error` describes the problem and the
// partially filled settings are returned. Otherwise the assembled settings are
// validated, so `error` is empty on return only when the result is valid.
Ydb::Table::FulltextIndexSettings FillSettings(const TString& column, const TVector<std::pair<TString, TString>>& settings, TString& error) {
    // The loop below treats a non-empty `error` as "the setting just processed
    // failed", so leftover text from the caller must not leak into that check.
    error = "";

    Ydb::Table::FulltextIndexSettings result;
    Ydb::Table::FulltextIndexSettings::Analyzers resultAnalyzers;

    for (const auto& [name, value] : settings) {
        if (name == "layout") {
            result.set_layout(ParseLayout(value, error));
        } else if (name == "tokenizer") {
            resultAnalyzers.set_tokenizer(ParseTokenizer(value, error));
        } else if (name == "language") {
            resultAnalyzers.set_language(value);
        } else if (name == "use_filter_lowercase") {
            resultAnalyzers.set_use_filter_lowercase(ParseBool(name, value, error));
        } else if (name == "use_filter_stopwords") {
            resultAnalyzers.set_use_filter_stopwords(ParseBool(name, value, error));
        } else if (name == "use_filter_ngram") {
            resultAnalyzers.set_use_filter_ngram(ParseBool(name, value, error));
        } else if (name == "use_filter_edge_ngram") {
            resultAnalyzers.set_use_filter_edge_ngram(ParseBool(name, value, error));
        } else if (name == "filter_ngram_min_length") {
            resultAnalyzers.set_filter_ngram_min_length(ParseInt32(name, value, error));
        } else if (name == "filter_ngram_max_length") {
            resultAnalyzers.set_filter_ngram_max_length(ParseInt32(name, value, error));
        } else if (name == "use_filter_length") {
            resultAnalyzers.set_use_filter_length(ParseBool(name, value, error));
        } else if (name == "filter_length_min") {
            resultAnalyzers.set_filter_length_min(ParseInt32(name, value, error));
        } else if (name == "filter_length_max") {
            resultAnalyzers.set_filter_length_max(ParseInt32(name, value, error));
        } else {
            error = TStringBuilder() << "Unknown index setting: " << name;
            return result;
        }

        // A parser reports failure only through `error`.
        if (error) {
            return result;
        }
    }

    {
        // only single-columned index is supported for now
        auto columnAnalyzers = result.add_columns();
        columnAnalyzers->set_column(column);
        columnAnalyzers->mutable_analyzers()->CopyFrom(resultAnalyzers);
    }

    // The boolean result is not needed here: the outcome reaches the caller via `error`.
    ValidateSettings(result, error);

    return result;
}
220+
221+
222+
}

ydb/core/base/fulltext.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#pragma once
2+
3+
#include "defs.h"
4+
5+
#include <ydb/public/api/protos/ydb_table.pb.h>
6+
7+
// Helpers for fulltext indexes: text analysis plus construction and
// validation of Ydb::Table::FulltextIndexSettings.
namespace NKikimr::NFulltext {
8+
9+
// Splits `text` into tokens with the tokenizer selected in `settings` and
// lower-cases the tokens when use_filter_lowercase is enabled.
TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings);
10+
11+
// Returns true when `settings` are acceptable (and resets `error` to an empty
// string); otherwise returns false and writes the reason to `error`.
bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error);
12+
// Builds settings for the single indexed `column` from textual (name, value)
// pairs and validates them; `error` is non-empty when a pair is unknown or
// unparsable, or when the assembled settings fail validation.
Ydb::Table::FulltextIndexSettings FillSettings(const TString& column, const TVector<std::pair<TString, TString>>& values, TString& error);
13+
14+
}

ydb/core/base/kmeans_clusters.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,7 @@ bool ValidateSettings(const Ydb::Table::KMeansTreeSettings& settings, TString& e
493493
return false;
494494
}
495495

496+
error = "";
496497
return true;
497498
}
498499

@@ -525,6 +526,7 @@ bool ValidateSettings(const Ydb::Table::VectorIndexSettings& settings, TString&
525526
return false;
526527
}
527528

529+
error = "";
528530
return true;
529531
}
530532

ydb/core/base/ut/fulltext_ut.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#include "fulltext.h"
2+
3+
#include <library/cpp/testing/unittest/registar.h>
4+
5+
namespace NKikimr::NFulltext {
6+
7+
Y_UNIT_TEST_SUITE(NFulltext) {
8+
9+
// Walks ValidateSettings through each of its failure messages in order: the
// test starts from an empty settings message and, after every rejected call,
// fills in exactly the piece the previous error complained about, until the
// settings are finally accepted. The steps depend on each other, so their
// order matters.
Y_UNIT_TEST(ValidateSettings) {
10+
Ydb::Table::FulltextIndexSettings settings;
11+
TString error;
12+
13+
// Nothing is set yet: the missing layout is reported first.
UNIT_ASSERT(!ValidateSettings(settings, error));
14+
UNIT_ASSERT_VALUES_EQUAL(error, "layout should be set");
15+
settings.set_layout(Ydb::Table::FulltextIndexSettings::FLAT);
16+
17+
// Layout is set, but there are no column settings at all.
UNIT_ASSERT(!ValidateSettings(settings, error));
18+
UNIT_ASSERT_VALUES_EQUAL(error, "fulltext index should have single column settings but have 0 of them");
19+
auto columnSettings = settings.add_columns();
20+
21+
// One column entry exists, but it has no column name.
UNIT_ASSERT(!ValidateSettings(settings, error));
22+
UNIT_ASSERT_VALUES_EQUAL(error, "column should be set");
23+
columnSettings->set_column("text");
24+
25+
// The column is named, but it has no analyzers.
UNIT_ASSERT(!ValidateSettings(settings, error));
26+
UNIT_ASSERT_VALUES_EQUAL(error, "column analyzers should be set");
27+
auto columnAnalyzers = columnSettings->mutable_analyzers();
28+
29+
// Analyzers exist, but no tokenizer is chosen.
UNIT_ASSERT(!ValidateSettings(settings, error));
30+
UNIT_ASSERT_VALUES_EQUAL(error, "tokenizer should be set");
31+
columnAnalyzers->set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD);
32+
33+
// Everything required is present: validation passes and the error left by
// the previous failed call is cleared.
UNIT_ASSERT_C(ValidateSettings(settings, error), error);
34+
UNIT_ASSERT_VALUES_EQUAL(error, "");
35+
}
36+
37+
// A valid list of textual options must be converted without an error into
// settings with the FLAT layout and a single "text" column that uses the
// STANDARD tokenizer with the lowercase filter enabled.
Y_UNIT_TEST(FillSettings) {
    using TSettings = Ydb::Table::FulltextIndexSettings;

    const TVector<std::pair<TString, TString>> options{
        {"layout", "flat"},
        {"tokenizer", "standard"},
        {"use_filter_lowercase", "true"}
    };

    TString parseError;
    const TSettings filled = FillSettings("text", options, parseError);
    UNIT_ASSERT_VALUES_EQUAL(parseError, "");

    UNIT_ASSERT_EQUAL(filled.layout(), TSettings::FLAT);
    UNIT_ASSERT_VALUES_EQUAL(filled.columns().size(), 1);

    const auto& onlyColumn = filled.columns().at(0);
    UNIT_ASSERT_VALUES_EQUAL(onlyColumn.column(), "text");

    const auto& analyzers = onlyColumn.analyzers();
    UNIT_ASSERT_EQUAL(analyzers.tokenizer(), TSettings::STANDARD);
    UNIT_ASSERT_VALUES_EQUAL(analyzers.use_filter_lowercase(), true);
}
54+
55+
// Each invalid option list must produce its specific error message:
// an unknown option name, an unparsable boolean value, and settings that
// parse but fail validation because no tokenizer was given.
Y_UNIT_TEST(FillSettingsInvalid) {
    using TOptions = TVector<std::pair<TString, TString>>;

    // Runs FillSettings for the "text" column and hands back only the error text.
    const auto errorFor = [](const TOptions& options) {
        TString error;
        FillSettings("text", options, error);
        return error;
    };

    const TOptions unknownName{
        {"asdf", "qwer"}
    };
    UNIT_ASSERT_VALUES_EQUAL(errorFor(unknownName), "Unknown index setting: asdf");

    const TOptions badBoolean{
        {"layout", "flat"},
        {"tokenizer", "standard"},
        {"use_filter_lowercase", "asdf"}
    };
    UNIT_ASSERT_VALUES_EQUAL(errorFor(badBoolean), "Invalid use_filter_lowercase: asdf");

    const TOptions missingTokenizer{
        {"layout", "flat"},
    };
    UNIT_ASSERT_VALUES_EQUAL(errorFor(missingTokenizer), "tokenizer should be set");
}
85+
86+
// Checks the tokens produced for one sample text by each tokenizer, and by
// the whitespace tokenizer combined with the lowercase filter.
Y_UNIT_TEST(Analyze) {
    using TSettings = Ydb::Table::FulltextIndexSettings;

    const TString sample = "apple WaLLet spaced-dog";

    // Analyzes the sample with a freshly configured analyzer set.
    const auto tokensWith = [&sample](TSettings::Tokenizer tokenizer, bool lowercase) {
        TSettings::Analyzers analyzers;
        analyzers.set_tokenizer(tokenizer);
        if (lowercase) {
            analyzers.set_use_filter_lowercase(true);
        }
        return Analyze(sample, analyzers);
    };

    UNIT_ASSERT_VALUES_EQUAL(tokensWith(TSettings::WHITESPACE, false), (TVector<TString>{"apple", "WaLLet", "spaced-dog"}));

    UNIT_ASSERT_VALUES_EQUAL(tokensWith(TSettings::STANDARD, false), (TVector<TString>{"apple", "WaLLet", "spaced", "dog"}));

    UNIT_ASSERT_VALUES_EQUAL(tokensWith(TSettings::KEYWORD, false), (TVector<TString>{sample}));

    UNIT_ASSERT_VALUES_EQUAL(tokensWith(TSettings::WHITESPACE, true), (TVector<TString>{"apple", "wallet", "spaced-dog"}));
}
103+
}
104+
105+
}

ydb/core/base/ut/ya.make

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,14 @@ PEERDIR(
99
)
1010

1111
SRCS(
12-
path_ut.cpp
1312
blobstorage_grouptype_ut.cpp
13+
fulltext_ut.cpp
1414
localdb_ut.cpp
1515
logoblob_ut.cpp
1616
memory_stats_ut.cpp
17-
statestorage_ut.cpp
17+
path_ut.cpp
1818
statestorage_guardian_impl_ut.cpp
19+
statestorage_ut.cpp
1920
table_index_ut.cpp
2021
)
2122

ydb/core/base/ya.make

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ SRCS(
2828
feature_flags.h
2929
feature_flags_service.cpp
3030
feature_flags_service.h
31+
fulltext.cpp
32+
fulltext.h
3133
group_stat.cpp
3234
group_stat.h
3335
hive.h

ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,10 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr
152152
if (!context.SS->EnableFulltextIndex) {
153153
return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Fulltext index support is disabled")};
154154
}
155+
TString msg;
156+
if (!NKikimr::NFulltext::ValidateSettings(indexDescription.GetFulltextIndexDescription().GetSettings(), msg)) {
157+
return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, msg)};
158+
}
155159
break;
156160
}
157161
default:

0 commit comments

Comments
 (0)