Skip to content

Commit 860b84d

Browse files
authored
Merge 3555e1f into d43cdee
2 parents d43cdee + 3555e1f commit 860b84d

File tree

7 files changed

+126
-5
lines changed

7 files changed

+126
-5
lines changed

ydb/core/base/fulltext.cpp

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,28 @@ namespace {
172172
return false;
173173
}
174174

175-
if (settings.has_language()) {
175+
// Validate the snowball (stemming) filter configuration:
//  - it cannot be combined with the ngram / edge_ngram filters;
//  - it requires `language` to be set;
//  - the language must be one of the algorithm names reported by sb_stemmer_list().
// A `language` given without use_filter_snowball is rejected.
if (settings.use_filter_snowball()) {
    if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
        error = "cannot set use_filter_snowball with use_filter_ngram or use_filter_edge_ngram at the same time";
        return false;
    }

    if (!settings.has_language()) {
        error = "language required when use_filter_snowball is set";
        return false;
    }

    // sb_stemmer_list() returns a nullptr-terminated array of C strings.
    bool supportedLanguage = false;
    for (auto ptr = sb_stemmer_list(); *ptr != nullptr; ++ptr) {
        if (settings.language() == *ptr) {
            supportedLanguage = true;
            break; // first match is enough, no need to scan the rest
        }
    }
    if (!supportedLanguage) {
        error = "language is not supported by snowball";
        return false;
    }
} else if (settings.has_language()) {
    // language is only accepted together with use_filter_snowball
    error = "Unsupported language setting";
    return false;
}
@@ -246,7 +267,7 @@ namespace {
246267
}
247268
}
248269

249-
TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings) {
270+
TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings, struct sb_stemmer* stemmer) {
250271
TVector<TString> tokens = Tokenize(text, settings.tokenizer());
251272

252273
if (settings.use_filter_lowercase()) {
@@ -268,6 +289,20 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet
268289
}), tokens.end());
269290
}
270291

292+
// Snowball filter: replace every token with the stem produced by libstemmer.
if (settings.use_filter_snowball()) {
    // `stemmer` defaults to nullptr in the declaration, and Y_ASSERT is compiled out of
    // release builds, so a missing stemmer used to be dereferenced. Fall back to a
    // temporary stemmer for settings.language(); callers on hot paths should still pass
    // a cached one to avoid re-creating it per call.
    TStemmerPtr fallbackStemmer;
    if (!stemmer) {
        fallbackStemmer.reset(sb_stemmer_new(settings.language().c_str(), nullptr));
        stemmer = fallbackStemmer.get();
    }
    Y_ABORT_UNLESS(stemmer, "no snowball stemmer for language '%s'", settings.language().c_str());

    for (auto& token : tokens) {
        const sb_symbol* stemmed = sb_stemmer_stem(
            stemmer,
            reinterpret_cast<const sb_symbol*>(token.data()),
            static_cast<int>(token.size())
        );
        // libstemmer signals an allocation failure by returning NULL.
        Y_ABORT_UNLESS(stemmed, "snowball stemmer ran out of memory");

        // The stem is stored in a buffer owned by the stemmer; copy it out before the next call.
        const size_t resultLength = sb_stemmer_length(stemmer);
        token = TString(reinterpret_cast<const char*>(stemmed), resultLength);
    }
}
305+
271306
if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
272307
TVector<TString> ngrams;
273308
for (const auto& token : tokens) {
@@ -367,6 +402,8 @@ bool FillSetting(Ydb::Table::FulltextIndexSettings& settings, const TString& nam
367402
analyzers->set_filter_length_min(ParseInt32(name, value, error));
368403
} else if (nameLower == "filter_length_max") {
369404
analyzers->set_filter_length_max(ParseInt32(name, value, error));
405+
} else if (nameLower == "use_filter_snowball") {
406+
analyzers->set_use_filter_snowball(ParseBool(name, value, error));
370407
} else {
371408
error = TStringBuilder() << "Unknown index setting: " << name;
372409
return false;

ydb/core/base/fulltext.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,18 @@
44

55
#include <ydb/public/api/protos/ydb_table.pb.h>
66

7+
#include <contrib/libs/snowball/include/libstemmer.h>
8+
79
namespace NKikimr::NFulltext {
810

9-
TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings);
11+
struct TStemmerDeleter {
12+
void operator()(struct sb_stemmer* stemmer) {
13+
sb_stemmer_delete(stemmer);
14+
}
15+
};
16+
using TStemmerPtr = std::unique_ptr<struct sb_stemmer, TStemmerDeleter>;
17+
18+
TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings, struct sb_stemmer* stemmer = nullptr);
1019

1120
bool ValidateColumnsMatches(const NProtoBuf::RepeatedPtrField<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error);
1221
bool ValidateColumnsMatches(const TVector<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error);

ydb/core/base/ut/fulltext_ut.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,27 @@ Y_UNIT_TEST_SUITE(NFulltext) {
270270
analyzers.set_filter_ngram_max_length(3);
271271
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "это", "те", "тек"}));
272272
}
273+
274+
// Checks that Analyze() with the snowball filter stems whitespace-separated tokens
// for the "russian" and "english" algorithms.
Y_UNIT_TEST(AnalyzeFilterSnowball) {
    Ydb::Table::FulltextIndexSettings::Analyzers analyzers;
    analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);
    analyzers.set_use_filter_snowball(true);

    {
        const TString text = "машины ездят по дорогам исправно";
        analyzers.set_language("russian");
        TStemmerPtr stemmer(sb_stemmer_new(analyzers.language().c_str(), nullptr));
        // sb_stemmer_new() returns nullptr for an unknown algorithm; fail with a message instead of crashing in Analyze().
        UNIT_ASSERT_C(stemmer != nullptr, "no snowball stemmer for " << analyzers.language());
        UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers, stemmer.get()), (TVector<TString>{"машин", "езд", "по", "дорог", "исправн"}));
    }

    {
        const TString text = "cars are driving properly on the roads";
        analyzers.set_language("english");
        TStemmerPtr stemmer(sb_stemmer_new(analyzers.language().c_str(), nullptr));
        UNIT_ASSERT_C(stemmer != nullptr, "no snowball stemmer for " << analyzers.language());
        UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers, stemmer.get()), (TVector<TString>{"car", "are", "drive", "proper", "on", "the", "road"}));
    }
}
273294
}
274295

275296
}

ydb/core/base/ya.make

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ SRCS(
8888
)
8989

9090
PEERDIR(
91+
contrib/libs/snowball
9192
ydb/library/actors/core
9293
ydb/library/actors/helpers
9394
ydb/library/actors/interconnect

ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,23 @@ void AddIndexCovered(NQuery::TQueryClient& db) {
9191
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
9292
}
9393

94+
// Test helper: adds the fulltext index `fulltext_idx` on column Text of `/Root/Texts`
// with layout=flat, the standard tokenizer, the lowercase filter and English snowball
// stemming, then asserts that the ALTER TABLE statement succeeded.
void AddIndexSnowball(NQuery::TQueryClient& db) {
95+
TString query = R"sql(
96+
ALTER TABLE `/Root/Texts` ADD INDEX fulltext_idx
97+
GLOBAL USING fulltext
98+
ON (Text)
99+
WITH (
100+
layout=flat,
101+
tokenizer=standard,
102+
use_filter_lowercase=true,
103+
use_filter_snowball=true,
104+
language=english
105+
)
106+
)sql";
107+
// Executed outside of a transaction (NoTx).
auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync();
108+
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
109+
}
110+
94111
TResultSet ReadIndex(NQuery::TQueryClient& db) {
95112
TString query = R"sql(
96113
SELECT * FROM `/Root/Texts/fulltext_idx/indexImplTable`;
@@ -224,6 +241,31 @@ Y_UNIT_TEST(AddIndexEdgeNGram) {
224241
])", NYdb::FormatResultSetYson(index));
225242
}
226243

244+
// Creates and fills the Texts table, adds the snowball-stemmed fulltext index and
// compares the full contents of the index implementation table with the expected YSON.
// NOTE(review): rows look like (document key, stemmed lowercase token) pairs — confirm against UpsertTexts.
Y_UNIT_TEST(AddIndexSnowball) {
245+
auto kikimr = Kikimr();
246+
auto db = kikimr.GetQueryClient();
247+
248+
CreateTexts(db);
249+
UpsertTexts(db);
250+
AddIndexSnowball(db);
251+
const auto index = ReadIndex(db);
252+
CompareYson(R"([
253+
[[[100u];"anim"];
254+
[[100u];"cat"];
255+
[[200u];"cat"];
256+
[[300u];"cat"];
257+
[[100u];"chase"];
258+
[[200u];"chase"];
259+
[[200u];"dog"];
260+
[[400u];"dog"];
261+
[[400u];"fox"];
262+
[[300u];"love"];
263+
[[400u];"love"];
264+
[[100u];"small"];
265+
[[200u];"small"]
266+
])", NYdb::FormatResultSetYson(index));
267+
}
268+
227269
Y_UNIT_TEST(InsertRow) {
228270
auto kikimr = Kikimr();
229271
auto db = kikimr.GetQueryClient();

ydb/core/tx/datashard/build_index/fulltext.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class TBuildFulltextIndexScan: public TActor<TBuildFulltextIndexScan>, public IA
3636
TTags ScanTags;
3737
TString TextColumn;
3838
Ydb::Table::FulltextIndexSettings::Analyzers TextAnalyzers;
39+
TStemmerPtr Stemmer;
3940

4041
TBatchRowsUploader Uploader;
4142
TBufferData* UploadBuf = nullptr;
@@ -65,6 +66,9 @@ class TBuildFulltextIndexScan: public TActor<TBuildFulltextIndexScan>, public IA
6566
Y_ENSURE(Request.settings().columns().size() == 1);
6667
TextColumn = Request.settings().columns().at(0).column();
6768
TextAnalyzers = Request.settings().columns().at(0).analyzers();
69+
// Create the stemmer once per scan; it is reused for every row passed to Analyze().
if (TextAnalyzers.use_filter_snowball()) {
    Stemmer.reset(sb_stemmer_new(TextAnalyzers.language().c_str(), nullptr));
    // sb_stemmer_new() returns nullptr when the language is unknown to snowball;
    // reject the request here instead of dereferencing a null stemmer per row.
    Y_ENSURE(Stemmer, "snowball stemmer is not available for language: " << TextAnalyzers.language());
}
6872

6973
auto tags = GetAllTags(table);
7074
auto types = GetAllTypes(table);
@@ -142,7 +146,7 @@ class TBuildFulltextIndexScan: public TActor<TBuildFulltextIndexScan>, public IA
142146
TVector<TCell> uploadValue(::Reserve(Request.GetDataColumns().size()));
143147

144148
TString text((*row).at(0).AsBuf());
145-
auto tokens = Analyze(text, TextAnalyzers);
149+
auto tokens = Analyze(text, TextAnalyzers, Stemmer.get());
146150
for (const auto& token : tokens) {
147151
uploadKey.clear();
148152
uploadKey.push_back(TCell(token));

ydb/public/api/protos/ydb_table.proto

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ message FulltextIndexSettings {
174174
// See Tokenizer enum
175175
optional Tokenizer tokenizer = 1;
176176

177-
// Language used for language-sensitive operations like stopword filtering
177+
// Language used for language-sensitive operations like stopword filtering and stemming
178178
// Example: language = "english"
179179
// By default is not specified and no language-specific logic is applied
180180
optional string language = 2;
@@ -229,6 +229,13 @@ message FulltextIndexSettings {
229229
// Maximum token length to keep (inclusive)
230230
// Must be used with use_filter_length
231231
optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];
232+
233+
// Whether to apply snowball stemming to each token
234+
// Must be used with language option
235+
// Example: language = "english"
236+
// Tokens: ["cars", "beautifully", "conspirated"]
237+
// Output: ["car", "beauti", "conspir"]
238+
optional bool use_filter_snowball = 140;
232239
}
233240

234241
// Represents text analyzers settings for a specific column

0 commit comments

Comments
 (0)