Skip to content

Commit 1f82ea2

Browse files
authored
Merge 49ed125 into b21a4b0
2 parents b21a4b0 + 49ed125 commit 1f82ea2

File tree

5 files changed

+104
-1
lines changed

5 files changed

+104
-1
lines changed

ydb/core/base/fulltext.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#include "fulltext.h"
2+
3+
#include <contrib/libs/snowball/include/libstemmer.h>
4+
25
#include <util/charset/utf8.h>
36
#include <util/generic/xrange.h>
47

@@ -172,7 +175,28 @@ namespace {
172175
return false;
173176
}
174177

175-
if (settings.has_language()) {
178+
// Validate the snowball (stemming) filter settings:
//  - stemming cannot be combined with the ngram / edge-ngram filters;
//  - a language must be given, and it must be one of the algorithm names
//    reported by sb_stemmer_list() (a NULL-terminated array of C strings).
// When the snowball filter is off, a language setting is still rejected.
if (settings.use_filter_snowball()) {
    if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
        error = "cannot set use_filter_snowball with use_filter_ngram or use_filter_edge_ngram at the same time";
        return false;
    }

    if (!settings.has_language()) {
        error = "language required when use_filter_snowball is set";
        return false;
    }

    bool supportedLanguage = false;
    for (auto ptr = sb_stemmer_list(); *ptr != nullptr; ++ptr) {
        if (settings.language() == *ptr) {
            supportedLanguage = true;
            break; // first match is enough, no need to scan the rest of the list
        }
    }
    if (!supportedLanguage) {
        error = "language is not supported by snowball";
        return false;
    }
} else if (settings.has_language()) {
    error = "Unsupported language setting";
    return false;
}
@@ -268,6 +292,20 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet
268292
}), tokens.end());
269293
}
270294

295+
// Snowball filter: replace every token with its stem for settings.language().
if (settings.use_filter_snowball()) {
    // Scope guard: sb_stemmer_new() allocates the stemmer and it has to be
    // released with sb_stemmer_delete(), including when a token assignment throws.
    struct TStemmerGuard {
        explicit TStemmerGuard(struct sb_stemmer* stemmer)
            : Stemmer(stemmer)
        {
        }

        ~TStemmerGuard() {
            if (Stemmer != nullptr) {
                sb_stemmer_delete(Stemmer);
            }
        }

        TStemmerGuard(const TStemmerGuard&) = delete;
        TStemmerGuard& operator=(const TStemmerGuard&) = delete;

        struct sb_stemmer* const Stemmer;
    };

    // A nullptr encoding means the input is treated as UTF-8.
    const TStemmerGuard guard(sb_stemmer_new(settings.language().c_str(), nullptr));

    // sb_stemmer_new() returns NULL for an unknown language or on out-of-memory.
    // Settings validation rejects unknown languages, but this code can be reached
    // with unvalidated settings, so tokens are left unstemmed instead of
    // dereferencing a null stemmer.
    if (guard.Stemmer != nullptr) {
        for (auto& token : tokens) {
            const sb_symbol* stemmed = sb_stemmer_stem(
                guard.Stemmer,
                reinterpret_cast<const sb_symbol*>(token.data()),
                static_cast<int>(token.size())
            );
            if (stemmed == nullptr) {
                // Out-of-memory inside the stemmer: keep the original token.
                continue;
            }

            // The returned buffer is owned by the stemmer and is only valid until
            // the next sb_stemmer_stem() call, so it is copied right away.
            const size_t resultLength = sb_stemmer_length(guard.Stemmer);
            token = std::string(reinterpret_cast<const char*>(stemmed), resultLength);
        }
    }
}
308+
271309
if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
272310
TVector<TString> ngrams;
273311
for (const auto& token : tokens) {
@@ -367,6 +405,8 @@ bool FillSetting(Ydb::Table::FulltextIndexSettings& settings, const TString& nam
367405
analyzers->set_filter_length_min(ParseInt32(name, value, error));
368406
} else if (nameLower == "filter_length_max") {
369407
analyzers->set_filter_length_max(ParseInt32(name, value, error));
408+
} else if (nameLower == "use_filter_snowball") {
409+
analyzers->set_use_filter_snowball(ParseBool(name, value, error));
370410
} else {
371411
error = TStringBuilder() << "Unknown index setting: " << name;
372412
return false;

ydb/core/base/ut/fulltext_ut.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,22 @@ Y_UNIT_TEST_SUITE(NFulltext) {
270270
analyzers.set_filter_ngram_max_length(3);
271271
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "это", "те", "тек"}));
272272
}
273+
274+
// Checks the snowball filter of Analyze() with the whitespace tokenizer:
// tokens pass through unchanged while the filter is off and are replaced by
// their stems once it is enabled for Russian and then for English.
Y_UNIT_TEST(AnalyzeFilterSnowball) {
    using TTokens = TVector<TString>;

    Ydb::Table::FulltextIndexSettings::Analyzers cfg;
    cfg.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);

    // Baseline: no snowball filter, words are returned as written.
    const TString ruSentence = "машины ездят по дорогам исправно";
    const TTokens ruPlain{"машины", "ездят", "по", "дорогам", "исправно"};
    UNIT_ASSERT_VALUES_EQUAL(Analyze(ruSentence, cfg), ruPlain);

    // Russian stemming.
    cfg.set_use_filter_snowball(true);
    cfg.set_language("russian");
    const TTokens ruStems{"машин", "езд", "по", "дорог", "исправн"};
    UNIT_ASSERT_VALUES_EQUAL(Analyze(ruSentence, cfg), ruStems);

    // English stemming, reusing the same analyzers with another language.
    const TString enSentence = "cars are driving properly on the roads";
    cfg.set_language("english");
    const TTokens enStems{"car", "are", "drive", "proper", "on", "the", "road"};
    UNIT_ASSERT_VALUES_EQUAL(Analyze(enSentence, cfg), enStems);
}
273289
}
274290

275291
}

ydb/core/base/ya.make

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ SRCS(
8888
)
8989

9090
PEERDIR(
91+
contrib/libs/snowball
9192
ydb/library/actors/core
9293
ydb/library/actors/helpers
9394
ydb/library/actors/interconnect

ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,23 @@ void AddIndexCovered(NQuery::TQueryClient& db) {
9191
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
9292
}
9393

94+
// Test helper: adds the global fulltext index `fulltext_idx` on column Text of
// `/Root/Texts` with the standard tokenizer, the lowercase filter and the
// snowball filter (language=english), and asserts that the statement succeeds.
void AddIndexSnowball(NQuery::TQueryClient& db) {
95+
TString query = R"sql(
96+
ALTER TABLE `/Root/Texts` ADD INDEX fulltext_idx
97+
GLOBAL USING fulltext
98+
ON (Text)
99+
WITH (
100+
layout=flat,
101+
tokenizer=standard,
102+
use_filter_lowercase=true,
103+
use_filter_snowball=true,
104+
language=english
105+
)
106+
)sql";
107+
// The ALTER TABLE statement is executed with NoTx transaction control.
auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync();
108+
// On failure the assertion message carries the issues returned by the query.
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
109+
}
110+
94111
TResultSet ReadIndex(NQuery::TQueryClient& db) {
95112
TString query = R"sql(
96113
SELECT * FROM `/Root/Texts/fulltext_idx/indexImplTable`;
@@ -224,6 +241,31 @@ Y_UNIT_TEST(AddIndexEdgeNGram) {
224241
])", NYdb::FormatResultSetYson(index));
225242
}
226243

244+
// Creates and fills the texts table, adds the snowball-filtered fulltext index
// and compares the rows returned by ReadIndex() with the expected Yson.
// The expected tokens look like lowercase English stems (e.g. "anim", "chase");
// the source texts come from UpsertTexts(), which is not visible here.
Y_UNIT_TEST(AddIndexSnowball) {
245+
auto kikimr = Kikimr();
246+
auto db = kikimr.GetQueryClient();
247+
248+
CreateTexts(db);
249+
UpsertTexts(db);
250+
AddIndexSnowball(db);
251+
// Each expected row below is a (key, token) pair.
const auto index = ReadIndex(db);
252+
CompareYson(R"([
253+
[[[100u];"anim"];
254+
[[100u];"cat"];
255+
[[200u];"cat"];
256+
[[300u];"cat"];
257+
[[100u];"chase"];
258+
[[200u];"chase"];
259+
[[200u];"dog"];
260+
[[400u];"dog"];
261+
[[400u];"fox"];
262+
[[300u];"love"];
263+
[[400u];"love"];
264+
[[100u];"small"];
265+
[[200u];"small"]
266+
])", NYdb::FormatResultSetYson(index));
267+
}
268+
227269
Y_UNIT_TEST(InsertRow) {
228270
auto kikimr = Kikimr();
229271
auto db = kikimr.GetQueryClient();

ydb/public/api/protos/ydb_table.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,10 @@ message FulltextIndexSettings {
229229
// Maximum token length to keep (inclusive)
230230
// Must be used with use_filter_length
231231
optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];
232+
233+
// Whether to apply stemming (Snowball) to each token
234+
// Must be used with language; cannot be combined with use_filter_ngram or use_filter_edge_ngram
235+
optional bool use_filter_snowball = 140;
232236
}
233237

234238
// Represents text analyzers settings for a specific column

0 commit comments

Comments
 (0)