|
1 | 1 | #include "fulltext.h" |
| 2 | + |
| 3 | +#include <contrib/libs/snowball/include/libstemmer.h> |
| 4 | + |
2 | 5 | #include <util/charset/utf8.h> |
3 | 6 | #include <util/generic/xrange.h> |
4 | 7 |
|
@@ -172,7 +175,28 @@ namespace { |
172 | 175 | return false; |
173 | 176 | } |
174 | 177 |
|
175 | | - if (settings.has_language()) { |
| 178 | + if (settings.use_filter_snowball()) { |
| 179 | + if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { |
| 180 | + error = "cannot set use_filter_snowball with use_filter_ngam or use_filter_edge_ngram at the same time"; |
| 181 | + return false; |
| 182 | + } |
| 183 | + |
| 184 | + if (!settings.has_language()) { |
| 185 | + error = "language required when use_filter_snowball is set"; |
| 186 | + return false; |
| 187 | + } |
| 188 | + |
| 189 | + bool supportedLanguage = false; |
| 190 | + for (auto ptr = sb_stemmer_list(); *ptr != nullptr; ++ptr) { |
| 191 | + if (settings.language() == *ptr) { |
| 192 | + supportedLanguage = true; |
| 193 | + } |
| 194 | + } |
| 195 | + if (!supportedLanguage) { |
| 196 | + error = "language is not supported by snowball"; |
| 197 | + return false; |
| 198 | + } |
| 199 | + } else if (settings.has_language()) { |
176 | 200 | error = "Unsupported language setting"; |
177 | 201 | return false; |
178 | 202 | } |
@@ -268,6 +292,20 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet |
268 | 292 | }), tokens.end()); |
269 | 293 | } |
270 | 294 |
|
| 295 | + if (settings.use_filter_snowball()) { |
| 296 | + struct sb_stemmer* stemmer = sb_stemmer_new(settings.language().c_str(), nullptr); |
| 297 | + for (auto& token : tokens) { |
| 298 | + const sb_symbol* stemmed = sb_stemmer_stem( |
| 299 | + stemmer, |
| 300 | + reinterpret_cast<const sb_symbol*>(token.data()), |
| 301 | + token.size() |
| 302 | + ); |
| 303 | + |
| 304 | + const size_t resultLength = sb_stemmer_length(stemmer); |
| 305 | + token = std::string(reinterpret_cast<const char*>(stemmed), resultLength); |
| 306 | + } |
| 307 | + } |
| 308 | + |
271 | 309 | if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { |
272 | 310 | TVector<TString> ngrams; |
273 | 311 | for (const auto& token : tokens) { |
@@ -367,6 +405,8 @@ bool FillSetting(Ydb::Table::FulltextIndexSettings& settings, const TString& nam |
367 | 405 | analyzers->set_filter_length_min(ParseInt32(name, value, error)); |
368 | 406 | } else if (nameLower == "filter_length_max") { |
369 | 407 | analyzers->set_filter_length_max(ParseInt32(name, value, error)); |
| 408 | + } else if (nameLower == "use_filter_snowball") { |
| 409 | + analyzers->set_use_filter_snowball(ParseBool(name, value, error)); |
370 | 410 | } else { |
371 | 411 | error = TStringBuilder() << "Unknown index setting: " << name; |
372 | 412 | return false; |
|
0 commit comments