1 change: 0 additions & 1 deletion be/src/olap/inverted_index_parser.cpp
@@ -17,7 +17,6 @@

#include "olap/inverted_index_parser.h"

#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
#include "util/string_util.h"

namespace doris {
1 change: 1 addition & 0 deletions be/src/olap/inverted_index_parser.h
@@ -85,6 +85,7 @@ const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO = "false";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";
const std::string INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace";

const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";
be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
@@ -17,10 +17,13 @@

#include "analysis_factory_mgr.h"

#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_factory.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
@@ -30,6 +33,10 @@ namespace doris::segment_v2::inverted_index {
void AnalysisFactoryMgr::initialise() {
static std::once_flag once_flag;
std::call_once(once_flag, [this]() {
// char_filter
registerFactory("char_replace",
[]() { return std::make_shared<CharReplaceCharFilterFactory>(); });

// tokenizer
registerFactory("standard", []() { return std::make_shared<StandardTokenizerFactory>(); });
registerFactory("keyword", []() { return std::make_shared<KeywordTokenizerFactory>(); });
@@ -38,6 +45,8 @@ void AnalysisFactoryMgr::initialise() {
[]() { return std::make_shared<EdgeNGramTokenizerFactory>(); });
registerFactory("char_group",
[]() { return std::make_shared<CharGroupTokenizerFactory>(); });
registerFactory("basic", []() { return std::make_shared<BasicTokenizerFactory>(); });
registerFactory("icu", []() { return std::make_shared<ICUTokenizerFactory>(); });

// token_filter
registerFactory("lowercase", []() { return std::make_shared<LowerCaseFilterFactory>(); });
@@ -75,4 +84,7 @@ template std::shared_ptr<TokenizerFactory> AnalysisFactoryMgr::create<TokenizerF
template std::shared_ptr<TokenFilterFactory> AnalysisFactoryMgr::create<TokenFilterFactory>(
const std::string&, const Settings&);

template std::shared_ptr<CharFilterFactory> AnalysisFactoryMgr::create<CharFilterFactory>(
const std::string&, const Settings&);

} // namespace doris::segment_v2::inverted_index
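
The hunk above registers the new "char_replace" char filter and instantiates AnalysisFactoryMgr::create<CharFilterFactory>. A minimal usage sketch follows (not part of this PR); the Settings construction is an assumption, and namespace qualifiers are elided for brevity.

// Hedged sketch: resolve the newly registered char filter factory by name.
Settings params;  // assumed default-constructible; real callers pass parsed index properties
auto char_filter_factory =
        AnalysisFactoryMgr::instance().create<CharFilterFactory>("char_replace", params);
// char_filter_factory->create(reader) is then expected to wrap a ReaderPtr with the
// char-replace filter, the same way CustomAnalyzer::init_reader chains its _char_filters.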
23 changes: 11 additions & 12 deletions be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -35,23 +35,22 @@
#include "olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/ik/IKAnalyzer.h"
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h"
#include "runtime/exec_env.h"
#include "runtime/index_policy/index_policy_mgr.h"
#include "util/runtime_profile.h"

namespace doris::segment_v2::inverted_index {
#include "common/compile_check_begin.h"

std::unique_ptr<lucene::util::Reader> InvertedIndexAnalyzer::create_reader(
CharFilterMap& char_filter_map) {
std::unique_ptr<lucene::util::Reader> reader =
std::make_unique<lucene::util::SStringReader<char>>();
ReaderPtr InvertedIndexAnalyzer::create_reader(CharFilterMap& char_filter_map) {
ReaderPtr reader = std::make_shared<lucene::util::SStringReader<char>>();
if (!char_filter_map.empty()) {
reader = std::unique_ptr<lucene::util::Reader>(CharFilterFactory::create(
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], reader.release(),
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
if (char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] ==
INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
reader = std::make_shared<CharReplaceCharFilter>(
reader, char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]);
}
}
return reader;
}
@@ -122,7 +121,7 @@ std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyz
}

std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer) {
ReaderPtr reader, lucene::analysis::Analyzer* analyzer) {
std::vector<TermInfo> analyse_result;

std::unique_ptr<lucene::analysis::TokenStream> token_stream(analyzer->tokenStream(L"", reader));
@@ -161,7 +160,7 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
inverted_index_ctx->analyzer = analyzer.get();
auto reader = create_reader(inverted_index_ctx->char_filter_map);
reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
return get_analyse_result(reader.get(), analyzer.get());
return get_analyse_result(reader, analyzer.get());
}

bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {
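
For orientation, a hedged sketch of the new create_reader path in this file (not part of the diff); the pattern and replacement values are illustrative only, and namespace qualifiers are elided.

// Build a char-replace reader the way create_reader now does when the map
// requests INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE.
CharFilterMap char_filter_map;
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] = INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE;
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN] = "._";     // characters to replace (illustrative)
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT] = " ";  // replacement (illustrative)

ReaderPtr reader = InvertedIndexAnalyzer::create_reader(char_filter_map);
std::string search_str = "a.b_c";
reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
// With an empty map the reader stays a plain SStringReader<char>; otherwise it is a
// CharReplaceCharFilter wrapping one, and can be passed to get_analyse_result.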
be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
@@ -23,6 +23,7 @@
#include "olap/inverted_index_parser.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/inverted_index/query/query.h"
#include "olap/rowset/segment_v2/inverted_index/util/reader.h"
#include "olap/rowset/segment_v2/inverted_index_query_type.h"

namespace lucene {
@@ -38,12 +39,12 @@ namespace doris::segment_v2::inverted_index {

class InvertedIndexAnalyzer {
public:
static std::unique_ptr<lucene::util::Reader> create_reader(CharFilterMap& char_filter_map);
static ReaderPtr create_reader(CharFilterMap& char_filter_map);

static std::shared_ptr<lucene::analysis::Analyzer> create_analyzer(
const InvertedIndexCtx* inverted_index_ctx);

static std::vector<TermInfo> get_analyse_result(lucene::util::Reader* reader,
static std::vector<TermInfo> get_analyse_result(ReaderPtr reader,
lucene::analysis::Analyzer* analyzer);

static std::vector<TermInfo> get_analyse_result(
be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h
@@ -17,9 +17,9 @@

#pragma once

#include <memory>

#include "basic_tokenizer.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter.h"
#include "olap/rowset/segment_v2/inverted_index/token_stream.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.h"

namespace doris::segment_v2 {

@@ -35,22 +35,47 @@ class BasicAnalyzer : public Analyzer {
bool isSDocOpt() override { return true; }

TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override {
auto* tokenizer = _CLNEW BasicTokenizer(_lowercase, _ownReader);
tokenizer->reset(reader);
return (TokenStream*)tokenizer;
throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED,
"BasicAnalyzer::tokenStream not supported");
}

TokenStream* reusableTokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
if (_tokenizer == nullptr) {
_tokenizer = std::make_unique<BasicTokenizer>(_lowercase, _ownReader);
throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED,
"BasicAnalyzer::reusableTokenStream not supported");
}

TokenStream* tokenStream(const TCHAR* fieldName,
const inverted_index::ReaderPtr& reader) override {
auto token_stream = create_components();
token_stream->set_reader(reader);
token_stream->get_token_stream()->reset();
return new inverted_index::TokenStreamWrapper(token_stream->get_token_stream());
}

TokenStream* reusableTokenStream(const TCHAR* fieldName,
const inverted_index::ReaderPtr& reader) override {
if (_reuse_token_stream == nullptr) {
_reuse_token_stream = create_components();
}
_tokenizer->reset(reader);
return (TokenStream*)_tokenizer.get();
_reuse_token_stream->set_reader(reader);
return _reuse_token_stream->get_token_stream().get();
};

private:
std::unique_ptr<BasicTokenizer> _tokenizer;
inverted_index::TokenStreamComponentsPtr create_components() {
auto tk = std::make_shared<inverted_index::BasicTokenizer>();
tk->initialize(inverted_index::BasicTokenizerMode::L1);
inverted_index::TokenStreamPtr ts = tk;
if (_lowercase) {
auto lower_case_filter = std::make_shared<inverted_index::LowerCaseFilter>(tk);
lower_case_filter->initialize();
ts = lower_case_filter;
}
return std::make_shared<inverted_index::TokenStreamComponents>(tk, ts);
}

inverted_index::TokenStreamComponentsPtr _reuse_token_stream;
};

} // namespace doris::segment_v2
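
A hedged usage sketch of the new ReaderPtr-based overload in BasicAnalyzer (not part of the diff); the legacy lucene::util::Reader* overloads now throw. Construction details are assumptions, the reader setup mirrors analyzer.cpp, and namespace qualifiers are elided.

auto analyzer = std::make_shared<BasicAnalyzer>();  // constructor arguments, if any, are assumed defaulted
ReaderPtr reader = std::make_shared<lucene::util::SStringReader<char>>();
std::string text = "Hello World";
reader->init(text.data(), static_cast<int32_t>(text.size()), true);
std::unique_ptr<lucene::analysis::TokenStream> token_stream(analyzer->tokenStream(L"", reader));
// The returned TokenStreamWrapper owns the stream built by create_components()
// (BasicTokenizer, optionally followed by LowerCaseFilter) and is iterated as before.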
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp
@@ -17,44 +17,54 @@

#include "custom_analyzer.h"

#include "common/status.h"
#include "olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.h"
#include "olap/rowset/segment_v2/inverted_index/token_stream.h"
#include "runtime/exec_env.h"

namespace doris::segment_v2::inverted_index {

CustomAnalyzer::CustomAnalyzer(Builder* builder) {
_tokenizer = builder->_tokenizer;
_char_filters = builder->_char_filters;
_token_filters = builder->_token_filters;
}

TokenStream* CustomAnalyzer::tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) {
class TokenStreamWrapper : public TokenStream {
public:
explicit TokenStreamWrapper(std::shared_ptr<TokenStream> ts) : _impl(std::move(ts)) {}
~TokenStreamWrapper() override = default;

Token* next(Token* token) override { return _impl->next(token); }
void close() override { _impl->close(); }
void reset() override { _impl->reset(); }

private:
std::shared_ptr<TokenStream> _impl;
};
throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED,
"CustomAnalyzer::tokenStream not supported");
}

TokenStream* CustomAnalyzer::reusableTokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) {
throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED,
"CustomAnalyzer::reusableTokenStream not supported");
}

TokenStream* CustomAnalyzer::tokenStream(const TCHAR* fieldName, const ReaderPtr& reader) {
auto r = init_reader(reader);
auto token_stream = create_components();
token_stream->set_reader(reader);
token_stream->set_reader(r);
token_stream->get_token_stream()->reset();
return new TokenStreamWrapper(token_stream->get_token_stream());
}

TokenStream* CustomAnalyzer::reusableTokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) {
TokenStream* CustomAnalyzer::reusableTokenStream(const TCHAR* fieldName, const ReaderPtr& reader) {
auto r = init_reader(reader);
if (_reuse_token_stream == nullptr) {
_reuse_token_stream = create_components();
}
_reuse_token_stream->set_reader(reader);
_reuse_token_stream->set_reader(r);
return _reuse_token_stream->get_token_stream().get();
}

ReaderPtr CustomAnalyzer::init_reader(ReaderPtr reader) {
for (const auto& filter : _char_filters) {
reader = filter->create(reader);
}
return reader;
}

TokenStreamComponentsPtr CustomAnalyzer::create_components() {
auto tk = _tokenizer->create();
TokenStreamPtr ts = tk;
@@ -69,6 +79,9 @@ CustomAnalyzerPtr CustomAnalyzer::build_custom_analyzer(const CustomAnalyzerConf
throw Exception(ErrorCode::ILLEGAL_STATE, "Null configuration detected.");
}
CustomAnalyzer::Builder builder;
for (const auto& filter_config : config->get_char_filter_configs()) {
builder.add_char_filter(filter_config->get_name(), filter_config->get_params());
}
builder.with_tokenizer(config->get_tokenizer_config()->get_name(),
config->get_tokenizer_config()->get_params());
for (const auto& filter_config : config->get_token_filter_configs()) {
@@ -81,6 +94,10 @@ void CustomAnalyzer::Builder::with_tokenizer(const std::string& name, const Sett
_tokenizer = AnalysisFactoryMgr::instance().create<TokenizerFactory>(name, params);
}

void CustomAnalyzer::Builder::add_char_filter(const std::string& name, const Settings& params) {
_char_filters.push_back(AnalysisFactoryMgr::instance().create<CharFilterFactory>(name, params));
}

void CustomAnalyzer::Builder::add_token_filter(const std::string& name, const Settings& params) {
_token_filters.push_back(
AnalysisFactoryMgr::instance().create<TokenFilterFactory>(name, params));
@@ -93,7 +110,7 @@ CustomAnalyzerPtr CustomAnalyzer::Builder::build() {
return std::make_shared<CustomAnalyzer>(this);
}

void TokenStreamComponents::set_reader(CL_NS(util)::Reader* reader) {
void TokenStreamComponents::set_reader(const ReaderPtr& reader) {
_source->set_reader(reader);
}

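
A hedged sketch of wiring a char filter into a CustomAnalyzer through the Builder (not part of the diff), mirroring build_custom_analyzer above; the factory names come from this PR's registrations, but the empty-Settings construction is an assumption and namespace qualifiers are elided.

CustomAnalyzer::Builder builder;
builder.add_char_filter("char_replace", Settings {});  // resolved through AnalysisFactoryMgr
builder.with_tokenizer("standard", Settings {});
builder.add_token_filter("lowercase", Settings {});
CustomAnalyzerPtr analyzer = builder.build();
// tokenStream()/reusableTokenStream() now pass the reader through init_reader(),
// which chains each registered char filter before the tokenizer consumes it.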
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h
@@ -17,18 +17,14 @@

#pragma once

#include "common/exception.h"
#include "olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h"
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/setting.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/token_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"

namespace doris::segment_v2::inverted_index {

class TokenStreamComponents;
using TokenStreamComponentsPtr = std::shared_ptr<TokenStreamComponents>;

class CustomAnalyzer;
using CustomAnalyzerPtr = std::shared_ptr<CustomAnalyzer>;

@@ -40,11 +36,14 @@ class CustomAnalyzer : public Analyzer {
~Builder() = default;

void with_tokenizer(const std::string& name, const Settings& params);
void add_char_filter(const std::string& name, const Settings& params);
void add_token_filter(const std::string& name, const Settings& params);

CustomAnalyzerPtr build();

private:
TokenizerFactoryPtr _tokenizer;
std::vector<CharFilterFactoryPtr> _char_filters;
std::vector<TokenFilterFactoryPtr> _token_filters;

friend class CustomAnalyzer;
@@ -58,29 +57,20 @@ class CustomAnalyzer : public Analyzer {
TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override;
TokenStream* reusableTokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override;

TokenStream* tokenStream(const TCHAR* fieldName, const ReaderPtr& reader) override;
TokenStream* reusableTokenStream(const TCHAR* fieldName, const ReaderPtr& reader) override;

static CustomAnalyzerPtr build_custom_analyzer(const CustomAnalyzerConfigPtr& config);

private:
ReaderPtr init_reader(ReaderPtr reader);
TokenStreamComponentsPtr create_components();

TokenizerFactoryPtr _tokenizer;
std::vector<CharFilterFactoryPtr> _char_filters;
std::vector<TokenFilterFactoryPtr> _token_filters;

TokenStreamComponentsPtr _reuse_token_stream;
};

class TokenStreamComponents {
public:
TokenStreamComponents(TokenizerPtr tokenizer, TokenStreamPtr result)
: _source(std::move(tokenizer)), _sink(std::move(result)) {}

void set_reader(CL_NS(util)::Reader* reader);
TokenStreamPtr get_token_stream();
TokenizerPtr get_source();

private:
TokenizerPtr _source;
TokenStreamPtr _sink;
};

} // namespace doris::segment_v2::inverted_index