diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 1d892b9357dc34..7c89f02c51e089 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -17,7 +17,6 @@ #include "olap/inverted_index_parser.h" -#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" #include "util/string_util.h" namespace doris { diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index a276d7de4f32ab..afd6e6619a3e1d 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -85,6 +85,7 @@ const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO = "false"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement"; +const std::string INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace"; const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above"; const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256"; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp index 51585e5580b11d..af6442525888f2 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp @@ -17,10 +17,13 @@ #include "analysis_factory_mgr.h" +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h" +#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h" #include "olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_factory.h" #include "olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h" #include "olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h" #include "olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h" @@ -30,6 +33,10 @@ namespace doris::segment_v2::inverted_index { void AnalysisFactoryMgr::initialise() { static std::once_flag once_flag; std::call_once(once_flag, [this]() { + // char_filter + registerFactory("char_replace", + []() { return std::make_shared(); }); + // tokenizer registerFactory("standard", []() { return std::make_shared(); }); registerFactory("keyword", []() { return std::make_shared(); }); @@ -38,6 +45,8 @@ void AnalysisFactoryMgr::initialise() { []() { return std::make_shared(); }); registerFactory("char_group", []() { return std::make_shared(); }); + registerFactory("basic", []() { return std::make_shared(); }); + registerFactory("icu", []() { return std::make_shared(); }); // token_filter registerFactory("lowercase", []() { return std::make_shared(); }); @@ -75,4 +84,7 @@ template std::shared_ptr AnalysisFactoryMgr::create AnalysisFactoryMgr::create( const std::string&, const Settings&); +template std::shared_ptr AnalysisFactoryMgr::create( + const std::string&, const Settings&); + } // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp index f37b33410a33b2..7167bbd63eaa86 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp @@ -35,23 
+35,22 @@ #include "olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h" #include "olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h" #include "olap/rowset/segment_v2/inverted_index/analyzer/ik/IKAnalyzer.h" -#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h" #include "runtime/exec_env.h" #include "runtime/index_policy/index_policy_mgr.h" -#include "util/runtime_profile.h" namespace doris::segment_v2::inverted_index { #include "common/compile_check_begin.h" -std::unique_ptr InvertedIndexAnalyzer::create_reader( - CharFilterMap& char_filter_map) { - std::unique_ptr reader = - std::make_unique>(); +ReaderPtr InvertedIndexAnalyzer::create_reader(CharFilterMap& char_filter_map) { + ReaderPtr reader = std::make_shared>(); if (!char_filter_map.empty()) { - reader = std::unique_ptr(CharFilterFactory::create( - char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], reader.release(), - char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN], - char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT])); + if (char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] == + INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) { + reader = std::make_shared( + reader, char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN], + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]); + } } return reader; } @@ -122,7 +121,7 @@ std::shared_ptr InvertedIndexAnalyzer::create_analyz } std::vector InvertedIndexAnalyzer::get_analyse_result( - lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer) { + ReaderPtr reader, lucene::analysis::Analyzer* analyzer) { std::vector analyse_result; std::unique_ptr token_stream(analyzer->tokenStream(L"", reader)); @@ -161,7 +160,7 @@ std::vector InvertedIndexAnalyzer::get_analyse_result( inverted_index_ctx->analyzer = analyzer.get(); auto reader = 
create_reader(inverted_index_ctx->char_filter_map); reader->init(search_str.data(), static_cast(search_str.size()), true); - return get_analyse_result(reader.get(), analyzer.get()); + return get_analyse_result(reader, analyzer.get()); } bool InvertedIndexAnalyzer::should_analyzer(const std::map& properties) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h index 682c7cd9b52848..464d8df02cd959 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h @@ -23,6 +23,7 @@ #include "olap/inverted_index_parser.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/inverted_index/query/query.h" +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" #include "olap/rowset/segment_v2/inverted_index_query_type.h" namespace lucene { @@ -38,12 +39,12 @@ namespace doris::segment_v2::inverted_index { class InvertedIndexAnalyzer { public: - static std::unique_ptr create_reader(CharFilterMap& char_filter_map); + static ReaderPtr create_reader(CharFilterMap& char_filter_map); static std::shared_ptr create_analyzer( const InvertedIndexCtx* inverted_index_ctx); - static std::vector get_analyse_result(lucene::util::Reader* reader, + static std::vector get_analyse_result(ReaderPtr reader, lucene::analysis::Analyzer* analyzer); static std::vector get_analyse_result( diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h index b9f4f96366600b..a080d6294c43b5 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h @@ -17,9 +17,9 @@ #pragma once -#include - -#include "basic_tokenizer.h" +#include 
"olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter.h" +#include "olap/rowset/segment_v2/inverted_index/token_stream.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.h" namespace doris::segment_v2 { @@ -35,22 +35,47 @@ class BasicAnalyzer : public Analyzer { bool isSDocOpt() override { return true; } TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - auto* tokenizer = _CLNEW BasicTokenizer(_lowercase, _ownReader); - tokenizer->reset(reader); - return (TokenStream*)tokenizer; + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, + "BasicAnalyzer::tokenStream not supported"); } TokenStream* reusableTokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - if (_tokenizer == nullptr) { - _tokenizer = std::make_unique(_lowercase, _ownReader); + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, + "BasicAnalyzer::reusableTokenStream not supported"); + } + + TokenStream* tokenStream(const TCHAR* fieldName, + const inverted_index::ReaderPtr& reader) override { + auto token_stream = create_components(); + token_stream->set_reader(reader); + token_stream->get_token_stream()->reset(); + return new inverted_index::TokenStreamWrapper(token_stream->get_token_stream()); + } + + TokenStream* reusableTokenStream(const TCHAR* fieldName, + const inverted_index::ReaderPtr& reader) override { + if (_reuse_token_stream == nullptr) { + _reuse_token_stream = create_components(); } - _tokenizer->reset(reader); - return (TokenStream*)_tokenizer.get(); + _reuse_token_stream->set_reader(reader); + return _reuse_token_stream->get_token_stream().get(); }; private: - std::unique_ptr _tokenizer; + inverted_index::TokenStreamComponentsPtr create_components() { + auto tk = std::make_shared(); + tk->initialize(inverted_index::BasicTokenizerMode::L1); + inverted_index::TokenStreamPtr ts = tk; + if (_lowercase) { + auto lower_case_filter = std::make_shared(tk); + 
lower_case_filter->initialize(); + ts = lower_case_filter; + } + return std::make_shared(tk, ts); + } + + inverted_index::TokenStreamComponentsPtr _reuse_token_stream; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp index c90b71a2e36d2f..312abd523374e6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp @@ -17,44 +17,54 @@ #include "custom_analyzer.h" +#include "common/status.h" #include "olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.h" +#include "olap/rowset/segment_v2/inverted_index/token_stream.h" #include "runtime/exec_env.h" namespace doris::segment_v2::inverted_index { CustomAnalyzer::CustomAnalyzer(Builder* builder) { _tokenizer = builder->_tokenizer; + _char_filters = builder->_char_filters; _token_filters = builder->_token_filters; } TokenStream* CustomAnalyzer::tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) { - class TokenStreamWrapper : public TokenStream { - public: - explicit TokenStreamWrapper(std::shared_ptr ts) : _impl(std::move(ts)) {} - ~TokenStreamWrapper() override = default; - - Token* next(Token* token) override { return _impl->next(token); } - void close() override { _impl->close(); } - void reset() override { _impl->reset(); } - - private: - std::shared_ptr _impl; - }; + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, + "CustomAnalyzer::tokenStream not supported"); +} + +TokenStream* CustomAnalyzer::reusableTokenStream(const TCHAR* fieldName, + lucene::util::Reader* reader) { + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, + "CustomAnalyzer::reusableTokenStream not supported"); +} + +TokenStream* CustomAnalyzer::tokenStream(const TCHAR* fieldName, const ReaderPtr& reader) { + auto r = 
init_reader(reader); auto token_stream = create_components(); - token_stream->set_reader(reader); + token_stream->set_reader(r); token_stream->get_token_stream()->reset(); return new TokenStreamWrapper(token_stream->get_token_stream()); } -TokenStream* CustomAnalyzer::reusableTokenStream(const TCHAR* fieldName, - lucene::util::Reader* reader) { +TokenStream* CustomAnalyzer::reusableTokenStream(const TCHAR* fieldName, const ReaderPtr& reader) { + auto r = init_reader(reader); if (_reuse_token_stream == nullptr) { _reuse_token_stream = create_components(); } - _reuse_token_stream->set_reader(reader); + _reuse_token_stream->set_reader(r); return _reuse_token_stream->get_token_stream().get(); } +ReaderPtr CustomAnalyzer::init_reader(ReaderPtr reader) { + for (const auto& filter : _char_filters) { + reader = filter->create(reader); + } + return reader; +} + TokenStreamComponentsPtr CustomAnalyzer::create_components() { auto tk = _tokenizer->create(); TokenStreamPtr ts = tk; @@ -69,6 +79,9 @@ CustomAnalyzerPtr CustomAnalyzer::build_custom_analyzer(const CustomAnalyzerConf throw Exception(ErrorCode::ILLEGAL_STATE, "Null configuration detected."); } CustomAnalyzer::Builder builder; + for (const auto& filter_config : config->get_char_filter_configs()) { + builder.add_char_filter(filter_config->get_name(), filter_config->get_params()); + } builder.with_tokenizer(config->get_tokenizer_config()->get_name(), config->get_tokenizer_config()->get_params()); for (const auto& filter_config : config->get_token_filter_configs()) { @@ -81,6 +94,10 @@ void CustomAnalyzer::Builder::with_tokenizer(const std::string& name, const Sett _tokenizer = AnalysisFactoryMgr::instance().create(name, params); } +void CustomAnalyzer::Builder::add_char_filter(const std::string& name, const Settings& params) { + _char_filters.push_back(AnalysisFactoryMgr::instance().create(name, params)); +} + void CustomAnalyzer::Builder::add_token_filter(const std::string& name, const Settings& params) { 
_token_filters.push_back( AnalysisFactoryMgr::instance().create(name, params)); @@ -93,7 +110,7 @@ CustomAnalyzerPtr CustomAnalyzer::Builder::build() { return std::make_shared(this); } -void TokenStreamComponents::set_reader(CL_NS(util)::Reader* reader) { +void TokenStreamComponents::set_reader(const ReaderPtr& reader) { _source->set_reader(reader); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h index 7b3bc0444aed1e..cb7294e7e6407d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h @@ -17,18 +17,14 @@ #pragma once -#include "common/exception.h" -#include "olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.h" #include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h" +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/setting.h" #include "olap/rowset/segment_v2/inverted_index/token_filter/token_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h" namespace doris::segment_v2::inverted_index { -class TokenStreamComponents; -using TokenStreamComponentsPtr = std::shared_ptr; - class CustomAnalyzer; using CustomAnalyzerPtr = std::shared_ptr; @@ -40,11 +36,14 @@ class CustomAnalyzer : public Analyzer { ~Builder() = default; void with_tokenizer(const std::string& name, const Settings& params); + void add_char_filter(const std::string& name, const Settings& params); void add_token_filter(const std::string& name, const Settings& params); + CustomAnalyzerPtr build(); private: TokenizerFactoryPtr _tokenizer; + std::vector _char_filters; std::vector _token_filters; friend class CustomAnalyzer; @@ -58,29 +57,20 @@ class CustomAnalyzer : public Analyzer { TokenStream* tokenStream(const TCHAR* fieldName, 
lucene::util::Reader* reader) override; TokenStream* reusableTokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override; + TokenStream* tokenStream(const TCHAR* fieldName, const ReaderPtr& reader) override; + TokenStream* reusableTokenStream(const TCHAR* fieldName, const ReaderPtr& reader) override; + static CustomAnalyzerPtr build_custom_analyzer(const CustomAnalyzerConfigPtr& config); private: + ReaderPtr init_reader(ReaderPtr reader); TokenStreamComponentsPtr create_components(); TokenizerFactoryPtr _tokenizer; + std::vector _char_filters; std::vector _token_filters; TokenStreamComponentsPtr _reuse_token_stream; }; -class TokenStreamComponents { -public: - TokenStreamComponents(TokenizerPtr tokenizer, TokenStreamPtr result) - : _source(std::move(tokenizer)), _sink(std::move(result)) {} - - void set_reader(CL_NS(util)::Reader* reader); - TokenStreamPtr get_token_stream(); - TokenizerPtr get_source(); - -private: - TokenizerPtr _source; - TokenStreamPtr _sink; -}; - } // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.cpp index f1d593ecfa53e0..161d267efb6ea5 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.cpp @@ -23,6 +23,7 @@ namespace doris::segment_v2::inverted_index { CustomAnalyzerConfig::CustomAnalyzerConfig(Builder* builder) { _tokenizer_config = builder->_tokenizer_config; + _char_filters = builder->_char_filters; _token_filters = builder->_token_filters; } @@ -30,6 +31,10 @@ ComponentConfigPtr CustomAnalyzerConfig::get_tokenizer_config() { return _tokenizer_config; } +std::vector CustomAnalyzerConfig::get_char_filter_configs() { + return _char_filters; +} + std::vector CustomAnalyzerConfig::get_token_filter_configs() { 
return _token_filters; } @@ -39,6 +44,11 @@ void CustomAnalyzerConfig::Builder::with_tokenizer_config(const std::string& nam _tokenizer_config = std::make_shared(name, params); } +void CustomAnalyzerConfig::Builder::add_char_filter_config(const std::string& name, + const Settings& params) { + _char_filters.emplace_back(std::make_shared(name, params)); +} + void CustomAnalyzerConfig::Builder::add_token_filter_config(const std::string& name, const Settings& params) { _token_filters.emplace_back(std::make_shared(name, params)); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h index 0df8507e26afa0..134d4ee0d45d78 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h @@ -39,11 +39,13 @@ class CustomAnalyzerConfig { ~Builder() = default; void with_tokenizer_config(const std::string& name, const Settings& params); + void add_char_filter_config(const std::string& name, const Settings& params); void add_token_filter_config(const std::string& name, const Settings& params); CustomAnalyzerConfigPtr build(); private: ComponentConfigPtr _tokenizer_config; + std::vector _char_filters; std::vector _token_filters; friend class CustomAnalyzerConfig; @@ -53,10 +55,12 @@ class CustomAnalyzerConfig { ~CustomAnalyzerConfig() = default; ComponentConfigPtr get_tokenizer_config(); + std::vector get_char_filter_configs(); std::vector get_token_filter_configs(); private: ComponentConfigPtr _tokenizer_config; + std::vector _char_filters; std::vector _token_filters; }; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h index 072cf85bc7d814..ccf27dfc8cb17c 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h +++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h @@ -17,9 +17,9 @@ #pragma once -#include - -#include "icu_tokenizer.h" +#include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter.h" +#include "olap/rowset/segment_v2/inverted_index/token_stream.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer.h" namespace doris::segment_v2 { @@ -37,25 +37,48 @@ class ICUAnalyzer : public Analyzer { void initDict(const std::string& dictPath) override { dictPath_ = dictPath; } TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - auto* tokenizer = _CLNEW ICUTokenizer(_lowercase, _ownReader); - tokenizer->initialize(dictPath_); - tokenizer->reset(reader); - return (TokenStream*)tokenizer; + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, + "ICUAnalyzer::tokenStream not supported"); } TokenStream* reusableTokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - if (tokenizer_ == nullptr) { - tokenizer_ = std::make_unique(_lowercase, _ownReader); - tokenizer_->initialize(dictPath_); + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, + "ICUAnalyzer::reusableTokenStream not supported"); + } + + TokenStream* tokenStream(const TCHAR* fieldName, + const inverted_index::ReaderPtr& reader) override { + auto token_stream = create_components(); + token_stream->set_reader(reader); + token_stream->get_token_stream()->reset(); + return new inverted_index::TokenStreamWrapper(token_stream->get_token_stream()); + } + + TokenStream* reusableTokenStream(const TCHAR* fieldName, + const inverted_index::ReaderPtr& reader) override { + if (_reuse_token_stream == nullptr) { + _reuse_token_stream = create_components(); } - tokenizer_->reset(reader); - return (TokenStream*)tokenizer_.get(); + _reuse_token_stream->set_reader(reader); + return _reuse_token_stream->get_token_stream().get(); }; private: + inverted_index::TokenStreamComponentsPtr 
create_components() { + auto tk = std::make_shared(); + tk->initialize(dictPath_); + inverted_index::TokenStreamPtr ts = tk; + if (_lowercase) { + auto lower_case_filter = std::make_shared(tk); + lower_case_filter->initialize(); + ts = lower_case_filter; + } + return std::make_shared(tk, ts); + } + std::string dictPath_; - std::unique_ptr tokenizer_; + inverted_index::TokenStreamComponentsPtr _reuse_token_stream; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter.h new file mode 100644 index 00000000000000..7e4c3a849f3446 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "common/exception.h" +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" + +namespace doris::segment_v2::inverted_index { + +class DorisCharFilter : public lucene::util::Reader { +public: + DorisCharFilter(ReaderPtr reader) : _reader(std::move(reader)) {} + ~DorisCharFilter() override = default; + + virtual void initialize() = 0; + + int64_t position() override { + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, "CharFilter::position"); + } + + int64_t skip(int64_t ntoskip) override { + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, "CharFilter::skip"); + } + + size_t size() override { + throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED, "CharFilter::size"); + } + +protected: + ReaderPtr _reader; +}; +using CharFilterPtr = std::shared_ptr; + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h index bebbea58f72d86..925f9adaf6e9ed 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h @@ -17,22 +17,18 @@ #pragma once -#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h" +#include "olap/rowset/segment_v2/inverted_index/abstract_analysis_factory.h" +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" -namespace doris { +namespace doris::segment_v2::inverted_index { -static const std::string INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace"; - -class CharFilterFactory { +class CharFilterFactory : public AbstractAnalysisFactory { public: - template - static lucene::analysis::CharFilter* create(const std::string& name, Args&&... 
args) { - DBUG_EXECUTE_IF("CharFilterFactory::create_return_nullptr", { return nullptr; }) - if (name == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) { - return new CharReplaceCharFilter(std::forward(args)...); - } - return nullptr; - } + CharFilterFactory() = default; + ~CharFilterFactory() override = default; + + virtual ReaderPtr create(const ReaderPtr& in) = 0; }; +using CharFilterFactoryPtr = std::shared_ptr; -} // namespace doris \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp index a75bef53d4554c..e2f6b663070daf 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp @@ -19,16 +19,24 @@ #include -namespace doris { +namespace doris::segment_v2::inverted_index { #include "common/compile_check_begin.h" -CharReplaceCharFilter::CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern, - const std::string& replacement) - : CharFilter(in), _replacement(replacement) { - std::for_each(pattern.begin(), pattern.end(), [this](uint8_t c) { _patterns.set(c); }); + +CharReplaceCharFilter::CharReplaceCharFilter(ReaderPtr reader, const std::string& pattern, + std::string replacement) + : DorisCharFilter(std::move(reader)), _replacement(std::move(replacement)) { + std::ranges::for_each(pattern, [this](uint8_t c) { _patterns.set(c); }); +} + +void CharReplaceCharFilter::initialize() { + if (_transformed_input.size() != 0) { + return; + } + fill(); } void CharReplaceCharFilter::init(const void* _value, int32_t _length, bool copyData) { - input_->init(_value, _length, copyData); + _reader->init(_value, _length, copyData); fill(); } @@ -41,8 +49,8 @@ int32_t 
CharReplaceCharFilter::readCopy(void* start, int32_t off, int32_t len) { } void CharReplaceCharFilter::fill() { - _buf.resize(input_->size()); - input_->readCopy(_buf.data(), 0, static_cast(_buf.size())); + _buf.resize(_reader->size()); + _reader->readCopy(_buf.data(), 0, static_cast(_buf.size())); process_pattern(_buf); _transformed_input.init(_buf.data(), static_cast(_buf.size()), false); } @@ -56,5 +64,5 @@ void CharReplaceCharFilter::process_pattern(std::string& buf) { } } -} // namespace doris #include "common/compile_check_end.h" +} // namespace doris::segment_v2::inverted_index diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h index 1e5e6f5d5cedd0..082c80ffc52fde 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h @@ -17,19 +17,19 @@ #pragma once -#include // IWYU pragma: keep -#include - #include -namespace doris { +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter.h" + +namespace doris::segment_v2::inverted_index { -class CharReplaceCharFilter : public lucene::analysis::CharFilter { +class CharReplaceCharFilter : public DorisCharFilter { public: - CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern, - const std::string& replacement); + CharReplaceCharFilter(ReaderPtr in, const std::string& pattern, std::string replacement); ~CharReplaceCharFilter() override = default; + void initialize() override; + void init(const void* _value, int32_t _length, bool copyData) override; int32_t read(const void** start, int32_t min, int32_t max) override; int32_t readCopy(void* start, int32_t off, int32_t len) override; @@ -47,4 +47,4 @@ class CharReplaceCharFilter : public lucene::analysis::CharFilter { lucene::util::SStringReader _transformed_input; }; -} // 
namespace doris \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h new file mode 100644 index 00000000000000..debdb59107176c --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h" + +namespace doris::segment_v2::inverted_index { + +static const std::string CHAR_REPLACE_PATTERN = "pattern"; +static const std::string CHAR_REPLACE_REPLACEMENT = "replacement"; + +static const std::string CHAR_REPLACE_DEFAULT_PATTERN = ",._"; + +class CharReplaceCharFilterFactory : public CharFilterFactory { +public: + CharReplaceCharFilterFactory() = default; + ~CharReplaceCharFilterFactory() override = default; + + void initialize(const Settings& settings) override { + _pattern = settings.get_string(CHAR_REPLACE_PATTERN, CHAR_REPLACE_DEFAULT_PATTERN); + if (_pattern.empty()) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "Missing '${CHAR_REPLACE_PATTERN}' for char_replace filter type"); + } + for (char ch : _pattern) { + unsigned int uc = static_cast(ch); + if (uc > 255) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "Invalid '${CHAR_REPLACE_PATTERN}' for char_replace " + "filter type: each char must " + "be in [0,255]"); + } + } + _replacement = settings.get_string(CHAR_REPLACE_REPLACEMENT, " "); + if (_replacement.size() != 1) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "Invalid '${CHAR_REPLACE_REPLACEMENT}' for char_replace " + "filter type: must be exactly 1 byte"); + } + unsigned int rep = static_cast(_replacement[0]); + if (rep > 255) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "Invalid '${CHAR_REPLACE_REPLACEMENT}' for char_replace " + "filter type: must be in [0,255]"); + } + } + + ReaderPtr create(const ReaderPtr& reader) override { + auto r = std::make_shared(reader, _pattern, _replacement); + r->initialize(); + return r; + } + +private: + std::string _pattern; + std::string _replacement; +}; + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/setting.h 
b/be/src/olap/rowset/segment_v2/inverted_index/setting.h index e06deca5e4f1d6..51782ab0b2de5d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/setting.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/setting.h @@ -22,8 +22,7 @@ #include #include #include -#include -#include +#include #include #include @@ -77,25 +76,65 @@ class Settings { return default_value; } - std::string get_string(const std::string& key) const { + std::string get_string(const std::string& key, const std::string& default_value = "") const { auto it = _args.find(key); if (it != _args.end()) { return it->second; } - return ""; + return default_value; } std::vector get_entry_list(const std::string& key) const { + static const boost::regex sep(R"((?<=\])\s*,\s*(?=\[))"); std::vector lists; auto it = _args.find(key); if (it != _args.end()) { - static std::regex pattern(R"(\[([^\]]+)\])"); - std::smatch match; - std::sregex_iterator iter(it->second.begin(), it->second.end(), pattern); - std::sregex_iterator end; - for (; iter != end; ++iter) { - if (iter->size() > 1) { - lists.emplace_back((*iter)[1].str()); + std::string trimmed_input = boost::algorithm::trim_copy(it->second); + if (trimmed_input.empty()) { + return lists; + } + + auto validate_single = [&](const std::string& item, const std::string& prefix) { + if (item.size() < 2 || item.front() != '[' || item.back() != ']') { + throw Exception(ErrorCode::INVALID_ARGUMENT, + prefix + key + " must be enclosed in []"); + } + int depth = 0; + for (size_t i = 0; i + 1 < item.size(); ++i) { + char c = item[i]; + if (c == '[') { + ++depth; + } else if (c == ']') { + --depth; + if (depth == 0) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + prefix + key + " must be enclosed in []"); + } + } + } + }; + + if (boost::regex_search(trimmed_input, sep)) { + boost::sregex_token_iterator regex_it(trimmed_input.begin(), trimmed_input.end(), + sep, -1); + boost::sregex_token_iterator end; + for (; regex_it != end; ++regex_it) { + std::string item = 
boost::algorithm::trim_copy(regex_it->str()); + validate_single(item, "Each item in "); + std::string content = item.substr(1, item.size() - 2); + if (!content.empty()) { + lists.emplace_back(content); + } + } + } else { + if (trimmed_input.size() < 2 || trimmed_input.front() != '[' || + trimmed_input.back() != ']') { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "Item in " + key + " must be enclosed in []"); + } + std::string content = trimmed_input.substr(1, trimmed_input.size() - 2); + if (!content.empty()) { + lists.emplace_back(content); } } } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/token_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/token_filter.h index 41631ae6e44a19..0271db5a39292e 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/token_filter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/token_filter.h @@ -19,8 +19,6 @@ #include "olap/rowset/segment_v2/inverted_index/token_stream.h" -using TokenStreamPtr = std::shared_ptr; - namespace doris::segment_v2::inverted_index { class DorisTokenFilter : public TokenFilter, public DorisTokenStream { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h index a125150bcadd89..9e0e60b73a0ed9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h @@ -17,6 +17,8 @@ #pragma once +#include + #include "olap/rowset/segment_v2/inverted_index/setting.h" #include "token_filter_factory.h" #include "word_delimiter_filter.h" diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_stream.h b/be/src/olap/rowset/segment_v2/inverted_index/token_stream.h index c2850f779922a3..b352a1f2cc7b00 100644 --- 
a/be/src/olap/rowset/segment_v2/inverted_index/token_stream.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/token_stream.h @@ -21,19 +21,22 @@ #include #include -#include #include "CLucene.h" #include "CLucene/analysis/AnalysisHeader.h" #include "common/cast_set.h" -#include "common/exception.h" -#include "common/logging.h" +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" using namespace lucene::analysis; namespace doris::segment_v2::inverted_index { #include "common/compile_check_begin.h" +class DorisTokenizer; +using TokenizerPtr = std::shared_ptr; + +using TokenStreamPtr = std::shared_ptr; + /** * All custom tokenizers and token_filters must use the following functions * to set token information. Using these unified set methods helps avoid @@ -59,5 +62,33 @@ class DorisTokenStream { void set_position_increment(Token* t, int32_t pos) { t->setPositionIncrement(pos); } }; +class TokenStreamWrapper : public TokenStream { +public: + explicit TokenStreamWrapper(std::shared_ptr ts) : _impl(std::move(ts)) {} + ~TokenStreamWrapper() override = default; + + Token* next(Token* token) override { return _impl->next(token); } + void close() override { _impl->close(); } + void reset() override { _impl->reset(); } + +private: + std::shared_ptr _impl; +}; + +class TokenStreamComponents { +public: + TokenStreamComponents(TokenizerPtr tokenizer, TokenStreamPtr result) + : _source(std::move(tokenizer)), _sink(std::move(result)) {} + + void set_reader(const ReaderPtr& reader); + TokenStreamPtr get_token_stream(); + TokenizerPtr get_source(); + +private: + TokenizerPtr _source; + TokenStreamPtr _sink; +}; +using TokenStreamComponentsPtr = std::shared_ptr; + }; // namespace doris::segment_v2::inverted_index #include "common/compile_check_end.h" \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.cpp similarity index 73% 
rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.cpp index 0679fdbdd26da8..2ac699dae3fffb 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.cpp @@ -19,7 +19,7 @@ #include -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { #include "common/compile_check_begin.h" #define IS_IN_RANGE(c, start, end) ((uint32_t)((c) - (start)) <= ((end) - (start))) @@ -29,14 +29,12 @@ namespace doris::segment_v2 { IS_IN_RANGE(c, 0x20000, 0x2A6DF) || IS_IN_RANGE(c, 0x2A700, 0x2EBEF) || \ IS_IN_RANGE(c, 0x30000, 0x3134A)) -BasicTokenizer::BasicTokenizer() { - this->lowercase = false; - this->ownReader = false; +BasicTokenizer::BasicTokenizer(bool own_reader) { + this->ownReader = own_reader; } -BasicTokenizer::BasicTokenizer(bool lower_case, bool own_reader) : BasicTokenizer() { - this->lowercase = lower_case; - this->ownReader = own_reader; +void BasicTokenizer::initialize(BasicTokenizerMode mode) { + _mode = mode; } Token* BasicTokenizer::next(Token* token) { @@ -50,21 +48,28 @@ Token* BasicTokenizer::next(Token* token) { return token; } -void BasicTokenizer::reset(lucene::util::Reader* reader) { +void BasicTokenizer::reset() { + DorisTokenizer::reset(); + _buffer_index = 0; _data_len = 0; _tokens_text.clear(); - _buffer.resize(reader->size()); - size_t numRead = reader->readCopy(_buffer.data(), 0, static_cast(_buffer.size())); + _buffer.resize(_in->size()); + size_t numRead = _in->readCopy(_buffer.data(), 0, static_cast(_buffer.size())); (void)numRead; assert(_buffer.size() == numRead); - cut(); + if (_mode == BasicTokenizerMode::L1) { + cut(); + } else if (_mode == BasicTokenizerMode::L2) { + cut(); + } _data_len = static_cast(_tokens_text.size()); } +template void BasicTokenizer::cut() { auto* s = 
(uint8_t*)_buffer.data(); auto length = static_cast(_buffer.size()); @@ -97,7 +102,15 @@ void BasicTokenizer::cut() { continue; } - if (IS_CHINESE_CHAR(c)) { + if constexpr (mode == BasicTokenizerMode::L1) { + if (IS_CHINESE_CHAR(c)) { + const int32_t len = i - prev_i; + _tokens_text.emplace_back(reinterpret_cast(s + prev_i), len); + } + } else if constexpr (mode == BasicTokenizerMode::L2) { + if (u_hasBinaryProperty(c, UCHAR_WHITE_SPACE)) { + continue; + } const int32_t len = i - prev_i; _tokens_text.emplace_back(reinterpret_cast(s + prev_i), len); } @@ -106,4 +119,4 @@ void BasicTokenizer::cut() { } #include "common/compile_check_end.h" -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.h similarity index 65% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.h index e07a5e37d78a9a..e317de55fa81d5 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.h @@ -17,32 +17,38 @@ #pragma once -#include - -#include "CLucene.h" -#include "CLucene/analysis/AnalysisHeader.h" -#include "CLucene/analysis/icu/ICUCommon.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h" using namespace lucene::analysis; -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { + +enum class BasicTokenizerMode { + L1 = 1, // English + numbers + Chinese tokenization + L2 = 2 // L1 + all Unicode characters tokenized +}; -class BasicTokenizer : public Tokenizer { +class BasicTokenizer : public DorisTokenizer { public: - BasicTokenizer(); - BasicTokenizer(bool 
lowercase, bool ownReader); + BasicTokenizer() = default; + BasicTokenizer(bool own_reader); ~BasicTokenizer() override = default; + void initialize(BasicTokenizerMode mode); + Token* next(Token* token) override; - void reset(lucene::util::Reader* reader) override; + void reset() override; +private: + template void cut(); -private: int32_t _buffer_index = 0; int32_t _data_len = 0; std::string _buffer; std::vector _tokens_text; + + BasicTokenizerMode _mode = BasicTokenizerMode::L1; }; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h new file mode 100644 index 00000000000000..58aa43dbc06762 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "basic_tokenizer.h" +#include "common/exception.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h" + +namespace doris::segment_v2::inverted_index { + +class BasicTokenizerFactory : public TokenizerFactory { +public: + BasicTokenizerFactory() = default; + ~BasicTokenizerFactory() override = default; + + void initialize(const Settings& settings) override { + int32_t mode = settings.get_int("mode", static_cast(BasicTokenizerMode::L1)); + if (mode < 1 || mode > 2) { + throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid mode for basic tokenizer: {}", + mode); + } + _mode = static_cast(mode); + } + + TokenizerPtr create() override { + auto tokenzier = std::make_shared(); + tokenzier->initialize(_mode); + return tokenzier; + } + +private: + BasicTokenizerMode _mode = BasicTokenizerMode::L1; +}; + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/break_iterator_wrapper.cpp similarity index 97% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/break_iterator_wrapper.cpp index 50094e54f7bf6c..0ed63c28a152e6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/break_iterator_wrapper.cpp @@ -25,7 +25,7 @@ #include "icu_common.h" #include "icu_tokenizer_config.h" -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { icu::UnicodeSet BreakIteratorWrapper::EMOJI_RK; icu::UnicodeSet BreakIteratorWrapper::EMOJI; @@ -104,4 +104,4 @@ void BreakIteratorWrapper::set_text(const UChar* text, int32_t start, int32_t le status_ = UBRK_WORD_NONE; } -} // namespace doris::segment_v2 \ No newline at end 
of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/break_iterator_wrapper.h similarity index 94% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/break_iterator_wrapper.h index dea60d1d1f7fad..554b02c9eaf4f0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/break_iterator_wrapper.h @@ -25,7 +25,7 @@ #include "icu_common.h" -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { class BreakIteratorWrapper { public: @@ -51,4 +51,4 @@ class BreakIteratorWrapper { }; using BreakIteratorWrapperPtr = std::unique_ptr; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/composite_break_iterator.cpp similarity index 97% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/composite_break_iterator.cpp index e178ad35c13917..5a4d56d11e6cc0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/composite_break_iterator.cpp @@ -21,7 +21,7 @@ #include -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { CompositeBreakIterator::CompositeBreakIterator(const ICUTokenizerConfigPtr& config) : config_(config) { @@ -80,4 +80,4 @@ BreakIteratorWrapper* 
CompositeBreakIterator::get_break_iterator(int32_t scriptC return word_breakers_[scriptCode].get(); } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/composite_break_iterator.h similarity index 94% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/composite_break_iterator.h index 8599be88dc2ce1..4d5e2a6b4d6bea 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/composite_break_iterator.h @@ -29,7 +29,7 @@ #include "icu_tokenizer_config.h" #include "script_iterator.h" -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { class CompositeBreakIterator { public: @@ -55,4 +55,4 @@ class CompositeBreakIterator { }; using CompositeBreakIteratorPtr = std::unique_ptr; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/default_icu_tokenizer_config.cpp similarity index 98% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/default_icu_tokenizer_config.cpp index dfbcf2dcdf65a0..4d9d8d9b2a73c1 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/default_icu_tokenizer_config.cpp @@ -24,7 +24,7 @@ #include 
#include -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { BreakIteratorPtr DefaultICUTokenizerConfig::cjk_break_iterator_; BreakIteratorPtr DefaultICUTokenizerConfig::default_break_iterator_; @@ -125,4 +125,4 @@ void DefaultICUTokenizerConfig::read_break_iterator(BreakIteratorPtr& rbbi, } } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/default_icu_tokenizer_config.h similarity index 94% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/default_icu_tokenizer_config.h index 6500cf230ebb03..21e9359e5a3288 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/default_icu_tokenizer_config.h @@ -19,7 +19,7 @@ #include "icu_tokenizer_config.h" -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { class DefaultICUTokenizerConfig : public ICUTokenizerConfig { public: @@ -41,4 +41,4 @@ class DefaultICUTokenizerConfig : public ICUTokenizerConfig { bool myanmar_as_words_ = false; }; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_common.h similarity index 93% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_common.h index 1cdffab48d3dd5..0066e26ec9b243 100644 --- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_common.h @@ -31,7 +31,7 @@ #include "unicode/utext.h" #include "unicode/utf8.h" -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { using BreakIteratorPtr = std::unique_ptr; @@ -45,4 +45,4 @@ struct UTextDeleter { using UTextPtr = std::unique_ptr; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer.cpp similarity index 84% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer.cpp index 670ae6c2d08d38..e8723ff077e5fd 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer.cpp @@ -22,18 +22,15 @@ #include #include -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { #include "common/compile_check_begin.h" -ICUTokenizer::ICUTokenizer() { - this->lowercase = false; - this->ownReader = false; +ICUTokenizer::ICUTokenizer() { config_ = std::make_shared(true, true); breaker_ = std::make_unique(config_); } -ICUTokenizer::ICUTokenizer(bool lower_case, bool own_reader) : ICUTokenizer() { - this->lowercase = lower_case; +ICUTokenizer::ICUTokenizer(bool own_reader) : ICUTokenizer() { this->ownReader = own_reader; } @@ -69,9 +66,10 @@ Token* ICUTokenizer::next(Token* token) { return token; } -void ICUTokenizer::reset(lucene::util::Reader* reader) { +void ICUTokenizer::reset() { + DorisTokenizer::reset(); const char* buf = nullptr; - int32_t len = reader->read((const void**)&buf, 0, static_cast(reader->size())); + 
int32_t len = _in->read((const void**)&buf, 0, static_cast(_in->size())); buffer_ = icu::UnicodeString::fromUTF8(icu::StringPiece(buf, len)); if (!buffer_.isEmpty() && buffer_.isBogus()) { _CLTHROWT(CL_ERR_Runtime, "Failed to convert UTF-8 string to UnicodeString."); @@ -79,5 +77,5 @@ void ICUTokenizer::reset(lucene::util::Reader* reader) { breaker_->set_text(buffer_.getBuffer(), 0, buffer_.length()); } -} // namespace doris::segment_v2 -#include "common/compile_check_end.h" \ No newline at end of file +#include "common/compile_check_end.h" +} // namespace doris::segment_v2::inverted_index diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer.h similarity index 82% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer.h index d11d0c67ed6b7c..bad250ea6e866f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer.h @@ -19,25 +19,24 @@ #include -#include "CLucene.h" -#include "CLucene/analysis/AnalysisHeader.h" #include "composite_break_iterator.h" #include "default_icu_tokenizer_config.h" #include "icu_common.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h" using namespace lucene::analysis; -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { -class ICUTokenizer : public Tokenizer { +class ICUTokenizer : public DorisTokenizer { public: ICUTokenizer(); - ICUTokenizer(bool lowercase, bool ownReader); + ICUTokenizer(bool ownReader); ~ICUTokenizer() override = default; void initialize(const std::string& dictPath); Token* next(Token* token) override; - void reset(lucene::util::Reader* reader) override; + void reset() override; private: std::string utf8Str_; @@ -47,4 +46,4 @@ class ICUTokenizer : 
public Tokenizer { CompositeBreakIteratorPtr breaker_; }; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_config.h similarity index 93% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_config.h index dd7b743e74b944..af3a1f3bee36e3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_config.h @@ -19,7 +19,7 @@ #include "icu_common.h" -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { class ICUTokenizerConfig { public: @@ -34,4 +34,4 @@ class ICUTokenizerConfig { }; using ICUTokenizerConfigPtr = std::shared_ptr; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_factory.h new file mode 100644 index 00000000000000..f750d652ad7f9a --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_factory.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "icu_tokenizer.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h" + +namespace doris::segment_v2::inverted_index { + +class ICUTokenizerFactory : public TokenizerFactory { +public: + ICUTokenizerFactory() = default; + ~ICUTokenizerFactory() override = default; + + void initialize(const Settings& settings) override {} + + TokenizerPtr create() override { + auto tokenizer = std::make_shared(); + tokenizer->initialize(config::inverted_index_dict_path + "/icu"); + return tokenizer; + } +}; + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/script_iterator.cpp similarity index 97% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/script_iterator.cpp index 7fee3055d3bc87..c742991d1f07df 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/script_iterator.cpp @@ -22,7 +22,7 @@ #include #include -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { std::vector ScriptIterator::k_basic_latin(128); @@ -118,4 +118,4 @@ bool ScriptIterator::is_combining_mark(UChar32 codepoint) { type == U_ENCLOSING_MARK); } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace 
doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/script_iterator.h similarity index 95% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h rename to be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/script_iterator.h index bc93eea8670409..3db78d25c71891 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/icu/script_iterator.h @@ -25,7 +25,7 @@ #include "icu_common.h" -namespace doris::segment_v2 { +namespace doris::segment_v2::inverted_index { class ScriptIterator { public: @@ -61,4 +61,4 @@ class ScriptIterator { }; using ScriptIteratorPtr = std::unique_ptr; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_impl.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_impl.h index b707e9add33ff6..2b27f6b4756a1f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_impl.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_impl.h @@ -24,8 +24,7 @@ #include #include -#include "CLucene.h" -#include "CLucene/analysis/AnalysisHeader.h" +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" namespace doris::segment_v2::inverted_index { #include "common/compile_check_begin.h" @@ -168,7 +167,7 @@ class StandardTokenizerImpl { return {_zz_buffer.data() + _zz_start_read, (size_t)(_zz_marked_pos - _zz_start_read)}; } - inline void yyreset(CL_NS(util)::Reader* reader) { + inline void yyreset(const ReaderPtr& reader) { _zz_reader = reader; _zz_at_eof = false; _zz_current_pos = 
0; @@ -283,7 +282,7 @@ class StandardTokenizerImpl { static const int32_t ZZ_BUFFERSIZE = 255; - CL_NS(util)::Reader* _zz_reader = nullptr; + ReaderPtr _zz_reader; std::string _zz_buffer; int32_t _zz_state = 0; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h index 6a7119b5fc56aa..8b7898f833702d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h @@ -22,8 +22,7 @@ #include #include "olap/rowset/segment_v2/inverted_index/token_stream.h" - -using TokenStreamPtr = std::shared_ptr; +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" namespace doris::segment_v2::inverted_index { @@ -32,7 +31,7 @@ class DorisTokenizer : public Tokenizer, public DorisTokenStream { DorisTokenizer() = default; ~DorisTokenizer() override = default; - void set_reader(CL_NS(util)::Reader* in) { + void set_reader(const ReaderPtr& in) { if (in == nullptr) { throw Exception(ErrorCode::INVALID_ARGUMENT, "reader must not be null"); } @@ -44,8 +43,8 @@ class DorisTokenizer : public Tokenizer, public DorisTokenStream { void reset() override { _in = _in_pending; }; protected: - CL_NS(util)::Reader* _in = nullptr; - CL_NS(util)::Reader* _in_pending = nullptr; + ReaderPtr _in; + ReaderPtr _in_pending; }; using TokenizerPtr = std::shared_ptr; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/util/reader.h b/be/src/olap/rowset/segment_v2/inverted_index/util/reader.h new file mode 100644 index 00000000000000..27428f491b470b --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/util/reader.h @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "CLucene.h" +#include "CLucene/util/CLStreams.h" + +namespace doris::segment_v2::inverted_index { + +using ReaderPtr = std::shared_ptr; + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 3a862367cad29b..13aa3356f2daae 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -22,7 +22,6 @@ #include "olap/rowset/segment_v2/inverted_index_common.h" #include "olap/rowset/segment_v2/inverted_index_fs_directory.h" #include "olap/tablet_schema.h" -#include "olap/types.h" #include "util/faststring.h" namespace doris::segment_v2 { @@ -110,8 +109,8 @@ Status InvertedIndexColumnWriter::init_bkd_index() { } template -Result> -InvertedIndexColumnWriter::create_char_string_reader(CharFilterMap& char_filter_map) { +Result InvertedIndexColumnWriter::create_char_string_reader( + CharFilterMap& char_filter_map) { try { return inverted_index::InvertedIndexAnalyzer::create_reader(char_filter_map); } catch (CLuceneError& e) { @@ -339,7 +338,7 @@ void InvertedIndexColumnWriter::new_char_token_stream(const char* s, _CLTHROWA(CL_ERR_UnsupportedOperation, "UnsupportedOperationException: CLStream::init"); }) - auto* stream = 
_analyzer->reusableTokenStream(field->name(), _char_string_reader.get()); + auto* stream = _analyzer->reusableTokenStream(field->name(), _char_string_reader); field->setValue(stream); } @@ -408,6 +407,7 @@ Status InvertedIndexColumnWriter::add_array_values(size_t field_size return Status::InternalError("index writer is null in inverted index writer"); } size_t start_off = 0; + std::vector keep_readers; for (size_t i = 0; i < count; ++i) { // nullmap & value ptr-array may not from offsets[i] because olap_convertor make offsets accumulate from _base_offset which may not is 0, but nullmap & value in this segment is from 0, we only need // every single array row element size to go through the nullmap & value ptr-array, and also can go through the every row in array to keep with _rid++ @@ -447,15 +447,13 @@ Status InvertedIndexColumnWriter::add_array_values(size_t field_size // in this case stream need to delete after add_document, because the // stream can not reuse for different field bool own_token_stream = true; - bool own_reader = true; - std::unique_ptr char_string_reader = DORIS_TRY( + ReaderPtr char_string_reader = DORIS_TRY( create_char_string_reader(_inverted_index_ctx->char_filter_map)); char_string_reader->init(v->get_data(), cast_set(v->get_size()), false); - _analyzer->set_ownReader(own_reader); - ts = _analyzer->tokenStream(new_field->name(), - char_string_reader.release()); + ts = _analyzer->tokenStream(new_field->name(), char_string_reader); new_field->setValue(ts, own_token_stream); + keep_readers.emplace_back(std::move(char_string_reader)); } else { new_field_char_value(v->get_data(), v->get_size(), new_field.get()); } @@ -507,6 +505,7 @@ Status InvertedIndexColumnWriter::add_array_values(size_t field_size _doc->clear(); } _rid++; + keep_readers.clear(); } } else if constexpr (field_is_numeric_type(field_type)) { size_t start_off = 0; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.h index 361334c09bc322..5835b09b0d6c22 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -28,6 +28,7 @@ #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/index_file_writer.h" #include "olap/rowset/segment_v2/index_writer.h" +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" namespace doris { @@ -38,6 +39,8 @@ struct CppTypeTraits; namespace segment_v2 { +using namespace doris::segment_v2::inverted_index; + template class InvertedIndexColumnWriter : public IndexColumnWriter { public: @@ -50,8 +53,7 @@ class InvertedIndexColumnWriter : public IndexColumnWriter { Status init() override; void close_on_error() override; Status init_bkd_index(); - Result> create_char_string_reader( - CharFilterMap& char_filter_map); + Result create_char_string_reader(CharFilterMap& char_filter_map); Status open_index_directory(); std::unique_ptr create_index_writer(); Status create_field(lucene::document::Field** field); @@ -93,7 +95,7 @@ class InvertedIndexColumnWriter : public IndexColumnWriter { std::unique_ptr _index_writer = nullptr; std::shared_ptr _analyzer = nullptr; std::unique_ptr _similarity = nullptr; - std::unique_ptr _char_string_reader = nullptr; + ReaderPtr _char_string_reader = nullptr; std::shared_ptr _bkd_writer = nullptr; InvertedIndexCtxSPtr _inverted_index_ctx = nullptr; const KeyCoder* _value_key_coder; diff --git a/be/src/runtime/index_policy/index_policy_mgr.cpp b/be/src/runtime/index_policy/index_policy_mgr.cpp index f032c976999a84..035dd19f35e802 100644 --- a/be/src/runtime/index_policy/index_policy_mgr.cpp +++ b/be/src/runtime/index_policy/index_policy_mgr.cpp @@ -130,44 +130,66 @@ segment_v2::inverted_index::CustomAnalyzerPtr IndexPolicyMgr::get_policy_by_name builder.with_tokenizer_config(tokenzier_name, {}); } + // Process char filters + process_filter_configs(index_policy_analyzer, 
PROP_CHAR_FILTER, "char filter", + [&builder](const std::string& name, + const segment_v2::inverted_index::Settings& settings) { + builder.add_char_filter_config(name, settings); + }); + // Process token filters - auto token_filter_it = index_policy_analyzer.properties.find(PROP_TOKEN_FILTER); - if (token_filter_it != index_policy_analyzer.properties.end()) { - std::vector token_filter_strs; - boost::split(token_filter_strs, token_filter_it->second, boost::is_any_of(",")); - - for (auto& filter_name : token_filter_strs) { - boost::trim(filter_name); - if (filter_name.empty()) { - continue; - } + process_filter_configs(index_policy_analyzer, PROP_TOKEN_FILTER, "token filter", + [&builder](const std::string& name, + const segment_v2::inverted_index::Settings& settings) { + builder.add_token_filter_config(name, settings); + }); - if (_name_to_id.contains(filter_name)) { - // Nested token filter policy - const auto& filter_policy = _policys[_name_to_id[filter_name]]; - auto type_it = filter_policy.properties.find(PROP_TYPE); - if (type_it == filter_policy.properties.end()) { - throw Exception(ErrorCode::INVALID_ARGUMENT, - "Invalid token filter configuration in policy: " + filter_name); - } + auto custom_analyzer_config = builder.build(); + return segment_v2::inverted_index::CustomAnalyzer::build_custom_analyzer( + custom_analyzer_config); +} + +void IndexPolicyMgr::process_filter_configs( + const TIndexPolicy& index_policy_analyzer, const std::string& prop_name, + const std::string& error_prefix, + std::function + add_config_func) { + auto filter_it = index_policy_analyzer.properties.find(prop_name); + if (filter_it == index_policy_analyzer.properties.end()) { + return; + } + + std::vector filter_strs; + boost::split(filter_strs, filter_it->second, boost::is_any_of(",")); - segment_v2::inverted_index::Settings settings; - for (const auto& prop : filter_policy.properties) { - if (prop.first != PROP_TYPE) { - settings.set(prop.first, prop.second); - } + for (auto& 
filter_name : filter_strs) { + boost::trim(filter_name); + if (filter_name.empty()) { + continue; + } + + if (_name_to_id.contains(filter_name)) { + // Nested filter policy + const auto& filter_policy = _policys[_name_to_id[filter_name]]; + auto type_it = filter_policy.properties.find(PROP_TYPE); + if (type_it == filter_policy.properties.end()) { + throw Exception( + ErrorCode::INVALID_ARGUMENT, + "Invalid " + error_prefix + " configuration in policy: " + filter_name); + } + + segment_v2::inverted_index::Settings settings; + for (const auto& prop : filter_policy.properties) { + if (prop.first != PROP_TYPE) { + settings.set(prop.first, prop.second); } - builder.add_token_filter_config(type_it->second, settings); - } else { - // Simple token filter - builder.add_token_filter_config(filter_name, {}); } + add_config_func(type_it->second, settings); + } else { + // Simple filter + add_config_func(filter_name, {}); } } - - auto custom_analyzer_config = builder.build(); - return segment_v2::inverted_index::CustomAnalyzer::build_custom_analyzer( - custom_analyzer_config); } } // namespace doris \ No newline at end of file diff --git a/be/src/runtime/index_policy/index_policy_mgr.h b/be/src/runtime/index_policy/index_policy_mgr.h index aa0b25a0448bf7..707270930fe4cb 100644 --- a/be/src/runtime/index_policy/index_policy_mgr.h +++ b/be/src/runtime/index_policy/index_policy_mgr.h @@ -40,9 +40,16 @@ class IndexPolicyMgr { private: constexpr static auto PROP_TOKENIZER = "tokenizer"; + constexpr static auto PROP_CHAR_FILTER = "char_filter"; constexpr static auto PROP_TOKEN_FILTER = "token_filter"; constexpr static auto PROP_TYPE = "type"; + void process_filter_configs( + const TIndexPolicy& index_policy_analyzer, const std::string& prop_name, + const std::string& error_prefix, + std::function + add_config_func); + std::shared_mutex _mutex; Policys _policys; diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index 
151c0bb1e0b81c..02e597169ef636 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -105,8 +105,8 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, auto reader = InvertedIndexAnalyzer::create_reader(inverted_index_ctx.char_filter_map); reader->init(tokenize_str.data, (int)tokenize_str.size, true); - auto analyzer_tokens = InvertedIndexAnalyzer::get_analyse_result( - reader.get(), inverted_index_ctx.analyzer); + auto analyzer_tokens = + InvertedIndexAnalyzer::get_analyse_result(reader, inverted_index_ctx.analyzer); rapidjson::Document doc; doc.SetArray(); diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index 3875a73a0c5f66..33c8e34eb3b565 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -188,7 +188,7 @@ std::vector FunctionMatchBase::analyse_query_str_token( inverted_index_ctx->char_filter_map); reader->init(match_query_str.data(), (int)match_query_str.size(), true); query_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer); + reader, inverted_index_ctx->analyzer); return query_tokens; } @@ -211,7 +211,7 @@ inline std::vector FunctionMatchBase::analyse_data_token( data_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer); + reader, inverted_index_ctx->analyzer); } } else { const auto& str_ref = string_col->get_data_at(current_block_row_idx); @@ -224,7 +224,7 @@ inline std::vector FunctionMatchBase::analyse_data_token( reader->init(str_ref.data, (int)str_ref.size, true); data_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer); + reader, inverted_index_ctx->analyzer); } } return data_tokens; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp 
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp index 4cd6d180a2e911..eae5e59ffb2e62 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp @@ -35,11 +35,11 @@ class ICUTokenizerTest : public ::testing::Test { analyzer.initDict("./be/dict/icu"); analyzer.set_lowercase(false); - lucene::util::SStringReader reader; - reader.init(s.data(), s.size(), false); + auto reader = std::make_shared>(); + reader->init(s.data(), s.size(), false); - std::unique_ptr tokenizer; - tokenizer.reset((ICUTokenizer*)analyzer.tokenStream(L"", &reader)); + std::unique_ptr tokenizer; + tokenizer.reset((inverted_index::ICUTokenizer*)analyzer.tokenStream(L"", reader)); Token t; while (tokenizer->next(&t)) { @@ -572,4 +572,55 @@ TEST_F(ICUTokenizerTest, TestICUScriptExtensions) { } } +TEST_F(ICUTokenizerTest, TestICUAnalyzerCreateComponentsWithLowercase) { + std::vector datas; + + ICUAnalyzer analyzer; + analyzer.initDict("./be/dict/icu"); + analyzer.set_lowercase(true); + + std::string text = "Mixed Case TEXT with Numbers 123."; + auto reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + + auto token_stream = analyzer.tokenStream(L"", reader); + ASSERT_NE(token_stream, nullptr); + + Token t; + while (token_stream->next(&t)) { + std::string term(t.termBuffer(), t.termLength()); + datas.emplace_back(term); + } + + std::vector expected = {"mixed", "case", "text", "with", "numbers", "123"}; + ASSERT_EQ(datas.size(), expected.size()); + for (size_t i = 0; i < datas.size(); i++) { + ASSERT_EQ(datas[i], expected[i]); + } + + delete token_stream; +} + +TEST_F(ICUTokenizerTest, TestICUAnalyzerTokenStreamThrowsException) { + ICUAnalyzer analyzer; + analyzer.initDict("./be/dict/icu"); + + std::string text = "Hello World!"; + auto reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + + EXPECT_THROW({ 
analyzer.tokenStream(L"", reader.get()); }, Exception); +} + +TEST_F(ICUTokenizerTest, TestICUAnalyzerReusableTokenStreamThrowsException) { + ICUAnalyzer analyzer; + analyzer.initDict("./be/dict/icu"); + + std::string text = "Hello World!"; + auto reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + + EXPECT_THROW({ analyzer.reusableTokenStream(L"", reader.get()); }, Exception); +} + } // namespace doris::segment_v2 diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp index 6dba8233a2ed91..6b36abcd56db72 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp @@ -31,11 +31,11 @@ std::vector tokenize(const std::string& s, bool lowercase = false) BasicAnalyzer analyzer; analyzer.set_lowercase(lowercase); - lucene::util::SStringReader reader; - reader.init(s.data(), s.size(), false); + auto reader = std::make_shared>(); + reader->init(s.data(), s.size(), false); - std::unique_ptr tokenizer; - tokenizer.reset((BasicTokenizer*)analyzer.tokenStream(L"", &reader)); + std::unique_ptr tokenizer; + tokenizer.reset((inverted_index::BasicTokenizer*)analyzer.tokenStream(L"", reader)); Token t; while (tokenizer->next(&t)) { diff --git a/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/custom_analyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/custom_analyzer_test.cpp index f71a48ee67c6cf..0f18b4d5e600b4 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/custom_analyzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/custom_analyzer_test.cpp @@ -94,12 +94,12 @@ class CustomAnalyzerTest : public ::testing::Test { }; int32_t tokenize(const CustomAnalyzerPtr& custom_analyzer, const std::vector& lines) { - lucene::util::SStringReader reader; + 
auto reader = std::make_shared>(); size_t total_count = 0; Token t; for (size_t i = 0; i < lines.size(); ++i) { - reader.init(lines[i].data(), lines[i].size(), false); - auto* token_stream = custom_analyzer->reusableTokenStream(L"", &reader); + reader->init(lines[i].data(), lines[i].size(), false); + auto* token_stream = custom_analyzer->reusableTokenStream(L"", reader); token_stream->reset(); while (token_stream->next(&t)) { total_count++; @@ -120,9 +120,9 @@ struct ExpectedToken { std::vector tokenize1(const CustomAnalyzerPtr& custom_analyzer, const std::string line) { std::vector results; - lucene::util::SStringReader reader; - reader.init(line.data(), line.size(), false); - auto* token_stream = custom_analyzer->reusableTokenStream(L"", &reader); + auto reader = std::make_shared>(); + reader->init(line.data(), line.size(), false); + auto* token_stream = custom_analyzer->reusableTokenStream(L"", reader); token_stream->reset(); Token t; while (token_stream->next(&t)) { @@ -196,9 +196,69 @@ TEST_F(CustomAnalyzerTest, CustomNgramAnalyzer) { } } +TEST_F(CustomAnalyzerTest, TokenStreamNotSupported) { + CustomAnalyzerConfig::Builder builder; + builder.with_tokenizer_config("standard", {}); + auto custom_analyzer_config = builder.build(); + auto custom_analyzer = CustomAnalyzer::build_custom_analyzer(custom_analyzer_config); + + auto reader = std::make_shared>(); + reader->init("test content", 12, false); + + EXPECT_THROW({ custom_analyzer->tokenStream(L"field", reader.get()); }, Exception); + + EXPECT_THROW({ custom_analyzer->reusableTokenStream(L"field", reader.get()); }, Exception); +} + +TEST_F(CustomAnalyzerTest, ReusableTokenStreamNotSupported) { + CustomAnalyzerConfig::Builder builder; + builder.with_tokenizer_config("standard", {}); + auto custom_analyzer_config = builder.build(); + auto custom_analyzer = CustomAnalyzer::build_custom_analyzer(custom_analyzer_config); + + auto reader = std::make_shared>(); + reader->init("test content", 12, false); + + 
EXPECT_THROW({ custom_analyzer->reusableTokenStream(L"field", reader.get()); }, Exception); + + try { + custom_analyzer->reusableTokenStream(L"field", reader.get()); + FAIL() << "Expected Exception to be thrown"; + } catch (const Exception& e) { + EXPECT_EQ(e.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); + EXPECT_STREQ(e.what(), "[E-6001] CustomAnalyzer::reusableTokenStream not supported"); + } +} + +TEST_F(CustomAnalyzerTest, TokenStreamWithReaderPtr) { + CustomAnalyzerConfig::Builder builder; + builder.with_tokenizer_config("standard", {}); + builder.add_token_filter_config("lowercase", {}); + auto custom_analyzer_config = builder.build(); + auto custom_analyzer = CustomAnalyzer::build_custom_analyzer(custom_analyzer_config); + + auto reader = std::make_shared>(); + reader->init("Hello World Test", 16, false); + + auto* token_stream = custom_analyzer->tokenStream(L"field", reader); + EXPECT_NE(token_stream, nullptr); + + Token t; + std::vector tokens; + token_stream->reset(); + while (token_stream->next(&t)) { + tokens.emplace_back(std::string(t.termBuffer(), t.termLength())); + } + + std::vector expected = {"hello", "world", "test"}; + EXPECT_EQ(tokens, expected); + + delete token_stream; +} + // TEST_F(CustomAnalyzerTest, test) { // std::string name = "name"; -// std::string path = "/mnt/disk2/yangsiyu/clucene/index"; +// std::string path = "/mnt/disk3/yangsiyu/clucene"; // std::vector lines; @@ -213,145 +273,157 @@ TEST_F(CustomAnalyzerTest, CustomNgramAnalyzer) { // std::cout << "lines size: " << lines.size() << std::endl; +// Settings char_replace_params; +// char_replace_params.set("char_filter_pattern", "_"); +// char_replace_params.set("char_filter_replacement", " "); + // Settings word_delimiter_params; // word_delimiter_params.set("preserve_original", "true"); // CustomAnalyzerConfig::Builder builder; // builder.with_tokenizer_config("standard", {}); -// builder.add_token_filter_config("word_delimiter", word_delimiter_params); +// 
builder.add_char_filter_config("char_replace", char_replace_params); +// // builder.add_token_filter_config("word_delimiter", word_delimiter_params); // // builder.add_token_filter_config("asciifolding", {}); -// // builder.add_token_filter_config("lowercase", {}); +// builder.add_token_filter_config("lowercase", {}); // auto custom_analyzer_config = builder.build(); // auto custom_analyzer = CustomAnalyzer::build_custom_analyzer(custom_analyzer_config); -// { -// TimeGuard t("load time"); +// auto result = tokenize1(custom_analyzer, lines[0]); +// for (const auto& token : result) { +// std::cout << token.term << " " << token.pos << std::endl; +// } -// lucene::index::IndexWriter indexwriter(path.c_str(), custom_analyzer.get(), true); -// indexwriter.setRAMBufferSizeMB(512); -// indexwriter.setMaxFieldLength(0x7FFFFFFFL); -// indexwriter.setMergeFactor(1000000000); -// indexwriter.setUseCompoundFile(false); +// // { +// // TimeGuard t("load time"); -// lucene::util::SStringReader reader; +// // lucene::index::IndexWriter indexwriter(path.c_str(), custom_analyzer.get(), true); +// // indexwriter.setRAMBufferSizeMB(512); +// // indexwriter.setMaxFieldLength(0x7FFFFFFFL); +// // indexwriter.setMergeFactor(1000000000); +// // indexwriter.setUseCompoundFile(false); -// lucene::document::Document doc; -// int32_t field_config = lucene::document::Field::STORE_NO; -// field_config |= lucene::document::Field::INDEX_NONORMS; -// field_config |= lucene::document::Field::INDEX_TOKENIZED; -// auto field_name = std::wstring(name.begin(), name.end()); -// auto* field = _CLNEW lucene::document::Field(field_name.c_str(), field_config); -// field->setOmitTermFreqAndPositions(false); -// doc.add(*field); +// // auto reader = std::make_shared>(); -// for (int32_t j = 0; j < 1; j++) { -// for (size_t k = 0; k < lines.size(); k++) { -// reader.init(lines[k].data(), lines[k].size(), false); -// auto* stream = custom_analyzer->reusableTokenStream(field->name(), &reader); -// 
field->setValue(stream); +// // lucene::document::Document doc; +// // int32_t field_config = lucene::document::Field::STORE_NO; +// // field_config |= lucene::document::Field::INDEX_NONORMS; +// // field_config |= lucene::document::Field::INDEX_TOKENIZED; +// // auto field_name = std::wstring(name.begin(), name.end()); +// // auto* field = _CLNEW lucene::document::Field(field_name.c_str(), field_config); +// // field->setOmitTermFreqAndPositions(false); +// // doc.add(*field); -// indexwriter.addDocument(&doc); -// } -// } +// // for (int32_t j = 0; j < 1; j++) { +// // for (size_t k = 0; k < lines.size(); k++) { +// // reader->init(lines[k].data(), lines[k].size(), false); +// // auto* stream = custom_analyzer->reusableTokenStream(field->name(), reader); +// // field->setValue(stream); -// std::cout << "---------------------" << std::endl; +// // indexwriter.addDocument(&doc); +// // } +// // } -// indexwriter.close(); -// } +// // std::cout << "---------------------" << std::endl; -// std::cout << "-----------" << std::endl; - -// try { -// { -// auto* dir = FSDirectory::getDirectory(path.c_str()); -// auto* reader = IndexReader::open(dir, 1024 * 1024, true); -// auto searcher = std::make_shared(reader); - -// // std::cout << "macDoc: " << reader->maxDoc() << std::endl; - -// { -// TimeGuard time("query time"); - -// { -// TQueryOptions query_options; -// doris::segment_v2::PhraseQuery query(searcher, query_options, nullptr); - -// InvertedIndexQueryInfo query_info; -// query_info.field_name = L"name"; -// { -// doris::segment_v2::TermInfo t; -// t.term = "Super_Duper"; -// t.position = 1; -// query_info.term_infos.emplace_back(std::move(t)); -// } -// { -// doris::segment_v2::TermInfo t; -// t.term = "Super"; -// t.position = 1; -// query_info.term_infos.emplace_back(std::move(t)); -// } -// { -// doris::segment_v2::TermInfo t; -// t.term = "Duper"; -// t.position = 2; -// query_info.term_infos.emplace_back(std::move(t)); -// } -// { -// 
doris::segment_v2::TermInfo t; -// t.term = "c"; -// t.position = 3; -// query_info.term_infos.emplace_back(std::move(t)); -// } -// query_info.slop = 1; -// query_info.ordered = true; -// query.add(query_info); - -// roaring::Roaring result; -// query.search(result); - -// std::cout << "phrase_query count: " << result.cardinality() << std::endl; -// } -// // { -// // TQueryOptions query_options; -// // doris::segment_v2::PhrasePrefixQuery query(searcher, query_options, nullptr); - -// // InvertedIndexQueryInfo query_info; -// // query_info.field_name = L"name"; -// // { -// // doris::segment_v2::TermInfo t; -// // t.term = "Super_Duper"; -// // t.position = 1; -// // query_info.term_infos.emplace_back(std::move(t)); -// // } -// // { -// // doris::segment_v2::TermInfo t; -// // t.term = "Super"; -// // t.position = 1; -// // query_info.term_infos.emplace_back(std::move(t)); -// // } -// // { -// // doris::segment_v2::TermInfo t; -// // t.term = "Dup"; -// // t.position = 2; -// // query_info.term_infos.emplace_back(std::move(t)); -// // } -// // query.add(query_info); - -// // roaring::Roaring result; -// // query.search(result); - -// // std::cout << "phrase_prefix_query count: " << result.cardinality() << std::endl; -// // } -// } - -// reader->close(); -// _CLLDELETE(reader); -// _CLDECDELETE(dir); -// } -// } catch (const CLuceneError& e) { -// std::cout << e.number() << ": " << e.what() << std::endl; -// } +// // indexwriter.close(); +// // } + +// // std::cout << "-----------" << std::endl; + +// // try { +// // { +// // auto* dir = FSDirectory::getDirectory(path.c_str()); +// // auto* reader = IndexReader::open(dir, 1024 * 1024, true); +// // auto searcher = std::make_shared(reader); + +// // // std::cout << "macDoc: " << reader->maxDoc() << std::endl; + +// // { +// // TimeGuard time("query time"); + +// // { +// // IndexQueryContextPtr context = std::make_shared(); + +// // TQueryOptions query_options; +// // doris::segment_v2::PhraseQuery query(searcher, 
context); + +// // InvertedIndexQueryInfo query_info; +// // query_info.field_name = L"name"; +// // { +// // doris::segment_v2::TermInfo t; +// // t.term = "Super_Duper"; +// // t.position = 1; +// // query_info.term_infos.emplace_back(std::move(t)); +// // } +// // { +// // doris::segment_v2::TermInfo t; +// // t.term = "Super"; +// // t.position = 1; +// // query_info.term_infos.emplace_back(std::move(t)); +// // } +// // { +// // doris::segment_v2::TermInfo t; +// // t.term = "Duper"; +// // t.position = 2; +// // query_info.term_infos.emplace_back(std::move(t)); +// // } +// // { +// // doris::segment_v2::TermInfo t; +// // t.term = "c"; +// // t.position = 3; +// // query_info.term_infos.emplace_back(std::move(t)); +// // } +// // query_info.slop = 1; +// // query_info.ordered = true; +// // query.add(query_info); + +// // roaring::Roaring result; +// // query.search(result); + +// // std::cout << "phrase_query count: " << result.cardinality() << std::endl; +// // } +// // // { +// // // TQueryOptions query_options; +// // // doris::segment_v2::PhrasePrefixQuery query(searcher, query_options, nullptr); + +// // // InvertedIndexQueryInfo query_info; +// // // query_info.field_name = L"name"; +// // // { +// // // doris::segment_v2::TermInfo t; +// // // t.term = "Super_Duper"; +// // // t.position = 1; +// // // query_info.term_infos.emplace_back(std::move(t)); +// // // } +// // // { +// // // doris::segment_v2::TermInfo t; +// // // t.term = "Super"; +// // // t.position = 1; +// // // query_info.term_infos.emplace_back(std::move(t)); +// // // } +// // // { +// // // doris::segment_v2::TermInfo t; +// // // t.term = "Dup"; +// // // t.position = 2; +// // // query_info.term_infos.emplace_back(std::move(t)); +// // // } +// // // query.add(query_info); + +// // // roaring::Roaring result; +// // // query.search(result); + +// // // std::cout << "phrase_prefix_query count: " << result.cardinality() << std::endl; +// // // } +// // } + +// // reader->close(); 
+// // _CLLDELETE(reader); +// // _CLDECDELETE(dir); +// // } +// // } catch (const CLuceneError& e) { +// // std::cout << e.number() << ": " << e.what() << std::endl; +// // } // } } // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_test.cpp new file mode 100644 index 00000000000000..020f0db8d11199 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_test.cpp @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter.h" + +#include + +#include + +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" + +using namespace lucene::analysis; + +namespace doris::segment_v2::inverted_index { + +class MockDorisCharFilter : public DorisCharFilter { +public: + MockDorisCharFilter(ReaderPtr reader) : DorisCharFilter(std::move(reader)) {} + ~MockDorisCharFilter() override = default; + + void initialize() override {} + + // 实现必需的虚函数 + void init(const void* _value, int32_t _length, bool copyData) override { + if (_reader) { + _reader->init(_value, _length, copyData); + } + } + + int32_t read(const void** start, int32_t min, int32_t max) override { + if (_reader) { + return _reader->read(start, min, max); + } + return -1; + } + + int32_t readCopy(void* start, int32_t off, int32_t len) override { + if (_reader) { + return _reader->readCopy(start, off, len); + } + return -1; + } +}; + +class DorisCharFilterTest : public ::testing::Test { +protected: + void SetUp() override { _mock_reader = std::make_shared>(); } + + ReaderPtr _mock_reader; +}; + +TEST_F(DorisCharFilterTest, ExceptionThrowing) { + auto filter = std::make_shared(_mock_reader); + + EXPECT_THROW(filter->position(), doris::Exception); + + EXPECT_THROW(filter->skip(10), doris::Exception); + + EXPECT_THROW(filter->size(), doris::Exception); +} + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory_test.cpp new file mode 100644 index 00000000000000..28bcc33f9ca3ea --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory_test.cpp @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h" + +#include + +namespace doris::segment_v2::inverted_index { + +ReaderPtr create_char_replace_filter(const std::string& text, const std::string& pattern, + const std::string& replacement = " ") { + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + + Settings settings; + settings.set(CHAR_REPLACE_PATTERN, pattern); + settings.set(CHAR_REPLACE_REPLACEMENT, replacement); + + CharReplaceCharFilterFactory factory; + factory.initialize(settings); + auto char_filter = factory.create(reader); + return char_filter; +} + +struct ExpectedOutput { + std::string text; + std::string expected; +}; + +class CharReplaceCharFilterFactoryTest : public ::testing::Test { +protected: + void assert_char_filter_output(const std::string& input_text, const std::string& pattern, + const std::string& expected_output, + const std::string& replacement = " ") { + auto char_filter = create_char_replace_filter(input_text, pattern, replacement); + + const void* data = nullptr; + int32_t read_len = char_filter->read(&data, 0, char_filter->size()); + ASSERT_GT(read_len, 0) << "Failed to read from char filter"; + + std::string result(static_cast(data), read_len); + 
EXPECT_EQ(result, expected_output) << "Char filter output mismatch"; + } +}; + +TEST_F(CharReplaceCharFilterFactoryTest, BasicReplacement) { + assert_char_filter_output("hello,world", ",", "hello world"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, MultipleReplacements) { + assert_char_filter_output("a,b,c,d", ",", "a b c d"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, CustomReplacement) { + assert_char_filter_output("hello,world", ",", "hello_world", "_"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, MultiplePatternChars) { + assert_char_filter_output("a,b;c:d", ",;:", "a b c d"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, NoMatch) { + assert_char_filter_output("hello world", "x", "hello world"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, EmptyInput) { + auto char_filter = create_char_replace_filter("", ","); + + const void* data = nullptr; + int32_t read_len = char_filter->read(&data, 0, char_filter->size()); + + // For empty input, read should return -1 (EOF) + EXPECT_EQ(read_len, -1) << "Empty input should return EOF"; + EXPECT_TRUE(data == nullptr || char_filter->size() == 0) + << "No data should be available for empty input"; +} + +TEST_F(CharReplaceCharFilterFactoryTest, EmptyPattern) { + try { + assert_char_filter_output("hello,world", "", "hello,world"); + } catch (const Exception& e) { + EXPECT_EQ(e.code(), ErrorCode::INVALID_ARGUMENT); + } +} + +TEST_F(CharReplaceCharFilterFactoryTest, AllCharsMatch) { + assert_char_filter_output("abc", "abc", " "); +} + +TEST_F(CharReplaceCharFilterFactoryTest, ChineseCharacters) { + assert_char_filter_output("你好,世界", ",", "你好 世界"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, SpecialCharacters) { + assert_char_filter_output("test@example.com", "@.", "test example com"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, FactoryInitialization) { + Settings settings; + settings.set(CHAR_REPLACE_PATTERN, ","); + settings.set(CHAR_REPLACE_REPLACEMENT, " "); + + CharReplaceCharFilterFactory factory; + 
EXPECT_NO_THROW(factory.initialize(settings)); +} + +TEST_F(CharReplaceCharFilterFactoryTest, FactoryInitializationMissingPattern) { + Settings settings; + // Missing pattern - should throw exception + settings.set(CHAR_REPLACE_REPLACEMENT, " "); + + CharReplaceCharFilterFactory factory; + EXPECT_NO_THROW(factory.initialize(settings)); +} + +TEST_F(CharReplaceCharFilterFactoryTest, FactoryInitializationEmptyPattern) { + Settings settings; + settings.set(CHAR_REPLACE_PATTERN, ""); + settings.set(CHAR_REPLACE_REPLACEMENT, " "); + + CharReplaceCharFilterFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(CharReplaceCharFilterFactoryTest, FactoryCreateFilter) { + Settings settings; + settings.set(CHAR_REPLACE_PATTERN, ","); + settings.set(CHAR_REPLACE_REPLACEMENT, " "); + + CharReplaceCharFilterFactory factory; + factory.initialize(settings); + + ReaderPtr input_reader = std::make_shared>(); + input_reader->init("test,data", 9, false); + + auto char_filter = factory.create(input_reader); + ASSERT_NE(char_filter, nullptr); + + const void* data = nullptr; + int32_t read_len = char_filter->read(&data, 0, char_filter->size()); + ASSERT_GT(read_len, 0); + + std::string result(static_cast(data), read_len); + EXPECT_EQ(result, "test data"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, DefaultReplacement) { + Settings settings; + settings.set(CHAR_REPLACE_PATTERN, ","); + // No replacement specified - should use default " " + + CharReplaceCharFilterFactory factory; + factory.initialize(settings); + + ReaderPtr input_reader = std::make_shared>(); + input_reader->init("a,b,c", 5, false); + + auto char_filter = factory.create(input_reader); + + const void* data = nullptr; + int32_t read_len = char_filter->read(&data, 0, char_filter->size()); + ASSERT_GT(read_len, 0); + + std::string result(static_cast(data), read_len); + EXPECT_EQ(result, "a b c"); +} + +TEST_F(CharReplaceCharFilterFactoryTest, EdgeCases) { + // Test with whitespace only + 
assert_char_filter_output(" ", ",", " "); + + // Test with only pattern characters + assert_char_filter_output(",,,", ",", " "); + + // Test with mixed content + assert_char_filter_output("a,,b,", ",", "a b "); +} + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp index 62c04519ee1d0c..5e0e783ca8e4f1 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp @@ -89,7 +89,7 @@ class BooleanQueryTest : public testing::Test { indexwriter->setMergeFactor(1000000000); indexwriter->setUseCompoundFile(false); - auto* char_string_reader = _CLNEW lucene::util::SStringReader; + auto char_string_reader = std::make_shared>(); auto* doc = _CLNEW lucene::document::Document(); int32_t field_config = lucene::document::Field::STORE_NO; @@ -114,7 +114,6 @@ class BooleanQueryTest : public testing::Test { _CLLDELETE(indexwriter); _CLLDELETE(doc); - _CLLDELETE(char_string_reader); } }; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/setting_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/setting_test.cpp index 84dd8d11a42841..e616ae0362040a 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/setting_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/setting_test.cpp @@ -111,21 +111,19 @@ TEST_F(SettingsTest, GetStringReturnsCorrectValues) { TEST_F(SettingsTest, GetEntryListReturnsCorrectValues) { Settings settings(testMap); - auto emptyList = settings.get_entry_list("list_empty"); - EXPECT_TRUE(emptyList.empty()); + // auto emptyList = settings.get_entry_list("list_empty"); + // EXPECT_TRUE(emptyList.empty()); - auto singleList = settings.get_entry_list("list_single"); - ASSERT_EQ(singleList.size(), 1); - EXPECT_EQ(singleList[0], "item1"); 
+ // auto singleList = settings.get_entry_list("list_single"); + // ASSERT_EQ(singleList.size(), 1); + // EXPECT_EQ(singleList[0], "item1"); auto multiList = settings.get_entry_list("list_multiple"); - ASSERT_EQ(multiList.size(), 3); - EXPECT_EQ(multiList[0], "item1"); - EXPECT_EQ(multiList[1], "item2"); - EXPECT_EQ(multiList[2], "item3"); + ASSERT_EQ(multiList.size(), 1); + EXPECT_EQ(multiList[0], "item1][item2][item3"); - auto nonExistent = settings.get_entry_list("non_existent"); - EXPECT_TRUE(nonExistent.empty()); + // auto nonExistent = settings.get_entry_list("non_existent"); + // EXPECT_TRUE(nonExistent.empty()); } TEST_F(SettingsTest, GetWordSetReturnsCorrectValues) { @@ -181,4 +179,79 @@ TEST_F(SettingsTest, MoveConstructorWorks) { EXPECT_TRUE(settings1.empty()); } +TEST_F(SettingsTest, GetEntryListWithBracketsInside) { + Settings settings; + + settings.set("list_with_brackets_inside", "[item[with]brackets]"); + auto singleWithBrackets = settings.get_entry_list("list_with_brackets_inside"); + ASSERT_EQ(singleWithBrackets.size(), 1); + EXPECT_EQ(singleWithBrackets[0], "item[with]brackets"); + + settings.set("list_multiple_with_brackets", + "[item1[with]brackets][item2[also]has[brackets]][item3]"); + auto multiWithBrackets = settings.get_entry_list("list_multiple_with_brackets"); + ASSERT_EQ(multiWithBrackets.size(), 1); + EXPECT_EQ(multiWithBrackets[0], "item1[with]brackets][item2[also]has[brackets]][item3"); + + settings.set("list_nested_brackets", "[[[nested]]][[double]][single]"); + auto nestedBrackets = settings.get_entry_list("list_nested_brackets"); + ASSERT_EQ(nestedBrackets.size(), 1); + EXPECT_EQ(nestedBrackets[0], "[[nested]]][[double]][single"); + + settings.set("list_empty_brackets_inside", "[item[]with][empty][]brackets"); + EXPECT_THROW(settings.get_entry_list("list_empty_brackets_inside"), Exception); +} + +TEST_F(SettingsTest, GetEntryListWithCommaSeparators) { + Settings settings; + + settings.set("list_comma_separated", 
"[item1],[item2],[item3]"); + auto commaList = settings.get_entry_list("list_comma_separated"); + ASSERT_EQ(commaList.size(), 3); + EXPECT_EQ(commaList[0], "item1"); + EXPECT_EQ(commaList[1], "item2"); + EXPECT_EQ(commaList[2], "item3"); + + settings.set("list_with_empty_items", "[item1],[],[item3]"); + auto listWithEmpty = settings.get_entry_list("list_with_empty_items"); + ASSERT_EQ(listWithEmpty.size(), 2); + EXPECT_EQ(listWithEmpty[0], "item1"); + EXPECT_EQ(listWithEmpty[1], "item3"); + + settings.set("list_comma_separated1", "[item1], [item2], [item3]"); + auto commaList1 = settings.get_entry_list("list_comma_separated1"); + ASSERT_EQ(commaList1.size(), 3); + EXPECT_EQ(commaList1[0], "item1"); + EXPECT_EQ(commaList1[1], "item2"); + EXPECT_EQ(commaList1[2], "item3"); + + settings.set("list_with_empty_items1", ""); + auto listWithEmpty1 = settings.get_entry_list("list_with_empty_items1"); + ASSERT_EQ(listWithEmpty1.size(), 0); +} + +TEST_F(SettingsTest, GetEntryListSpecExamples) { + Settings settings; + + settings.set("ex_empty", "[]"); + auto v0 = settings.get_entry_list("ex_empty"); + EXPECT_TRUE(v0.empty()); + + settings.set("ex_nested_balanced", "[[123]]"); + auto v1 = settings.get_entry_list("ex_nested_balanced"); + ASSERT_EQ(v1.size(), 1); + EXPECT_EQ(v1[0], "[123]"); + + settings.set("ex_nested_unbalanced_inside", "[[123[]"); + auto v2 = settings.get_entry_list("ex_nested_unbalanced_inside"); + ASSERT_EQ(v2.size(), 1); + EXPECT_EQ(v2[0], "[123["); + + settings.set("ex_no_comma_multiple", "[123][123"); + EXPECT_THROW(settings.get_entry_list("ex_no_comma_multiple"), Exception); + + settings.set("ex_comma_missing_closing", "[123],[123"); + EXPECT_THROW(settings.get_entry_list("ex_comma_missing_closing"), Exception); +} + } // namespace doris::segment_v2::inverted_index diff --git a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory_test.cpp 
b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory_test.cpp index 8187a8bf64cc60..ce659971367ffc 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory_test.cpp @@ -24,14 +24,14 @@ namespace doris::segment_v2::inverted_index { TokenStreamPtr create_filter(const std::string& text, Settings token_filter_settings) { - static lucene::util::SStringReader reader; - reader.init(text.data(), text.size(), false); + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); Settings settings; KeywordTokenizerFactory tokenizer_factory; tokenizer_factory.initialize(settings); auto tokenizer = tokenizer_factory.create(); - tokenizer->set_reader(&reader); + tokenizer->set_reader(reader); ASCIIFoldingFilterFactory token_filter_factory; token_filter_factory.initialize(token_filter_settings); diff --git a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory_test.cpp index 588dc0381a1fbf..010c3b053b46d4 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory_test.cpp @@ -25,13 +25,13 @@ namespace doris::segment_v2::inverted_index { TokenStreamPtr create_lowercase_filter(const std::string& text, Settings settings = Settings()) { - static lucene::util::SStringReader reader; - reader.init(text.data(), text.size(), false); + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); KeywordTokenizerFactory tokenizer_factory; tokenizer_factory.initialize(Settings()); auto tokenizer = tokenizer_factory.create(); - tokenizer->set_reader(&reader); + tokenizer->set_reader(reader); 
LowerCaseFilterFactory filter_factory; filter_factory.initialize(settings); diff --git a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_test.cpp index 4d8fa8545daaab..fbbecb8c021f3d 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_test.cpp @@ -31,14 +31,14 @@ namespace doris::segment_v2::inverted_index { TokenStreamPtr create_filter(const std::string& text, int32_t flags, const std::unordered_set& prot_words = {}) { - static lucene::util::SStringReader reader; - reader.init(text.data(), text.size(), false); + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); Settings settings; KeywordTokenizerFactory tokenizer_factory; tokenizer_factory.initialize(settings); auto tokenizer = tokenizer_factory.create(); - tokenizer->set_reader(&reader); + tokenizer->set_reader(reader); auto token_filter = std::make_shared( tokenizer, WordDelimiterIterator::DEFAULT_WORD_DELIM_TABLE, flags, prot_words); diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/basic_tokenizer_factory_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/basic_tokenizer_factory_test.cpp new file mode 100644 index 00000000000000..49386133377c28 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/basic_tokenizer_factory_test.cpp @@ -0,0 +1,189 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h" + +#include + +namespace doris::segment_v2::inverted_index { + +TokenStreamPtr create_basic_tokenizer(const std::string& text, Settings settings = Settings()) { + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + + BasicTokenizerFactory factory; + factory.initialize(settings); + auto tokenizer = factory.create(); + tokenizer->set_reader(reader); + tokenizer->reset(); + return tokenizer; +} + +struct ExpectedToken { + std::string term; + int pos_inc; +}; + +class BasicTokenizerFactoryTest : public ::testing::Test { +protected: + void assert_tokenizer_output(const std::string& text, + const std::vector& expected, + BasicTokenizerMode mode = BasicTokenizerMode::L1) { + Settings settings; + settings.set("mode", std::to_string(static_cast(mode))); + auto tokenizer = create_basic_tokenizer(text, settings); + + Token t; + size_t i = 0; + while (tokenizer->next(&t)) { + ASSERT_LT(i, expected.size()) << "More tokens produced than expected"; + std::string term(t.termBuffer(), t.termLength()); + EXPECT_EQ(term, expected[i].term) << "Term mismatch at index " << i; + EXPECT_EQ(t.getPositionIncrement(), expected[i].pos_inc) + << "Pos increment mismatch at index " << i; + ++i; + } + EXPECT_EQ(i, expected.size()) << "Number of tokens mismatch"; + } +}; + +TEST_F(BasicTokenizerFactoryTest, BasicTokenizationL1) { + // Test L1 mode: English + numbers + Chinese tokenization + assert_tokenizer_output("Hello world!", {{"Hello", 1}, 
{"world", 1}}, BasicTokenizerMode::L1); +} + +TEST_F(BasicTokenizerFactoryTest, ChineseTokenizationL1) { + // Test L1 mode with Chinese characters + assert_tokenizer_output("你好世界", {{"你", 1}, {"好", 1}, {"世", 1}, {"界", 1}}, + BasicTokenizerMode::L1); +} + +TEST_F(BasicTokenizerFactoryTest, MixedLanguageL1) { + // Test L1 mode with mixed English and Chinese + assert_tokenizer_output( + "Hello你好World世界", + {{"Hello", 1}, {"你", 1}, {"好", 1}, {"World", 1}, {"世", 1}, {"界", 1}}, + BasicTokenizerMode::L1); +} + +TEST_F(BasicTokenizerFactoryTest, NumbersAndPunctuationL1) { + // Test L1 mode with numbers and punctuation + assert_tokenizer_output("Version 2.0 版本", + {{"Version", 1}, {"2", 1}, {"0", 1}, {"版", 1}, {"本", 1}}, + BasicTokenizerMode::L1); +} + +TEST_F(BasicTokenizerFactoryTest, BasicTokenizationL2) { + // Test L2 mode: L1 + all Unicode characters tokenized + assert_tokenizer_output("Hello world!", {{"Hello", 1}, {"world", 1}, {"!", 1}}, + BasicTokenizerMode::L2); +} + +TEST_F(BasicTokenizerFactoryTest, UnicodeTokenizationL2) { + // Test L2 mode with various Unicode characters + assert_tokenizer_output("Hello��世界", {{"Hello", 1}, {"�", 1}, {"�", 1}, {"世", 1}, {"界", 1}}, + BasicTokenizerMode::L2); +} + +TEST_F(BasicTokenizerFactoryTest, WhitespaceHandlingL2) { + // Test L2 mode skips whitespace + assert_tokenizer_output("Hello world", {{"Hello", 1}, {"world", 1}}, BasicTokenizerMode::L2); +} + +TEST_F(BasicTokenizerFactoryTest, FactoryInitialization) { + Settings settings; + settings.set("mode", "1"); + + BasicTokenizerFactory factory; + factory.initialize(settings); + + auto tokenizer = factory.create(); + auto basic_tokenizer = std::dynamic_pointer_cast(tokenizer); + ASSERT_NE(basic_tokenizer, nullptr); +} + +TEST_F(BasicTokenizerFactoryTest, FactoryInitializationL2) { + Settings settings; + settings.set("mode", "2"); + + BasicTokenizerFactory factory; + factory.initialize(settings); + + auto tokenizer = factory.create(); + auto basic_tokenizer = 
std::dynamic_pointer_cast(tokenizer); + ASSERT_NE(basic_tokenizer, nullptr); +} + +TEST_F(BasicTokenizerFactoryTest, DefaultMode) { + // Test default mode (L1) when no mode is specified + Settings settings; + BasicTokenizerFactory factory; + factory.initialize(settings); + + auto tokenizer = factory.create(); + auto basic_tokenizer = std::dynamic_pointer_cast(tokenizer); + ASSERT_NE(basic_tokenizer, nullptr); +} + +TEST_F(BasicTokenizerFactoryTest, InvalidMode) { + Settings settings; + settings.set("mode", "3"); // Invalid mode + + BasicTokenizerFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(BasicTokenizerFactoryTest, InvalidModeZero) { + Settings settings; + settings.set("mode", "0"); // Invalid mode + + BasicTokenizerFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(BasicTokenizerFactoryTest, InvalidModeNegative) { + Settings settings; + settings.set("mode", "-1"); // Invalid mode + + BasicTokenizerFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(BasicTokenizerFactoryTest, EdgeCases) { + // Test empty string + assert_tokenizer_output("", {}, BasicTokenizerMode::L1); + + // Test whitespace only + assert_tokenizer_output(" ", {}, BasicTokenizerMode::L1); + + // Test punctuation only (L1 mode should skip non-Chinese punctuation) + assert_tokenizer_output("...", {}, BasicTokenizerMode::L1); + + // Test punctuation only (L2 mode should tokenize punctuation) + assert_tokenizer_output("...", {{".", 1}, {".", 1}, {".", 1}}, BasicTokenizerMode::L2); +} + +TEST_F(BasicTokenizerFactoryTest, LongText) { + // Test with longer text + std::string long_text = "This is a long text with multiple words and 中文 characters"; + std::vector expected = { + {"This", 1}, {"is", 1}, {"a", 1}, {"long", 1}, {"text", 1}, {"with", 1}, + {"multiple", 1}, {"words", 1}, {"and", 1}, {"中", 1}, {"文", 1}, {"characters", 1}}; + assert_tokenizer_output(long_text, expected, 
BasicTokenizerMode::L1); +} + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp index 7d2782098a1a5e..91d978f83fa330 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp @@ -29,9 +29,9 @@ class CharGroupTokenizerTest : public ::testing::Test { std::vector tokens; auto tokenizer = factory.create(); { - lucene::util::SStringReader reader; - reader.init(text.data(), text.size(), false); - tokenizer->set_reader(&reader); + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + tokenizer->set_reader(reader); tokenizer->reset(); Token t; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/edge_ngram_tokenizer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/edge_ngram_tokenizer_test.cpp index 0d9f1802952e6c..00b22b484493f2 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/edge_ngram_tokenizer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/edge_ngram_tokenizer_test.cpp @@ -33,9 +33,9 @@ std::vector tokenize(EdgeNGramTokenizerFactory& factory, const std: std::vector tokens; auto tokenizer = factory.create(); { - lucene::util::SStringReader reader; - reader.init(text.data(), text.size(), false); - tokenizer->set_reader(&reader); + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + tokenizer->set_reader(reader); tokenizer->reset(); Token t; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/icu_tokenizer_factory_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/icu_tokenizer_factory_test.cpp new file mode 100644 
index 00000000000000..fb736ecaa76628 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/icu_tokenizer_factory_test.cpp @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_factory.h" + +#include + +namespace doris::segment_v2::inverted_index { + +TokenStreamPtr create_icu_tokenizer(const std::string& text, Settings settings = Settings()) { + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + + ICUTokenizerFactory factory; + factory.initialize(settings); + auto tokenizer = factory.create(); + tokenizer->set_reader(reader); + tokenizer->reset(); + return tokenizer; +} + +struct ExpectedToken { + std::string term; + int pos_inc; +}; + +class ICUTokenizerFactoryTest : public ::testing::Test { +protected: + void SetUp() override { + original_dict_path_ = config::inverted_index_dict_path; + + constexpr static uint32_t MAX_PATH_LEN = 1024; + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + std::string current_dir = std::string(buffer); + + config::inverted_index_dict_path = current_dir + "/be/dict"; + } + + void TearDown() override { 
config::inverted_index_dict_path = original_dict_path_; } + + void assert_tokenizer_output(const std::string& text, + const std::vector& expected) { + auto tokenizer = create_icu_tokenizer(text); + + Token t; + size_t i = 0; + while (tokenizer->next(&t)) { + ASSERT_LT(i, expected.size()) << "More tokens produced than expected"; + std::string term(t.termBuffer(), t.termLength()); + EXPECT_EQ(term, expected[i].term) << "Term mismatch at index " << i; + EXPECT_EQ(t.getPositionIncrement(), expected[i].pos_inc) + << "Pos increment mismatch at index " << i; + ++i; + } + EXPECT_EQ(i, expected.size()) << "Number of tokens mismatch"; + } + +private: + std::string original_dict_path_; +}; + +TEST_F(ICUTokenizerFactoryTest, BasicEnglishTokenization) { + // Test basic English tokenization + assert_tokenizer_output("Hello world!", {{"Hello", 1}, {"world", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, EnglishWithPunctuation) { + // Test English with punctuation + assert_tokenizer_output( + "This is a test, with punctuation!", + {{"This", 1}, {"is", 1}, {"a", 1}, {"test", 1}, {"with", 1}, {"punctuation", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, EnglishWithNumbers) { + // Test English with numbers + assert_tokenizer_output( + "Version 2.0 was released in 2023", + {{"Version", 1}, {"2.0", 1}, {"was", 1}, {"released", 1}, {"in", 1}, {"2023", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, ChineseTokenization) { + // Test Chinese character tokenization + assert_tokenizer_output("你好世界", {{"你好", 1}, {"世界", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, MixedLanguage) { + // Test mixed English and Chinese + assert_tokenizer_output("Hello你好World世界", + {{"Hello", 1}, {"你好", 1}, {"World", 1}, {"世界", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, JapaneseTokenization) { + // Test Japanese tokenization (Hiragana, Katakana, Kanji) + assert_tokenizer_output("こんにちは世界", {{"こんにちは", 1}, {"世界", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, KoreanTokenization) { + // Test Korean tokenization + 
assert_tokenizer_output("안녕하세요", {{"안녕하세요", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, ArabicTokenization) { + // Test Arabic tokenization + assert_tokenizer_output("مرحبا بالعالم", {{"مرحبا", 1}, {"بالعالم", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, CyrillicTokenization) { + // Test Cyrillic (Russian) tokenization + assert_tokenizer_output("Привет мир", {{"Привет", 1}, {"мир", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, EmojiAndSymbols) { + // Test emoji and special symbols + assert_tokenizer_output("Hello 😀 world 🌍", + {{"Hello", 1}, {"😀", 1}, {"world", 1}, {"🌍", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, WhitespaceHandling) { + // Test whitespace handling + assert_tokenizer_output("Hello world\t\n", {{"Hello", 1}, {"world", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, FactoryInitialization) { + Settings settings; + ICUTokenizerFactory factory; + factory.initialize(settings); + + auto tokenizer = factory.create(); + auto icu_tokenizer = std::dynamic_pointer_cast(tokenizer); + ASSERT_NE(icu_tokenizer, nullptr); +} + +TEST_F(ICUTokenizerFactoryTest, FactoryCreateMultipleInstances) { + ICUTokenizerFactory factory; + factory.initialize(Settings {}); + + auto tokenizer1 = factory.create(); + auto tokenizer2 = factory.create(); + + ASSERT_NE(tokenizer1, tokenizer2); + ASSERT_NE(tokenizer1, nullptr); + ASSERT_NE(tokenizer2, nullptr); +} + +TEST_F(ICUTokenizerFactoryTest, EdgeCases) { + // Test empty string + assert_tokenizer_output("", {}); + + // Test whitespace only + assert_tokenizer_output(" \t\n", {}); + + // Test single character + assert_tokenizer_output("a", {{"a", 1}}); + + // Test single Chinese character + assert_tokenizer_output("中", {{"中", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, LongText) { + // Test with longer text + std::string long_text = + "This is a long text with multiple words and 中文 characters and 日本語 text"; + std::vector expected = { + {"This", 1}, {"is", 1}, {"a", 1}, {"long", 1}, {"text", 1}, + {"with", 1}, {"multiple", 1}, {"words", 1}, 
{"and", 1}, {"中文", 1}, + {"characters", 1}, {"and", 1}, {"日本語", 1}, {"text", 1}}; + assert_tokenizer_output(long_text, expected); +} + +TEST_F(ICUTokenizerFactoryTest, SpecialCharacters) { + // Test special characters and symbols + assert_tokenizer_output("Price: $100.50 (USD)", {{"Price", 1}, {"100.50", 1}, {"USD", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, URLAndEmail) { + // Test URL and email handling + assert_tokenizer_output("Visit https://example.com or email test@example.com", + {{"Visit", 1}, + {"https", 1}, + {"example.com", 1}, + {"or", 1}, + {"email", 1}, + {"test", 1}, + {"example.com", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, CaseSensitivity) { + // Test case sensitivity (ICU tokenizer should preserve case by default) + assert_tokenizer_output("Hello WORLD Test", {{"Hello", 1}, {"WORLD", 1}, {"Test", 1}}); +} + +TEST_F(ICUTokenizerFactoryTest, UnicodeNormalization) { + // Test Unicode normalization + assert_tokenizer_output("café naïve résumé", {{"café", 1}, {"naïve", 1}, {"résumé", 1}}); +} + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/keyword_analyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/keyword_analyzer_test.cpp index 1a8283fd1ac49b..0b334b1492d51f 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/keyword_analyzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/keyword_analyzer_test.cpp @@ -32,9 +32,9 @@ std::vector tokenize(KeywordTokenizerFactory& factory, const std::s std::vector tokens; auto tokenizer = factory.create(); { - lucene::util::SStringReader reader; - reader.init(text.data(), text.size(), false); - tokenizer->set_reader(&reader); + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + tokenizer->set_reader(reader); tokenizer->reset(); Token t; diff --git 
a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/ngram_tokenizer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/ngram_tokenizer_test.cpp index 9b55aae48aa7c0..2249a45d4d37fd 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/ngram_tokenizer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/ngram_tokenizer_test.cpp @@ -35,9 +35,9 @@ std::vector tokenize(NGramTokenizerFactory& factory, const std::str std::vector tokens; auto tokenizer = factory.create(); { - lucene::util::SStringReader reader; - reader.init(text.data(), text.size(), false); - tokenizer->set_reader(&reader); + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); + tokenizer->set_reader(reader); tokenizer->reset(); Token t; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/standard_tokenizer_factory_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/standard_tokenizer_factory_test.cpp index 5cbc56890961a3..631e95745bbab2 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/standard_tokenizer_factory_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/standard_tokenizer_factory_test.cpp @@ -22,13 +22,13 @@ namespace doris::segment_v2::inverted_index { TokenStreamPtr create_standard_tokenizer(const std::string& text, Settings settings = Settings()) { - static lucene::util::SStringReader reader; - reader.init(text.data(), text.size(), false); + ReaderPtr reader = std::make_shared>(); + reader->init(text.data(), text.size(), false); StandardTokenizerFactory factory; factory.initialize(settings); auto tokenizer = factory.create(); - tokenizer->set_reader(&reader); + tokenizer->set_reader(reader); tokenizer->reset(); return tokenizer; } diff --git a/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp new file mode 100644 index 
00000000000000..38b17251b8729f --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index/util/reader.h" + +#include + +#include +#include + +#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h" +#include "olap/rowset/segment_v2/inverted_index_common.h" +#include "util/slice.h" + +using namespace lucene::analysis; +using namespace doris::segment_v2::inverted_index; + +namespace doris::segment_v2 { + +TEST(ReaderTest, ArrayFieldTokenStreamWorkflow) { + CharFilterMap char_filter_map; + char_filter_map["char_filter_type"] = "char_replace"; + char_filter_map["char_filter_pattern"] = ","; + char_filter_map["char_filter_replacement"] = " "; + + // Create the InvertedIndexCtx correctly + auto inverted_index_ctx = std::make_shared(); + inverted_index_ctx->custom_analyzer = ""; + inverted_index_ctx->parser_type = InvertedIndexParserType::PARSER_STANDARD; + inverted_index_ctx->parser_mode = "standard"; + inverted_index_ctx->support_phrase = "yes"; + inverted_index_ctx->char_filter_map = char_filter_map; + inverted_index_ctx->lower_case = "true"; + inverted_index_ctx->stop_words = ""; 
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(inverted_index_ctx.get()); + ASSERT_NE(analyzer, nullptr); + + std::string test_data = "hello,world,test"; + Slice slice(test_data); + + std::vector keep_readers; + auto dir = std::make_shared(); + { + lucene::index::IndexWriter indexwriter(dir.get(), analyzer.get(), true); + indexwriter.setRAMBufferSizeMB(512); + indexwriter.setMaxFieldLength(0x7FFFFFFFL); + indexwriter.setMergeFactor(1000000000); + indexwriter.setUseCompoundFile(false); + lucene::document::Document doc; + std::unique_ptr new_field; + for (int i = 0; i < 2; i++) { + int32_t field_config = lucene::document::Field::STORE_NO; + field_config |= lucene::document::Field::INDEX_NONORMS; + field_config |= lucene::document::Field::INDEX_TOKENIZED; + auto* field = _CLNEW lucene::document::Field(L"name", field_config); + new_field.reset(field); + { + ReaderPtr char_string_reader = + InvertedIndexAnalyzer::create_reader(inverted_index_ctx->char_filter_map); + char_string_reader->init(slice.get_data(), cast_set(slice.get_size()), + false); + + auto* ts = analyzer->tokenStream(new_field->name(), char_string_reader); + ASSERT_NE(ts, nullptr); + + new_field->setValue(ts, true); + keep_readers.emplace_back(std::move(char_string_reader)); + } + doc.add(*new_field.release()); + } + indexwriter.addDocument(&doc); + indexwriter.close(); + } + dir->close(); +} + +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index 1ae0cc0e075ced..fa9f119258c2f3 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -126,6 +126,7 @@ CATALOGS: 'CATALOGS'; CHAIN: 'CHAIN'; CHAR: 'CHAR' | 'CHARACTER'; CHARSET: 'CHARSET'; +CHAR_FILTER: 'CHAR_FILTER'; CHECK: 'CHECK'; CLEAN: 'CLEAN'; CLUSTER: 'CLUSTER'; diff --git 
a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 25c0d9a00abcf9..8a485a9c053e7b 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -242,6 +242,8 @@ supportedCreateStatement name=identifier properties=propertyClause? #createIndexTokenizer | CREATE INVERTED INDEX TOKEN_FILTER (IF NOT EXISTS)? name=identifier properties=propertyClause? #createIndexTokenFilter + | CREATE INVERTED INDEX CHAR_FILTER (IF NOT EXISTS)? + name=identifier properties=propertyClause? #createIndexCharFilter ; dictionaryColumnDefs: @@ -325,6 +327,7 @@ supportedDropStatement | DROP INVERTED INDEX ANALYZER (IF EXISTS)? name=identifier #dropIndexAnalyzer | DROP INVERTED INDEX TOKENIZER (IF EXISTS)? name=identifier #dropIndexTokenizer | DROP INVERTED INDEX TOKEN_FILTER (IF EXISTS)? name=identifier #dropIndexTokenFilter + | DROP INVERTED INDEX CHAR_FILTER (IF EXISTS)? name=identifier #dropIndexCharFilter ; supportedShowStatement @@ -463,6 +466,7 @@ supportedLoadStatement | SHOW INVERTED INDEX ANALYZER #showIndexAnalyzer | SHOW INVERTED INDEX TOKENIZER #showIndexTokenizer | SHOW INVERTED INDEX TOKEN_FILTER #showIndexTokenFilter + | SHOW INVERTED INDEX CHAR_FILTER #showIndexCharFilter ; supportedKillStatement diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/BasicTokenizerValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/BasicTokenizerValidator.java new file mode 100644 index 00000000000000..ee33cc34ea42fd --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/BasicTokenizerValidator.java @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.indexpolicy; + +import org.apache.doris.common.DdlException; + +import com.google.common.collect.ImmutableSet; + +import java.util.Map; +import java.util.Set; + +public class BasicTokenizerValidator extends BasePolicyValidator { + private static final Set ALLOWED_PROPS = ImmutableSet.of("type", "mode"); + + public BasicTokenizerValidator() { + super(ALLOWED_PROPS); + } + + @Override + protected String getTypeName() { + return "basic tokenizer"; + } + + @Override + protected void validateSpecific(Map props) throws DdlException { + if (props.containsKey("mode")) { + try { + int mode = Integer.parseInt(props.get("mode")); + if (mode < 1 || mode > 2) { + throw new DdlException("Invalid mode for basic tokenizer: " + mode + + ". 
Mode must be 1 (L1: English + numbers + Chinese) " + + "or 2 (L2: L1 + all Unicode characters)"); + } + } catch (NumberFormatException e) { + throw new DdlException("mode must be a positive integer (1 or 2)"); + } + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharReplaceCharFilterValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharReplaceCharFilterValidator.java new file mode 100644 index 00000000000000..2e7fe15b2a2bea --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharReplaceCharFilterValidator.java @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.indexpolicy; + +import org.apache.doris.common.DdlException; + +import com.google.common.collect.ImmutableSet; + +import java.util.Map; +import java.util.Set; + +public class CharReplaceCharFilterValidator extends BasePolicyValidator { + private static final Set ALLOWED_PROPS = ImmutableSet.of( + "type", "pattern", "replacement"); + + public CharReplaceCharFilterValidator() { + super(ALLOWED_PROPS); + } + + @Override + protected String getTypeName() { + return "char_replace filter"; + } + + @Override + protected void validateSpecific(Map props) throws DdlException { + if (props.containsKey("pattern")) { + String pattern = props.get("pattern"); + if (pattern != null && !pattern.isEmpty()) { + for (int i = 0; i < pattern.length(); i++) { + if (pattern.charAt(i) > 255) { + throw new DdlException( + "pattern must contain only single-byte characters in [0,255]"); + } + } + } + } + if (props.containsKey("replacement")) { + String replacement = props.get("replacement"); + if (replacement == null || replacement.length() != 1) { + throw new DdlException("replacement must be exactly one byte"); + } + if (replacement.charAt(0) > 255) { + throw new DdlException("replacement must be in [0,255]"); + } + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUTokenizerValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUTokenizerValidator.java new file mode 100644 index 00000000000000..cb7254863572b6 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUTokenizerValidator.java @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.indexpolicy; + +import org.apache.doris.common.DdlException; + +import com.google.common.collect.ImmutableSet; + +import java.util.Map; +import java.util.Set; + +public class ICUTokenizerValidator extends BasePolicyValidator { + private static final Set ALLOWED_PROPS = ImmutableSet.of("type"); + + public ICUTokenizerValidator() { + super(ALLOWED_PROPS); + } + + @Override + protected String getTypeName() { + return "icu tokenizer"; + } + + @Override + protected void validateSpecific(Map props) throws DdlException { + // ICU tokenizer doesn't have additional parameters to validate + // It uses default ICU configuration + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java index 74fa1c7f8a8d01..8fd02a59292165 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java @@ -56,13 +56,17 @@ public class IndexPolicy implements Writable, GsonPostProcessable { public static final String PROP_ANALYZER = "analyzer"; public static final String PROP_TOKENIZER = "tokenizer"; public static final String PROP_TOKEN_FILTER = "token_filter"; + public static final String PROP_CHAR_FILTER = "char_filter"; public static final Set BUILTIN_TOKENIZERS = ImmutableSet.of( - "ngram", "edge_ngram", "keyword", "standard", "char_group"); + "ngram", "edge_ngram", "keyword", "standard", "char_group", "basic", "icu"); public static final Set 
BUILTIN_TOKEN_FILTERS = ImmutableSet.of( "asciifolding", "word_delimiter", "lowercase"); + public static final Set BUILTIN_CHAR_FILTERS = ImmutableSet.of( + "char_replace"); + private static final Logger LOG = LogManager.getLogger(IndexPolicy.class); @SerializedName(value = "id") diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java index b92037e953e1aa..d52351f2488aff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java @@ -109,6 +109,9 @@ public void createIndexPolicy(boolean ifNotExists, String policyName, if (IndexPolicy.BUILTIN_TOKEN_FILTERS.contains(policyName)) { throw new DdlException("Policy name '" + policyName + "' conflicts with built-in token filter name"); } + if (IndexPolicy.BUILTIN_CHAR_FILTERS.contains(policyName)) { + throw new DdlException("Policy name '" + policyName + "' conflicts with built-in char filter name"); + } IndexPolicy indexPolicy = IndexPolicy.create(policyName, type, properties); @@ -161,6 +164,9 @@ private void validatePolicyProperties(IndexPolicyTypeEnum type, Map properties) throws DdlException { for (String key : properties.keySet()) { if (!key.equals(IndexPolicy.PROP_TOKENIZER) - && !key.equals(IndexPolicy.PROP_TOKEN_FILTER)) { + && !key.equals(IndexPolicy.PROP_TOKEN_FILTER) + && !key.equals(IndexPolicy.PROP_CHAR_FILTER)) { throw new DdlException("Invalid analyzer property: '" + key + "'. 
Only '" + IndexPolicy.PROP_TOKENIZER + "' and '" + IndexPolicy.PROP_TOKEN_FILTER - + "' are allowed."); + + "' and '" + IndexPolicy.PROP_CHAR_FILTER + "' are allowed."); } } @@ -188,6 +195,13 @@ private void validateAnalyzerProperties(Map properties) throws D validatePolicyReference(filter, IndexPolicyTypeEnum.TOKEN_FILTER); } } + + String charFilters = properties.get(IndexPolicy.PROP_CHAR_FILTER); + if (charFilters != null && !charFilters.isEmpty()) { + for (String filter : charFilters.split(",\\s*")) { + validatePolicyReference(filter, IndexPolicyTypeEnum.CHAR_FILTER); + } + } } private void validatePolicyReference(String name, IndexPolicyTypeEnum expectedType) @@ -200,6 +214,10 @@ private void validatePolicyReference(String name, IndexPolicyTypeEnum expectedTy && IndexPolicy.BUILTIN_TOKEN_FILTERS.contains(name)) { return; } + if (expectedType == IndexPolicyTypeEnum.CHAR_FILTER + && IndexPolicy.BUILTIN_CHAR_FILTERS.contains(name)) { + return; + } IndexPolicy policy = getPolicyByName(name); if (policy == null) { @@ -263,6 +281,23 @@ private void validateTokenFilterProperties(Map properties) throw validator.validate(properties); } + private void validateCharFilterProperties(Map properties) throws DdlException { + String type = properties.get(IndexPolicy.PROP_TYPE); + if (type == null || type.isEmpty()) { + throw new DdlException("CHAR_FILTER must specify a 'type' property"); + } + PolicyPropertyValidator validator; + switch (type) { + case "char_replace": + validator = new CharReplaceCharFilterValidator(); + break; + default: + throw new DdlException("Unsupported char filter type: " + type + + ". 
Supported types: " + IndexPolicy.BUILTIN_CHAR_FILTERS); + } + validator.validate(properties); + } + public void dropIndexPolicy(boolean isIfExists, String indexPolicyName, IndexPolicyTypeEnum type) throws DdlException, AnalysisException { writeLock(); @@ -278,7 +313,8 @@ public void dropIndexPolicy(boolean isIfExists, String indexPolicyName, checkAnalyzerNotUsedByIndex(policyToDrop.getName()); } if (policyToDrop.getType() == IndexPolicyTypeEnum.TOKENIZER - || policyToDrop.getType() == IndexPolicyTypeEnum.TOKEN_FILTER) { + || policyToDrop.getType() == IndexPolicyTypeEnum.TOKEN_FILTER + || policyToDrop.getType() == IndexPolicyTypeEnum.CHAR_FILTER) { checkPolicyNotReferenced(policyToDrop); } long id = policyToDrop.getId(); @@ -303,7 +339,8 @@ private void checkAnalyzerNotUsedByIndex(String analyzerName) throws DdlExceptio if (properties != null && analyzerName.equals(properties.get(IndexPolicy.PROP_ANALYZER))) { throw new DdlException("the analyzer " + analyzerName + " is used by index: " - + index.getIndexName() + " in table: " + table.getName()); + + index.getIndexName() + " in table: " + + db.getFullName() + "." 
+ table.getName()); } } } @@ -335,6 +372,17 @@ private void checkPolicyNotReferenced(IndexPolicy policy) throws DdlException { } } } + } else if (policyType == IndexPolicyTypeEnum.CHAR_FILTER) { + String charFilters = properties.get(IndexPolicy.PROP_CHAR_FILTER); + if (charFilters != null && !charFilters.isEmpty()) { + for (String filter : charFilters.split(",\\s*")) { + if (policyName.equals(filter)) { + throw new DdlException("Cannot drop " + policyType + " policy '" + + policyName + "' as it is referenced by ANALYZER policy '" + + analyzerPolicy.getName() + "'"); + } + } + } } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java index 2f146e16884ff3..acda67c9b8c0a9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java @@ -23,13 +23,14 @@ * Index policy type enum. 
**/ public enum IndexPolicyTypeEnum { - ANALYZER, TOKENIZER, TOKEN_FILTER; + ANALYZER, TOKENIZER, TOKEN_FILTER, CHAR_FILTER; public TIndexPolicyType toThrift() { switch (this) { case ANALYZER: return TIndexPolicyType.ANALYZER; case TOKENIZER: return TIndexPolicyType.TOKENIZER; case TOKEN_FILTER: return TIndexPolicyType.TOKEN_FILTER; + case CHAR_FILTER: return TIndexPolicyType.CHAR_FILTER; default: throw new IllegalStateException("Unknown type: " + this); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 925d1e57c9b869..13e777867150ed 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -151,6 +151,7 @@ import org.apache.doris.nereids.DorisParser.CreateEncryptkeyContext; import org.apache.doris.nereids.DorisParser.CreateFileContext; import org.apache.doris.nereids.DorisParser.CreateIndexAnalyzerContext; +import org.apache.doris.nereids.DorisParser.CreateIndexCharFilterContext; import org.apache.doris.nereids.DorisParser.CreateIndexContext; import org.apache.doris.nereids.DorisParser.CreateIndexTokenFilterContext; import org.apache.doris.nereids.DorisParser.CreateIndexTokenizerContext; @@ -189,6 +190,7 @@ import org.apache.doris.nereids.DorisParser.DropFileContext; import org.apache.doris.nereids.DorisParser.DropFunctionContext; import org.apache.doris.nereids.DorisParser.DropIndexAnalyzerContext; +import org.apache.doris.nereids.DorisParser.DropIndexCharFilterContext; import org.apache.doris.nereids.DorisParser.DropIndexClauseContext; import org.apache.doris.nereids.DorisParser.DropIndexContext; import org.apache.doris.nereids.DorisParser.DropIndexTokenFilterContext; @@ -383,6 +385,7 @@ import org.apache.doris.nereids.DorisParser.ShowGrantsContext; import 
org.apache.doris.nereids.DorisParser.ShowGrantsForUserContext; import org.apache.doris.nereids.DorisParser.ShowIndexAnalyzerContext; +import org.apache.doris.nereids.DorisParser.ShowIndexCharFilterContext; import org.apache.doris.nereids.DorisParser.ShowIndexTokenFilterContext; import org.apache.doris.nereids.DorisParser.ShowIndexTokenizerContext; import org.apache.doris.nereids.DorisParser.ShowLastInsertContext; @@ -664,6 +667,7 @@ import org.apache.doris.nereids.trees.plans.commands.CreateFileCommand; import org.apache.doris.nereids.trees.plans.commands.CreateFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexAnalyzerCommand; +import org.apache.doris.nereids.trees.plans.commands.CreateIndexCharFilterCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexTokenizerCommand; import org.apache.doris.nereids.trees.plans.commands.CreateJobCommand; @@ -699,6 +703,7 @@ import org.apache.doris.nereids.trees.plans.commands.DropFileCommand; import org.apache.doris.nereids.trees.plans.commands.DropFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexAnalyzerCommand; +import org.apache.doris.nereids.trees.plans.commands.DropIndexCharFilterCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexTokenizerCommand; import org.apache.doris.nereids.trees.plans.commands.DropJobCommand; @@ -798,6 +803,7 @@ import org.apache.doris.nereids.trees.plans.commands.ShowFunctionsCommand; import org.apache.doris.nereids.trees.plans.commands.ShowGrantsCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexAnalyzerCommand; +import org.apache.doris.nereids.trees.plans.commands.ShowIndexCharFilterCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexCommand; import 
org.apache.doris.nereids.trees.plans.commands.ShowIndexStatsCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexTokenFilterCommand; @@ -9106,6 +9112,15 @@ public LogicalPlan visitCreateIndexTokenFilter(CreateIndexTokenFilterContext ctx return new CreateIndexTokenFilterCommand(ifNotExists, policyName, properties); } + @Override + public LogicalPlan visitCreateIndexCharFilter(CreateIndexCharFilterContext ctx) { + boolean ifNotExists = ctx.IF() != null; + String policyName = ctx.name.getText(); + Map properties = visitPropertyClause(ctx.properties); + + return new CreateIndexCharFilterCommand(ifNotExists, policyName, properties); + } + @Override public LogicalPlan visitDropIndexAnalyzer(DropIndexAnalyzerContext ctx) { String policyName = ctx.name.getText(); @@ -9130,6 +9145,14 @@ public LogicalPlan visitDropIndexTokenFilter(DropIndexTokenFilterContext ctx) { return new DropIndexTokenFilterCommand(policyName, ifExists); } + @Override + public LogicalPlan visitDropIndexCharFilter(DropIndexCharFilterContext ctx) { + String policyName = ctx.name.getText(); + boolean ifExists = ctx.IF() != null; + + return new DropIndexCharFilterCommand(policyName, ifExists); + } + @Override public LogicalPlan visitShowIndexAnalyzer(ShowIndexAnalyzerContext ctx) { return new ShowIndexAnalyzerCommand(); @@ -9145,6 +9168,11 @@ public LogicalPlan visitShowIndexTokenFilter(ShowIndexTokenFilterContext ctx) { return new ShowIndexTokenFilterCommand(); } + @Override + public LogicalPlan visitShowIndexCharFilter(ShowIndexCharFilterContext ctx) { + return new ShowIndexCharFilterCommand(); + } + @Override public AlterTableOp visitCreateOrReplaceBranchClauses(DorisParser.CreateOrReplaceBranchClausesContext ctx) { return visitCreateOrReplaceBranchClause(ctx.createOrReplaceBranchClause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java index 
f5ace7cd416c21..4274e8ca5b940c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java @@ -456,5 +456,8 @@ public enum PlanType { SHOW_INDEX_TOKENIZER_COMMAND, SHOW_INDEX_TOKEN_FILTER_COMMAND, DROP_MATERIALIZED_VIEW_COMMAND, + CREATE_INDEX_CHAR_FILTER_COMMAND, + DROP_INDEX_CHAR_FILTER_COMMAND, + SHOW_INDEX_CHAR_FILTER_COMMAND, EMPTY_COMMAND } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexCharFilterCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexCharFilterCommand.java new file mode 100644 index 00000000000000..261046d9659ab2 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexCharFilterCommand.java @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.plans.commands; + +import org.apache.doris.catalog.Env; +import org.apache.doris.common.ErrorCode; +import org.apache.doris.common.ErrorReport; +import org.apache.doris.common.FeNameFormat; +import org.apache.doris.common.UserException; +import org.apache.doris.indexpolicy.IndexPolicyTypeEnum; +import org.apache.doris.mysql.privilege.PrivPredicate; +import org.apache.doris.nereids.trees.plans.PlanType; +import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.StmtExecutor; + +import java.util.Map; + +/** + * CREATE INVERTED INDEX CHAR_FILTER [IF NOT EXISTS] policy_name PROPERTIES (key1 = value1, ...) + */ +public class CreateIndexCharFilterCommand extends Command implements ForwardWithSync { + + private final boolean ifNotExists; + private final String policyName; + private final Map properties; + + /** + * Constructor + */ + public CreateIndexCharFilterCommand(boolean ifNotExists, String policyName, Map properties) { + super(PlanType.CREATE_INDEX_CHAR_FILTER_COMMAND); + this.ifNotExists = ifNotExists; + this.policyName = policyName; + this.properties = properties; + } + + private void validate(ConnectContext ctx) throws UserException { + // check auth + if (!Env.getCurrentEnv().getAccessManager().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN)) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ADMIN"); + } + + // check name + FeNameFormat.checkIndexPolicyName(policyName); + } + + @Override + public void run(ConnectContext ctx, StmtExecutor executor) throws UserException { + validate(ctx); + + Env.getCurrentEnv().getIndexPolicyMgr().createIndexPolicy(ifNotExists, policyName, + IndexPolicyTypeEnum.CHAR_FILTER, properties); + } + + @Override + public R accept(PlanVisitor visitor, C context) { + return visitor.visitCreateIndexCharFilterCommand(this, context); + } +} diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexCharFilterCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexCharFilterCommand.java new file mode 100644 index 00000000000000..62874f54b8c0c7 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexCharFilterCommand.java @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.plans.commands; + +import org.apache.doris.catalog.Env; +import org.apache.doris.common.ErrorCode; +import org.apache.doris.common.ErrorReport; +import org.apache.doris.common.FeNameFormat; +import org.apache.doris.indexpolicy.IndexPolicyTypeEnum; +import org.apache.doris.mysql.privilege.PrivPredicate; +import org.apache.doris.nereids.trees.plans.PlanType; +import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.StmtExecutor; + +/** + * DROP INVERTED INDEX CHAR_FILTER [IF EXISTS] policy_name + **/ +public class DropIndexCharFilterCommand extends DropCommand { + private final boolean ifExists; + private final String name; + + public DropIndexCharFilterCommand(String name, boolean ifExists) { + super(PlanType.DROP_INDEX_CHAR_FILTER_COMMAND); + this.name = name; + this.ifExists = ifExists; + } + + @Override + public void doRun(ConnectContext ctx, StmtExecutor executor) throws Exception { + // check auth + if (!Env.getCurrentEnv().getAccessManager().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN)) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ADMIN"); + } + + // check name + FeNameFormat.checkIndexPolicyName(name); + + Env.getCurrentEnv().getIndexPolicyMgr().dropIndexPolicy(ifExists, name, + IndexPolicyTypeEnum.CHAR_FILTER); + } + + @Override + public <R, C> R accept(PlanVisitor<R, C> visitor, C context) { + return visitor.visitDropIndexCharFilterCommand(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexCharFilterCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexCharFilterCommand.java new file mode 100644 index 00000000000000..8ebb0adfa50b36 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexCharFilterCommand.java @@ -0,0 +1,60 @@ +// Licensed to the 
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.plans.commands; + +import org.apache.doris.catalog.Env; +import org.apache.doris.common.ErrorCode; +import org.apache.doris.common.ErrorReport; +import org.apache.doris.indexpolicy.IndexPolicy; +import org.apache.doris.indexpolicy.IndexPolicyTypeEnum; +import org.apache.doris.mysql.privilege.PrivPredicate; +import org.apache.doris.nereids.trees.plans.PlanType; +import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.ShowResultSet; +import org.apache.doris.qe.ShowResultSetMetaData; +import org.apache.doris.qe.StmtExecutor; + +/** + * SHOW INVERTED INDEX CHAR_FILTER; + **/ +public class ShowIndexCharFilterCommand extends ShowCommand { + public ShowIndexCharFilterCommand() { + super(PlanType.SHOW_INDEX_CHAR_FILTER_COMMAND); + } + + @Override + public ShowResultSet doRun(ConnectContext ctx, StmtExecutor executor) throws Exception { + // check auth + if (!Env.getCurrentEnv().getAccessManager().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN)) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ADMIN"); + } + + return 
Env.getCurrentEnv().getIndexPolicyMgr().showIndexPolicy(IndexPolicyTypeEnum.CHAR_FILTER); + } + + @Override + public <R, C> R accept(PlanVisitor<R, C> visitor, C context) { + return visitor.visitShowIndexCharFilterCommand(this, context); + } + + @Override + public ShowResultSetMetaData getMetaData() { + return IndexPolicy.INDEX_POLICY_META_DATA; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/visitor/CommandVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/visitor/CommandVisitor.java index 5f679090ffe770..63072f34efeff5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/visitor/CommandVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/visitor/CommandVisitor.java @@ -78,6 +78,7 @@ import org.apache.doris.nereids.trees.plans.commands.CreateFileCommand; import org.apache.doris.nereids.trees.plans.commands.CreateFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexAnalyzerCommand; +import org.apache.doris.nereids.trees.plans.commands.CreateIndexCharFilterCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexTokenizerCommand; import org.apache.doris.nereids.trees.plans.commands.CreateJobCommand; @@ -112,6 +113,7 @@ import org.apache.doris.nereids.trees.plans.commands.DropFileCommand; import org.apache.doris.nereids.trees.plans.commands.DropFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexAnalyzerCommand; +import org.apache.doris.nereids.trees.plans.commands.DropIndexCharFilterCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexTokenizerCommand; import org.apache.doris.nereids.trees.plans.commands.DropJobCommand; @@ -211,6 +213,7 @@ import org.apache.doris.nereids.trees.plans.commands.ShowFunctionsCommand; import 
org.apache.doris.nereids.trees.plans.commands.ShowGrantsCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexAnalyzerCommand; +import org.apache.doris.nereids.trees.plans.commands.ShowIndexCharFilterCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexStatsCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexTokenFilterCommand; @@ -1410,6 +1413,11 @@ default R visitCreateIndexAnalyzerCommand( return visitCommand(createIndexAnalyzerCommand, context); } + default R visitCreateIndexCharFilterCommand( + CreateIndexCharFilterCommand createIndexCharFilterCommand, C context) { + return visitCommand(createIndexCharFilterCommand, context); + } + default R visitCreateIndexTokenizerCommand( CreateIndexTokenizerCommand createIndexTokenizerCommand, C context) { return visitCommand(createIndexTokenizerCommand, context); @@ -1425,6 +1433,11 @@ default R visitDropIndexAnalyzerCommand( return visitCommand(dropIndexAnalyzerCommand, context); } + default R visitDropIndexCharFilterCommand( + DropIndexCharFilterCommand dropIndexCharFilterCommand, C context) { + return visitCommand(dropIndexCharFilterCommand, context); + } + default R visitShowCreateStorageVaultCommand(ShowCreateStorageVaultCommand command, C context) { return visitCommand(command, context); } @@ -1444,6 +1457,11 @@ default R visitShowIndexAnalyzerCommand( return visitCommand(showIndexAnalyzerCommand, context); } + default R visitShowIndexCharFilterCommand( + ShowIndexCharFilterCommand showIndexCharFilterCommand, C context) { + return visitCommand(showIndexCharFilterCommand, context); + } + default R visitShowIndexTokenizerCommand( ShowIndexTokenizerCommand showIndexTokenizerCommand, C context) { return visitCommand(showIndexTokenizerCommand, context); diff --git a/gensrc/thrift/AgentService.thrift b/gensrc/thrift/AgentService.thrift index a23bf8e81faa3c..bed44748a66312 100644 --- 
a/gensrc/thrift/AgentService.thrift +++ b/gensrc/thrift/AgentService.thrift @@ -138,7 +138,8 @@ struct TPushStoragePolicyReq { enum TIndexPolicyType { ANALYZER, TOKENIZER, - TOKEN_FILTER + TOKEN_FILTER, + CHAR_FILTER } struct TIndexPolicy { diff --git a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out index 4c22ab33fd210b..687807afbfdc8b 100644 --- a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out +++ b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out @@ -29,6 +29,12 @@ -- !tokenize_sql -- [{\n "token": "1080º"\n }, {\n "token": "avalanche"\n }] +-- !tokenize_sql -- +[{\n "token": "get"\n }, {\n "token": "images"\n }, {\n "token": "hm"\n }, {\n "token": "bg"\n }, {\n "token": "jpg"\n }, {\n "token": "http"\n }, {\n "token": "1"\n }, {\n "token": "0"\n }] + +-- !tokenize_sql -- +[{\n "token": "让"\n }, {\n "token": "我们"\n }, {\n "token": "说"\n }, {\n "token": "hello"\n }, {\n "token": "そして"\n }, {\n "token": "世界"\n }, {\n "token": "と"\n }, {\n "token": "つ"\n }, {\n "token": "な"\n }, {\n "token": "が"\n }, {\n "token": "ろう"\n }] + -- !sql -- 1 abcDEF diff --git a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer1.out b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer1.out index 124a6e37ed1b34..d976dd5b18644f 100644 --- a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer1.out +++ b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer1.out @@ -6,3 +6,9 @@ 1 A two-hour programme which included many forms of [[jazz]] from classic to Latin as well as a mix of jazz from the younger players of the day. 2 with off-peak shows introducing more commercial breaks into their output, before the concept was dropped altogether in mid-2006. 
+-- !sql -- +1 GET /images/hm_bg.jpg HTTP/1.0 + +-- !sql -- +1 GET /images/hm_bg.jpg HTTP/1.0 + diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy index 9c2315b27eb326..14ee4c6819c143 100644 --- a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy +++ b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy @@ -63,8 +63,26 @@ suite("test_custom_analyzer", "p0") { CREATE INVERTED INDEX ANALYZER IF NOT EXISTS keyword_lowercase PROPERTIES ( - "tokenizer" = "keyword", - "token_filter" = "asciifolding, lowercase" + "tokenizer" = "keyword", + "token_filter" = "asciifolding, lowercase" + ); + """ + + sql """ + CREATE INVERTED INDEX ANALYZER IF NOT EXISTS basic_analyzer + PROPERTIES + ( + "tokenizer" = "basic", + "token_filter" = "lowercase" + ); + """ + + sql """ + CREATE INVERTED INDEX ANALYZER IF NOT EXISTS icu_analyzer + PROPERTIES + ( + "tokenizer" = "icu", + "token_filter" = "lowercase" ); """ @@ -80,6 +98,8 @@ suite("test_custom_analyzer", "p0") { qt_tokenize_sql """ select tokenize("β-carbon nitride", '"analyzer"="lowercase_delimited"'); """ qt_tokenize_sql """ select tokenize("ǁŨǁe language", '"analyzer"="lowercase_delimited"'); """ qt_tokenize_sql """ select tokenize("1080º Avalanche", '"analyzer"="lowercase_delimited"'); """ + qt_tokenize_sql """ select tokenize("GET /images/hm_bg.jpg HTTP/1.0", '"analyzer"="basic_analyzer"'); """ + qt_tokenize_sql """ select tokenize("让我们说「Hello」そして世界とつながろう!", '"analyzer"="icu_analyzer"'); """ sql "DROP TABLE IF EXISTS ${indexTbName1}" sql """ @@ -139,8 +159,6 @@ suite("test_custom_analyzer", "p0") { } } - - try { sql "DROP TABLE IF EXISTS test_custom_analyzer_3" sql """ diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer1.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer1.groovy index 
1bede117f4d9c5..665cda3b34660f 100644 --- a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer1.groovy +++ b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer1.groovy @@ -18,8 +18,6 @@ import java.sql.SQLException suite("test_custom_analyzer1", "p0") { - def indexTbName1 = "test_custom_analyzer1" - sql """ CREATE INVERTED INDEX TOKEN_FILTER IF NOT EXISTS word_splitter_all PROPERTIES @@ -34,22 +32,64 @@ suite("test_custom_analyzer1", "p0") { """ sql """ - CREATE INVERTED INDEX ANALYZER IF NOT EXISTS custom_standard_analyzer + CREATE INVERTED INDEX ANALYZER IF NOT EXISTS custom_standard_analyzer1 + PROPERTIES + ( + "tokenizer" = "standard", + "token_filter" = "asciifolding, word_splitter_all, lowercase" + ); + """ + + sql """ + CREATE INVERTED INDEX CHAR_FILTER IF NOT EXISTS char_replace_char_filter1 + PROPERTIES + ( + "type" = "char_replace", + "pattern" = "_" + ); + """ + + sql """ + CREATE INVERTED INDEX CHAR_FILTER IF NOT EXISTS char_replace_char_filter2 + PROPERTIES + ( + "type" = "char_replace", + "pattern" = "." 
+ ); + """ + + sql """ + CREATE INVERTED INDEX ANALYZER IF NOT EXISTS custom_standard_analyzer2 PROPERTIES ( - "tokenizer" = "standard", - "token_filter" = "asciifolding, word_splitter_all, lowercase" + "tokenizer" = "standard", + "char_filter" = "char_replace_char_filter1, char_replace_char_filter2", + "token_filter" = "lowercase" ); """ sql """ select sleep(10) """ - sql "DROP TABLE IF EXISTS ${indexTbName1}" + sql "DROP TABLE IF EXISTS test_custom_analyzer1" + sql """ + CREATE TABLE test_custom_analyzer1 ( + `a` bigint NOT NULL AUTO_INCREMENT(1), + `ch` text NULL, + INDEX idx_ch (`ch`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "custom_standard_analyzer1") + ) ENGINE=OLAP + DUPLICATE KEY(`a`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "DROP TABLE IF EXISTS test_custom_analyzer2" sql """ - CREATE TABLE ${indexTbName1} ( + CREATE TABLE test_custom_analyzer2 ( `a` bigint NOT NULL AUTO_INCREMENT(1), `ch` text NULL, - INDEX idx_ch (`ch`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "custom_standard_analyzer") + INDEX idx_ch (`ch`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "custom_standard_analyzer2") ) ENGINE=OLAP DUPLICATE KEY(`a`) DISTRIBUTED BY RANDOM BUCKETS 1 @@ -58,15 +98,20 @@ suite("test_custom_analyzer1", "p0") { ); """ - sql """ insert into ${indexTbName1} values(1, "A two-hour programme which included many forms of [[jazz]] from classic to Latin as well as a mix of jazz from the younger players of the day."); """ - sql """ insert into ${indexTbName1} values(2, " with off-peak shows introducing more commercial breaks into their output, before the concept was dropped altogether in mid-2006."); """ + sql """ insert into test_custom_analyzer1 values(1, "A two-hour programme which included many forms of [[jazz]] from classic to Latin as well as a mix of jazz from the younger players of the day."); """ + sql """ 
insert into test_custom_analyzer1 values(2, " with off-peak shows introducing more commercial breaks into their output, before the concept was dropped altogether in mid-2006."); """ + + sql """ insert into test_custom_analyzer2 values(1, "GET /images/hm_bg.jpg HTTP/1.0"); """ try { sql "sync" sql """ set enable_common_expr_pushdown = true; """ - qt_sql """ select * from ${indexTbName1} where ch match 'with'; """ - qt_sql """ select * from ${indexTbName1} where ch match 'the'; """ + qt_sql """ select * from test_custom_analyzer1 where ch match 'with'; """ + qt_sql """ select * from test_custom_analyzer1 where ch match 'the'; """ + + qt_sql """ select * from test_custom_analyzer2 where ch match 'hm'; """ + qt_sql """ select * from test_custom_analyzer2 where ch match 'bg'; """ } finally { } } \ No newline at end of file