diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h index 1ebf47f1d5d5c7..6f313068d4abf9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h @@ -34,10 +34,18 @@ class ConstScoreScorer : public Scorer { float score() override { return _score; } + bool has_null_bitmap(const NullBitmapResolver* resolver = nullptr) override { + return _scorer && _scorer->has_null_bitmap(resolver); + } + + const roaring::Roaring* get_null_bitmap(const NullBitmapResolver* resolver = nullptr) override { + return _scorer ? _scorer->get_null_bitmap(resolver) : nullptr; + } + private: ScorerPtrT _scorer; float _score = 1.0F; }; -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h new file mode 100644 index 00000000000000..204533d2ec5f06 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include + +#include "olap/rowset/segment_v2/index_iterator.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h" +#include "olap/rowset/segment_v2/inverted_index_cache.h" + +namespace doris::segment_v2::inverted_index::query_v2 { + +// Small helper that centralizes "field NULL bitmap" lookups so weights/scorers +// don't have to duplicate resolver plumbing. +class FieldNullBitmapFetcher { +public: + FieldNullBitmapFetcher() = delete; + + static std::shared_ptr fetch(const QueryExecutionContext& context, + const std::string& logical_field, + const Scorer* scorer = nullptr) { + return fetch(context.null_resolver, logical_field, scorer); + } + + static std::shared_ptr fetch(const NullBitmapResolver* resolver, + const std::string& logical_field, + const Scorer* scorer = nullptr) { + if (resolver == nullptr || logical_field.empty()) { + return nullptr; + } + + EmptyScorer fallback_scorer; + const Scorer* resolver_scorer = scorer != nullptr ? 
scorer : &fallback_scorer; + + auto iterator = resolver->iterator_for(*resolver_scorer, logical_field); + if (iterator == nullptr) { + return nullptr; + } + + auto has_null = iterator->has_null(); + if (!has_null.has_value() || !has_null.value()) { + return nullptr; + } + + segment_v2::InvertedIndexQueryCacheHandle cache_handle; + auto status = iterator->read_null_bitmap(&cache_handle); + if (!status.ok()) { + LOG(WARNING) << "Failed to read null bitmap for field '" << logical_field + << "': " << status.to_string(); + return nullptr; + } + + auto bitmap_ptr = cache_handle.get_bitmap(); + if (bitmap_ptr == nullptr) { + return nullptr; + } + + return std::make_shared(*bitmap_ptr); + } +}; + +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h index 133cf71afe4b62..f46ac1793b758a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h @@ -51,4 +51,4 @@ class PhraseQuery : public Query { std::vector _terms; }; -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h index 7fe68e33995af8..339b4fdc99f766 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h @@ -18,6 +18,9 @@ #pragma once #include "olap/rowset/segment_v2/index_query_context.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_scorer.h" +#include 
"olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h" @@ -37,12 +40,29 @@ class PhraseWeight : public Weight { ~PhraseWeight() override = default; ScorerPtr scorer(const QueryExecutionContext& ctx, const std::string& binding_key) override { - auto scorer = phrase_scorer(ctx, binding_key); - if (scorer) { - return scorer; - } else { - return std::make_shared(); + auto phrase = phrase_scorer(ctx, binding_key); + auto logical_field = logical_field_or_fallback(ctx, binding_key, _field); + auto null_bitmap = FieldNullBitmapFetcher::fetch(ctx, logical_field); + + auto doc_bitset = std::make_shared(); + if (phrase) { + uint32_t doc = phrase->doc(); + if (doc == TERMINATED) { + doc = phrase->advance(); + } + while (doc != TERMINATED) { + doc_bitset->add(doc); + doc = phrase->advance(); + } + } + + auto bit_set = + std::make_shared(std::move(doc_bitset), std::move(null_bitmap)); + if (!phrase) { + return bit_set; } + // Wrap with const score for consistency with other non-scoring paths + return std::make_shared>(std::move(bit_set)); } private: @@ -78,4 +98,4 @@ class PhraseWeight : public Weight { bool _enable_scoring = false; }; -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h index 9f1e7491b50eea..cce83b6e1e77e7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h +++ 
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h @@ -43,4 +43,4 @@ class RegexpQuery : public Query { std::string _pattern; }; -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp index 5404abaddb0a4d..f70b5be77c4734 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp @@ -26,8 +26,10 @@ #include "olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" #include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" +#include "olap/rowset/segment_v2/inverted_index_iterator.h" CL_NS_USE(index) @@ -44,6 +46,9 @@ RegexpWeight::RegexpWeight(IndexQueryContextPtr context, std::wstring field, std ScorerPtr RegexpWeight::scorer(const QueryExecutionContext& context, const std::string& binding_key) { + auto logical_field = logical_field_or_fallback(context, binding_key, _field); + VLOG_DEBUG << "RegexpWeight::scorer() called - pattern=" << _pattern << ", logical_field='" + << logical_field << "'"; auto prefix = get_regex_prefix(_pattern); hs_database_t* database = nullptr; @@ -76,7 +81,10 @@ ScorerPtr RegexpWeight::scorer(const QueryExecutionContext& context, hs_free_database(database); if (matching_terms.empty()) { - return std::make_shared(); + // Even when there are no matching terms, we must honor NULL semantics for the field. 
+ auto empty_true = std::make_shared(); + auto null_bitmap = FieldNullBitmapFetcher::fetch(context, logical_field); + return std::make_shared(std::move(empty_true), std::move(null_bitmap)); } auto doc_bitset = std::make_shared(); @@ -93,7 +101,8 @@ ScorerPtr RegexpWeight::scorer(const QueryExecutionContext& context, } } - auto bit_set = std::make_shared(doc_bitset); + auto null_bitmap = FieldNullBitmapFetcher::fetch(context, logical_field); + auto bit_set = std::make_shared(doc_bitset, null_bitmap); auto const_score = std::make_shared>(std::move(bit_set)); return const_score; } @@ -220,4 +229,4 @@ void RegexpWeight::collect_matching_terms(const QueryExecutionContext& context, } } -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h index e4370a6a14d9b7..7b8fa2eeacedd3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h @@ -25,12 +25,8 @@ namespace doris::segment_v2::inverted_index::query_v2 { class TermQuery : public Query { public: - TermQuery(IndexQueryContextPtr context, std::wstring field, std::wstring term, - std::string logical_field = {}) - : _context(std::move(context)), - _field(std::move(field)), - _term(std::move(term)), - _logical_field(std::move(logical_field)) {} + TermQuery(IndexQueryContextPtr context, std::wstring field, std::wstring term) + : _context(std::move(context)), _field(std::move(field)), _term(std::move(term)) {} ~TermQuery() override = default; WeightPtr weight(bool enable_scoring) override { @@ -43,7 +39,7 @@ class TermQuery : public Query { } return std::make_shared(std::move(_context), std::move(_field), std::move(_term), std::move(bm25_similarity), - 
enable_scoring, _logical_field); + enable_scoring); } private: @@ -51,7 +47,6 @@ class TermQuery : public Query { std::wstring _field; std::wstring _term; - std::string _logical_field; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h index 77bbc922b1b835..9099a71877dea7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h @@ -22,6 +22,7 @@ #include #include +#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" #include "olap/rowset/segment_v2/inverted_index/similarity/similarity.h" @@ -70,27 +71,9 @@ class TermScorer final : public Scorer { _null_bitmap_checked = true; - auto iterator = resolver->iterator_for(*this, _logical_field); - if (iterator == nullptr) { - return; - } - - auto has_null_result = iterator->has_null(); - if (!has_null_result.has_value() || !has_null_result.value()) { - return; - } - - segment_v2::InvertedIndexQueryCacheHandle cache_handle; - auto status = iterator->read_null_bitmap(&cache_handle); - if (!status.ok()) { - LOG(WARNING) << "TermScorer failed to read null bitmap for field '" << _logical_field - << "': " << status.to_string(); - return; - } - - auto bitmap_ptr = cache_handle.get_bitmap(); - if (bitmap_ptr != nullptr) { - _null_bitmap = *bitmap_ptr; + auto bitmap = FieldNullBitmapFetcher::fetch(resolver, _logical_field, this); + if (bitmap != nullptr) { + _null_bitmap = *bitmap; } } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h index 
7f0e329d88e925..d532e9664cb9a9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h @@ -27,23 +27,21 @@ namespace doris::segment_v2::inverted_index::query_v2 { class TermWeight : public Weight { public: TermWeight(IndexQueryContextPtr context, std::wstring field, std::wstring term, - SimilarityPtr similarity, bool enable_scoring, std::string logical_field = {}) + SimilarityPtr similarity, bool enable_scoring) : _context(std::move(context)), _field(std::move(field)), _term(std::move(term)), _similarity(std::move(similarity)), - _enable_scoring(enable_scoring), - _logical_field(std::move(logical_field)) {} + _enable_scoring(enable_scoring) {} ~TermWeight() override = default; ScorerPtr scorer(const QueryExecutionContext& ctx, const std::string& binding_key) override { auto reader = lookup_reader(_field, ctx, binding_key); - auto field_name = - _logical_field.empty() ? 
std::string(_field.begin(), _field.end()) : _logical_field; + auto logical_field = logical_field_or_fallback(ctx, binding_key, _field); auto make_scorer = [&](auto segment_postings) -> ScorerPtr { using PostingsT = decltype(segment_postings); return std::make_shared>(std::move(segment_postings), _similarity, - field_name); + logical_field); }; if (!reader) { @@ -76,7 +74,6 @@ class TermWeight : public Weight { std::wstring _term; SimilarityPtr _similarity; bool _enable_scoring = false; - std::string _logical_field; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h index c3483128912c92..17d8d11cbc7def 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h @@ -30,12 +30,19 @@ class IndexReader; namespace doris::segment_v2::inverted_index::query_v2 { +struct FieldBindingContext { + std::string logical_field_name; + std::string stored_field_name; + std::wstring stored_field_wstr; +}; + struct QueryExecutionContext { uint32_t segment_num_rows = 0; std::vector> readers; std::unordered_map> reader_bindings; std::unordered_map> field_reader_bindings; + std::unordered_map binding_fields; const NullBitmapResolver* null_resolver = nullptr; }; @@ -52,6 +59,30 @@ class Weight { } protected: + const FieldBindingContext* get_field_binding(const QueryExecutionContext& ctx, + const std::string& binding_key) const { + auto it = ctx.binding_fields.find(binding_key); + if (it != ctx.binding_fields.end()) { + return &it->second; + } + return nullptr; + } + + std::string logical_field_or_fallback(const QueryExecutionContext& ctx, + const std::string& binding_key, + const std::wstring& fallback) const { + const auto* binding = get_field_binding(ctx, binding_key); + if (binding != nullptr) { + if (!binding->logical_field_name.empty()) { + return 
binding->logical_field_name; + } + if (!binding->stored_field_name.empty()) { + return binding->stored_field_name; + } + } + return std::string(fallback.begin(), fallback.end()); + } + std::shared_ptr lookup_reader( const std::wstring& field, const QueryExecutionContext& ctx, const std::string& binding_key) const { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h index 8cd92418a00ea0..8b71ab9c0d4327 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h @@ -45,4 +45,4 @@ class WildcardQuery : public Query { std::string _pattern; }; -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h index da2de84eae30c7..b906605db296b7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h @@ -47,7 +47,10 @@ class WildcardWeight : public Weight { private: std::string wildcard_to_regex(const std::string& pattern) { std::string escaped = RE2::QuoteMeta(pattern); + // Replace wildcard characters with regex equivalents + // * -> .* (zero or more of any character) escaped = std::regex_replace(escaped, std::regex(R"(\\\*)"), ".*"); + // ? -> . 
(exactly one of any character) escaped = std::regex_replace(escaped, std::regex(R"(\\\?)"), "."); return "^" + escaped + "$"; } @@ -59,4 +62,4 @@ class WildcardWeight : public Weight { bool _enable_scoring = false; }; -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/vec/functions/function_search.cpp b/be/src/vec/functions/function_search.cpp index 19ec3a336128ab..95e0f868a486b7 100644 --- a/be/src/vec/functions/function_search.cpp +++ b/be/src/vec/functions/function_search.cpp @@ -38,7 +38,10 @@ #include "olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/operator.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h" #include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" #include "olap/rowset/segment_v2/inverted_index_iterator.h" #include "olap/rowset/segment_v2/inverted_index_reader.h" @@ -225,6 +228,16 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( exec_ctx.readers = resolver.readers(); exec_ctx.reader_bindings = resolver.reader_bindings(); exec_ctx.field_reader_bindings = resolver.field_readers(); + for (const auto& [binding_key, binding] : resolver.binding_cache()) { + if (binding_key.empty()) { + continue; + } + query_v2::FieldBindingContext binding_ctx; + binding_ctx.logical_field_name = binding.logical_field_name; + binding_ctx.stored_field_name = binding.stored_field_name; + binding_ctx.stored_field_wstr = binding.stored_field_wstr; + 
exec_ctx.binding_fields.emplace(binding_key, std::move(binding_ctx)); + } class ResolverAdapter final : public query_v2::NullBitmapResolver { public: @@ -253,7 +266,7 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( return Status::OK(); } - auto scorer = weight->scorer(exec_ctx); + auto scorer = weight->scorer(exec_ctx, root_binding_key); if (!scorer) { LOG(WARNING) << "search: Failed to build scorer"; bitmap_result = InvertedIndexResultBitmap(std::make_shared(), @@ -478,7 +491,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, std::wstring value_wstr = StringHelper::to_wstring(value); auto make_term_query = [&](const std::wstring& term) -> query_v2::QueryPtr { - return std::make_shared(context, field_wstr, term, field_name); + return std::make_shared(context, field_wstr, term); }; if (clause_type == "TERM") { @@ -497,7 +510,9 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, value, binding.index_properties); if (term_infos.empty()) { LOG(WARNING) << "search: No terms found after tokenization for TERM query, field=" - << field_name << ", value='" << value << "'"; + << field_name << ", value='" << value + << "', returning empty BitSetQuery"; + *out = std::make_shared(roaring::Roaring()); return Status::OK(); } @@ -523,8 +538,47 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) { if (clause_type == "PHRASE") { - VLOG_DEBUG << "search: PHRASE clause not implemented, fallback to TERM"; - *out = make_term_query(value_wstr); + bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer( + binding.index_properties); + if (!should_analyze) { + VLOG_DEBUG << "search: PHRASE on non-tokenized field '" << field_name + << "', falling back to TERM"; + *out = make_term_query(value_wstr); + return Status::OK(); + } + + if (binding.index_properties.empty()) { + LOG(WARNING) << "search: analyzer required but index 
properties empty for PHRASE " + "query on field '" + << field_name << "'"; + *out = make_term_query(value_wstr); + return Status::OK(); + } + + std::vector term_infos = + inverted_index::InvertedIndexAnalyzer::get_analyse_result( + value, binding.index_properties); + if (term_infos.empty()) { + LOG(WARNING) << "search: No terms found after tokenization for PHRASE query, field=" + << field_name << ", value='" << value + << "', returning empty BitSetQuery"; + *out = std::make_shared(roaring::Roaring()); + return Status::OK(); + } + + if (term_infos.size() == 1) { + std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); + *out = make_term_query(term_wstr); + return Status::OK(); + } + + std::vector terms; + for (const auto& term_info : term_infos) { + terms.push_back(StringHelper::to_wstring(term_info.get_single_term())); + } + *out = std::make_shared(context, field_wstr, terms); + VLOG_DEBUG << "search: Built PhraseQuery for field=" << field_name << " with " + << terms.size() << " terms"; return Status::OK(); } if (clause_type == "MATCH") { @@ -553,7 +607,8 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, value, binding.index_properties); if (term_infos.empty()) { LOG(WARNING) << "search: tokenization yielded no terms for clause '" << clause_type - << "', field=" << field_name; + << "', field=" << field_name << ", returning empty BitSetQuery"; + *out = std::make_shared(roaring::Roaring()); return Status::OK(); } @@ -593,9 +648,28 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, << value << "'"; return Status::OK(); } + if (clause_type == "PREFIX") { + *out = std::make_shared(context, field_wstr, value); + VLOG_DEBUG << "search: PREFIX clause processed, field=" << field_name << ", pattern='" + << value << "'"; + return Status::OK(); + } + + if (clause_type == "WILDCARD") { + *out = std::make_shared(context, field_wstr, value); + VLOG_DEBUG << "search: WILDCARD clause processed, field=" << 
field_name << ", pattern='" + << value << "'"; + return Status::OK(); + } + + if (clause_type == "REGEXP") { + *out = std::make_shared(context, field_wstr, value); + VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='" + << value << "'"; + return Status::OK(); + } - if (clause_type == "PREFIX" || clause_type == "WILDCARD" || clause_type == "REGEXP" || - clause_type == "RANGE" || clause_type == "LIST") { + if (clause_type == "RANGE" || clause_type == "LIST") { VLOG_DEBUG << "search: clause type '" << clause_type << "' not implemented, fallback to TERM"; } diff --git a/be/src/vec/functions/function_search.h b/be/src/vec/functions/function_search.h index 910a8e25936baa..96e93220f4477f 100644 --- a/be/src/vec/functions/function_search.h +++ b/be/src/vec/functions/function_search.h @@ -93,6 +93,10 @@ class FieldReaderResolver { return _field_readers; } + const std::unordered_map& binding_cache() const { + return _cache; + } + IndexIterator* get_iterator(const std::string& field_name) const { auto it = _iterators.find(field_name); return (it != _iterators.end()) ? it->second : nullptr; diff --git a/regression-test/data/search/test_search_default_field_operator.out b/regression-test/data/search/test_search_default_field_operator.out new file mode 100644 index 00000000000000..c418209e4ea231 --- /dev/null +++ b/regression-test/data/search/test_search_default_field_operator.out @@ -0,0 +1,79 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !wildcard_prefix -- +1 Chris +2 Christopher + +-- !multi_term_and -- +1 foo bar +3 bar foo + +-- !multi_term_or -- +1 foo bar +3 bar foo +4 foolish bark + +-- !wildcard_multi_and -- +1 foo bar +3 bar foo +4 foolish bark + +-- !explicit_or_override -- +1 foo bar +3 bar foo +4 foolish bark + +-- !exact_function -- +1 foo bar + +-- !traditional_syntax -- +1 Chris +2 Christopher + +-- !single_term -- +1 foo bar +3 bar foo + +-- !wildcard_middle -- +1 Chris +2 Christopher + +-- !case_sensitive -- + +-- !default_or -- +1 foo bar +3 bar foo +4 foolish bark + +-- !any_function -- +1 foo bar +3 bar foo +4 foolish bark + +-- !all_function -- +1 foo bar +3 bar foo + +-- !complex_wildcard -- +3 Kevin +4 kevin + +-- !explicit_and -- +1 foo bar +3 bar foo + +-- !multiple_fields -- +1 Chris foo bar +2 Christopher foobar +4 kevin foolish bark + +-- !not_operator -- +1 foo bar +3 bar foo +4 foolish bark + +-- !param_count_mix -- +1 +2 +3 +4 + diff --git a/regression-test/data/search/test_search_dsl_syntax.out b/regression-test/data/search/test_search_dsl_syntax.out index b1f1ba93d4ef96..e8f6c627121143 100644 --- a/regression-test/data/search/test_search_dsl_syntax.out +++ b/regression-test/data/search/test_search_dsl_syntax.out @@ -4,10 +4,15 @@ -- !sql -- -- !sql -- +2 Advanced Deep Learning -- !sql -- +4 Data Science with R +6 Database Design Patterns -- !sql -- +4 Data Science with R +6 Database Design Patterns -- !sql -- 1 Machine Learning Introduction @@ -175,7 +180,6 @@ 14 Test with null tags 15 Test with null author 16 Test with null status -17 Message about success 18 Error message details 19 Warning message content 20 Regular article without msg diff --git a/regression-test/data/search/test_search_function.out b/regression-test/data/search/test_search_function.out index f418a8c71eb960..b24b86768313fd 100644 --- a/regression-test/data/search/test_search_function.out +++ 
b/regression-test/data/search/test_search_function.out @@ -22,6 +22,8 @@ -- !sql -- -- !sql -- +4 Data Science Methods +9 Database Systems -- !sql -- 2 Deep Learning Tutorial @@ -33,6 +35,7 @@ -- !sql -- -- !sql -- +1 Machine Learning Basics -- !sql -- 0 diff --git a/regression-test/data/search/test_search_null_semantics.out b/regression-test/data/search/test_search_null_semantics.out index 27eddd437befdb..237bd14bc1b02f 100644 --- a/regression-test/data/search/test_search_null_semantics.out +++ b/regression-test/data/search/test_search_null_semantics.out @@ -11,6 +11,15 @@ -- !test_case_2_external_not -- 4 +-- !test_case_2_phrase_not -- +1 +2 +3 +5 +7 +9 +10 + -- !test_case_3_or_with_null -- 1 Ronald Reagan President of the United States 3 \N Biography of Ronald McDonald @@ -148,4 +157,3 @@ -- !ternary_7_all_null -- 0 - diff --git a/regression-test/suites/search/test_search_default_field_operator.groovy b/regression-test/suites/search/test_search_default_field_operator.groovy new file mode 100644 index 00000000000000..fd5c7ce6198fc7 --- /dev/null +++ b/regression-test/suites/search/test_search_default_field_operator.groovy @@ -0,0 +1,230 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_search_default_field_operator") { + def tableName = "search_enhanced_test" + + sql "DROP TABLE IF EXISTS ${tableName}" + + // Create table with inverted indexes + // firstname: with lower_case for case-insensitive wildcard search + // tags: with parser for tokenized search + // tags_exact: without parser specification (default behavior) for exact matching + sql """ + CREATE TABLE ${tableName} ( + id INT, + firstname VARCHAR(100), + tags VARCHAR(200), + tags_exact VARCHAR(200), + INDEX idx_firstname(firstname) USING INVERTED PROPERTIES("lower_case" = "true"), + INDEX idx_tags(tags) USING INVERTED PROPERTIES("parser" = "english"), + INDEX idx_tags_exact(tags_exact) USING INVERTED + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + + // Insert test data matching the image requirements + sql """INSERT INTO ${tableName} VALUES + (1, 'Chris', 'foo bar', 'foo bar'), + (2, 'Christopher', 'foobar', 'foobar'), + (3, 'Kevin', 'bar foo', 'bar foo'), + (4, 'kevin', 'foolish bark', 'foolish bark') + """ + + // Wait for index building + Thread.sleep(3000) + + // ============ Test 1: Wildcard Prefix with Default Field ============ + // Requirement: firstname EQ Chris* + // SQL: search('Chris*', 'firstname') + // Expected: Chris (1), Christopher (2) + // Note: Without parser, inverted index is case-sensitive + // NOTE(review): idx_firstname is declared with lower_case = true and the table comment above calls it case-insensitive, which disagrees with this note and the one on Test 10 - confirm which one holds + qt_wildcard_prefix """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('Chris*', 'firstname') + ORDER BY id + """ + + // ============ Test 2: Multi-term AND with Default Operator ============ + // Requirement: tags EQ foo bar (with AND semantics) + // SQL: search('foo bar', 'tags', 'and') + // Expected: 'foo bar' (1), 'bar foo' (3) + qt_multi_term_and """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('foo bar', 'tags', 'and') + ORDER BY id + """ + + // 
============ Test 3: Multi-term OR with Default Operator ============ + // Requirement: tags EQ foo OR bark (with OR semantics) + // SQL: search('foo bark', 'tags', 'or') + // Expected: 'foo bar' (1), 'bar foo' (3), 'foolish bark' (4) + qt_multi_term_or """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('foo bark', 'tags', 'or') + ORDER BY id + """ + + // ============ Test 4: Multi-wildcard AND ============ + // Requirement: tags EQ foo* bar* (with AND semantics) + // SQL: search('foo* bar*', 'tags', 'and') + // Expands to: tags:foo* AND tags:bar* + // Expected: rows with tokens matching foo* AND tokens matching bar* + // - 'foo bar' (1): tokens=['foo','bar'] - matches foo* ✓ and bar* ✓ + // - 'foobar' (2): tokens=['foobar'] - matches foo* ✓ but NOT bar* ✗ (excluded) + // - 'bar foo' (3): tokens=['bar','foo'] - matches foo* ✓ and bar* ✓ + // - 'foolish bark' (4): tokens=['foolish','bark'] - matches foo* ✓ and bar* ✓ + qt_wildcard_multi_and """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('foo* bar*', 'tags', 'and') + ORDER BY id + """ + + // ============ Test 5: Explicit OR operator overrides default ============ + // SQL: search('foo OR bark', 'tags', 'and') + // The explicit OR in DSL should override the default 'and' operator + // Expected: 'foo bar' (1), 'bar foo' (3), 'foolish bark' (4) + qt_explicit_or_override """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('foo OR bark', 'tags', 'and') + ORDER BY id + """ + + // ============ Test 6: EXACT function with default field ============ + // Requirement: EXACT(foo bar) on tags_exact field (no tokenization) + // SQL: search('EXACT(foo bar)', 'tags_exact') + // Expected: 'foo bar' (1) only - exact string match + qt_exact_function """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags_exact + FROM ${tableName} + WHERE search('EXACT(foo 
bar)', 'tags_exact') + ORDER BY id + """ + + // ============ Test 7: Traditional syntax still works ============ + // Ensure backward compatibility - original syntax unchanged + qt_traditional_syntax """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('firstname:Chris*') + ORDER BY id + """ + + // ============ Test 8: Single term with default field ============ + qt_single_term """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('bar', 'tags') + ORDER BY id + """ + + // ============ Test 9: Wildcard in middle ============ + qt_wildcard_middle """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('*ris*', 'firstname') + ORDER BY id + """ + + // ============ Test 10: Case sensitivity for wildcard ============ + // Without parser, wildcard queries are case-sensitive (matches Lucene behavior) + // CHRIS* won't match Chris/Christopher + qt_case_sensitive """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('CHRIS*', 'firstname') + ORDER BY id + """ + + // ============ Test 11: Default operator is OR when not specified ============ + qt_default_or """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('foo bark', 'tags') + ORDER BY id + """ + + // ============ Test 12: ANY function with default field ============ + qt_any_function """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('ANY(foo bark)', 'tags') + ORDER BY id + """ + + // ============ Test 13: ALL function with default field ============ + qt_all_function """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('ALL(foo bar)', 'tags') + ORDER BY id + """ + + // ============ Test 14: Complex wildcard pattern ============ + qt_complex_wildcard 
""" + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('?evin', 'firstname') + ORDER BY id + """ + + // ============ Test 15: Default field with explicit AND ============ + qt_explicit_and """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('foo AND bar', 'tags') + ORDER BY id + """ + + // ============ Test 16: Multiple fields still work ============ + qt_multiple_fields """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname, tags + FROM ${tableName} + WHERE search('firstname:Chris* OR tags:bark') + ORDER BY id + """ + + // ============ Test 17: NOT operator with default field ============ + qt_not_operator """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags + FROM ${tableName} + WHERE search('NOT foobar', 'tags') + ORDER BY id + """ + + // ============ Test 18: Combining different parameter counts ============ + // Tests mixing 1-param and 3-param search() calls in same query + // - search('firstname:Chris*'): 1-param, traditional syntax → matches id 1,2 + // - search('foo*', 'tags', 'or'): 3-param with wildcard → matches id 1,2,3,4 ('foobar' in row 2 also matches foo*) + // - OR combination → matches id 1,2,3,4 (all rows) + qt_param_count_mix """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id + FROM ${tableName} + WHERE search('firstname:Chris*') OR search('foo*', 'tags', 'or') + ORDER BY id + """ + + // Cleanup + sql "DROP TABLE IF EXISTS ${tableName}" +} diff --git a/regression-test/suites/search/test_search_null_semantics.groovy b/regression-test/suites/search/test_search_null_semantics.groovy index 269a27056cf8d9..c7d97c18bdc31c 100644 --- a/regression-test/suites/search/test_search_null_semantics.groovy +++ b/regression-test/suites/search/test_search_null_semantics.groovy @@ -81,6 +81,13 @@ suite("test_search_null_semantics") { WHERE not search('content:Round') """ + // Test Case 2b: Phrase NOT queries must treat NULL rows as 
UNKNOWN + qt_test_case_2_phrase_not """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} + WHERE NOT search('content:"Selma Blair"') + ORDER BY id + """ + // Test Case 3: NULL handling in OR queries // Verify that NULL OR TRUE = TRUE logic works qt_test_case_3_or_with_null """