From 96e1bb93e6405b73936c6515784994ee37778c95 Mon Sep 17 00:00:00 2001 From: yangzq50 <58433399+yangzq50@users.noreply.github.com> Date: Mon, 4 Nov 2024 20:00:19 +0800 Subject: [PATCH] Support keyword analyzer (#2168) ### What problem does this PR solve? Support keyword analyzer Support boolean similarity for columns with keyword analyzer Issue link:#2139 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [x] Refactoring - [x] Test cases --- src/common/analyzer/analyzer_pool.cpp | 4 + src/common/analyzer/analyzer_pool.cppm | 1 + src/common/analyzer/keyword_analyzer.cpp | 37 ++++ src/common/analyzer/keyword_analyzer.cppm | 32 ++++ src/parser/search_parser.cpp | 26 +-- src/parser/search_parser.y | 26 +-- .../invertedindex/search/doc_iterator.cppm | 1 + .../search/keyword_iterator.cppm | 71 +++++++ .../invertedindex/search/query_node.cpp | 137 ++++++++------ src/storage/invertedindex/search/query_node.h | 12 +- .../invertedindex/search/search_driver.cpp | 177 +++++++++--------- .../invertedindex/search/search_driver.h | 5 +- test/sql/dql/fulltext/fulltext_keyword.slt | 44 +++++ 13 files changed, 397 insertions(+), 176 deletions(-) create mode 100644 src/common/analyzer/keyword_analyzer.cpp create mode 100644 src/common/analyzer/keyword_analyzer.cppm create mode 100644 src/storage/invertedindex/search/keyword_iterator.cppm create mode 100644 test/sql/dql/fulltext/fulltext_keyword.slt diff --git a/src/common/analyzer/analyzer_pool.cpp b/src/common/analyzer/analyzer_pool.cpp index 64bcbf5cb8..c8b60cc73f 100644 --- a/src/common/analyzer/analyzer_pool.cpp +++ b/src/common/analyzer/analyzer_pool.cpp @@ -31,6 +31,7 @@ import korean_analyzer; import standard_analyzer; import ngram_analyzer; import rag_analyzer; +import keyword_analyzer; import logger; namespace infinity { @@ -267,6 +268,9 @@ Tuple, Status> AnalyzerPool::GetAnalyzer(const std::string_v } return {MakeUnique(ngram), Status::OK()}; } + case Str2Int(KEYWORD.data()): { + return {MakeUnique(), Status::OK()}; + } default: { if(std::filesystem::is_regular_file(name)) { // Suppose it is a customized Python script analyzer diff --git a/src/common/analyzer/analyzer_pool.cppm b/src/common/analyzer/analyzer_pool.cppm index a5171c60cc..6a7aa61a8f 100644 --- a/src/common/analyzer/analyzer_pool.cppm +++ b/src/common/analyzer/analyzer_pool.cppm @@ -40,6 +40,7 @@ public: static constexpr std::string_view STANDARD = "standard"; static constexpr std::string_view NGRAM = "ngram"; static constexpr std::string_view RAG = "rag"; + static constexpr std::string_view KEYWORD = "keyword"; private: CacheType cache_{}; diff --git a/src/common/analyzer/keyword_analyzer.cpp b/src/common/analyzer/keyword_analyzer.cpp new file mode 100644 index 0000000000..9e539c44cc --- /dev/null +++ b/src/common/analyzer/keyword_analyzer.cpp @@ -0,0 +1,37 @@ +// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +#include +#include +module keyword_analyzer; + +import stl; +import term; +import analyzer; + +namespace infinity { + +int KeywordAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) { + std::istringstream is(input.text_); + std::string t; + u32 offset = 0; + while (is >> t) { + func(data, t.data(), t.size(), offset++, 0, Term::AND, 0, false); + } + return 0; +} + +} // namespace infinity diff --git a/src/common/analyzer/keyword_analyzer.cppm b/src/common/analyzer/keyword_analyzer.cppm new file mode 100644 index 0000000000..bddf389714 --- /dev/null +++ b/src/common/analyzer/keyword_analyzer.cppm @@ -0,0 +1,32 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +export module keyword_analyzer; +import stl; +import term; +import analyzer; + +namespace infinity { +export class KeywordAnalyzer : public Analyzer { +public: + KeywordAnalyzer() = default; + ~KeywordAnalyzer() override = default; + +protected: + int AnalyzeImpl(const Term &input, void *data, HookType func) override; +}; + +} // namespace infinity diff --git a/src/parser/search_parser.cpp b/src/parser/search_parser.cpp index 6f17aa61bb..9551dec201 100644 --- a/src/parser/search_parser.cpp +++ b/src/parser/search_parser.cpp @@ -807,11 +807,11 @@ namespace infinity { case 4: // query: query clause #line 91 "search_parser.y" { - auto query = driver.GetMultiQueryNodeByOperatorOption(); - auto *multi_query_ptr = dynamic_cast(query.get()); - multi_query_ptr->Add(std::move(yystack_[1].value.as < std::unique_ptr > ())); - multi_query_ptr->Add(std::move(yystack_[0].value.as < std::unique_ptr > ())); - yylhs.value.as < std::unique_ptr > () = std::move(query); + assert(driver.operator_option_ == FulltextQueryOperatorOption::kInfinitySyntax); + auto q = std::make_unique(); + q->Add(std::move(yystack_[1].value.as < std::unique_ptr > ())); + q->Add(std::move(yystack_[0].value.as < std::unique_ptr > ())); + yylhs.value.as < std::unique_ptr > () = std::move(q); } #line 817 "search_parser.cpp" break; @@ -819,10 +819,10 @@ namespace infinity { case 5: // query: query OR clause #line 98 "search_parser.y" { - auto query = std::make_unique(); - query->Add(std::move(yystack_[2].value.as < std::unique_ptr > ())); - query->Add(std::move(yystack_[0].value.as < std::unique_ptr > ())); - yylhs.value.as < std::unique_ptr > () = std::move(query); + auto q = std::make_unique(); + q->Add(std::move(yystack_[2].value.as < std::unique_ptr > ())); + q->Add(std::move(yystack_[0].value.as < std::unique_ptr > ())); + yylhs.value.as < std::unique_ptr > () = std::move(q); } #line 828 "search_parser.cpp" break; @@ -901,7 +901,7 @@ namespace infinity { YYERROR; } std::string text = SearchDriver::Unescape(yystack_[0].value.as < InfString > ().text_); - yylhs.value.as < std::unique_ptr > () = driver.AnalyzeAndBuildQueryNode(field, std::move(text), yystack_[0].value.as < InfString > ().from_quoted_); + yylhs.value.as < std::unique_ptr > () = driver.AnalyzeAndBuildQueryNode(field, text, yystack_[0].value.as < InfString > ().from_quoted_); } #line 907 "search_parser.cpp" break; @@ -911,7 +911,7 @@ namespace infinity { { std::string field = SearchDriver::Unescape(yystack_[2].value.as < InfString > ().text_); std::string text = SearchDriver::Unescape(yystack_[0].value.as < InfString > ().text_); - yylhs.value.as < std::unique_ptr > () = driver.AnalyzeAndBuildQueryNode(std::move(field), std::move(text), yystack_[0].value.as < InfString > ().from_quoted_); + yylhs.value.as < std::unique_ptr > () = driver.AnalyzeAndBuildQueryNode(field, text, yystack_[0].value.as < InfString > ().from_quoted_); } #line 917 "search_parser.cpp" break; @@ -925,7 +925,7 @@ namespace infinity { YYERROR; } std::string text = SearchDriver::Unescape(yystack_[1].value.as < InfString > ().text_); - yylhs.value.as < std::unique_ptr > () = driver.AnalyzeAndBuildQueryNode(field, std::move(text), yystack_[1].value.as < InfString > ().from_quoted_, yystack_[0].value.as < unsigned long > ()); + yylhs.value.as < std::unique_ptr > () = driver.AnalyzeAndBuildQueryNode(field, text, yystack_[1].value.as < InfString > ().from_quoted_, yystack_[0].value.as < unsigned long > ()); } #line 931 "search_parser.cpp" break; @@ -935,7 +935,7 @@ namespace infinity { { std::string field = SearchDriver::Unescape(yystack_[3].value.as < InfString > ().text_); std::string text = SearchDriver::Unescape(yystack_[1].value.as < InfString > ().text_); - yylhs.value.as < std::unique_ptr > () = driver.AnalyzeAndBuildQueryNode(std::move(field), std::move(text), yystack_[1].value.as < InfString > ().from_quoted_, yystack_[0].value.as < unsigned long > ()); + yylhs.value.as < std::unique_ptr > () = driver.AnalyzeAndBuildQueryNode(field, text, yystack_[1].value.as < InfString > ().from_quoted_, yystack_[0].value.as < unsigned long > ()); } #line 941 "search_parser.cpp" break; diff --git a/src/parser/search_parser.y b/src/parser/search_parser.y index 09dd54cf2f..3246bf80b1 100644 --- a/src/parser/search_parser.y +++ b/src/parser/search_parser.y @@ -89,17 +89,17 @@ topLevelQuery query : clause { $$ = std::move($1); } | query clause { - auto query = driver.GetMultiQueryNodeByOperatorOption(); - auto *multi_query_ptr = dynamic_cast(query.get()); - multi_query_ptr->Add(std::move($1)); - multi_query_ptr->Add(std::move($2)); - $$ = std::move(query); + assert(driver.operator_option_ == FulltextQueryOperatorOption::kInfinitySyntax); + auto q = std::make_unique(); + q->Add(std::move($1)); + q->Add(std::move($2)); + $$ = std::move(q); } | query OR clause { - auto query = std::make_unique(); - query->Add(std::move($1)); - query->Add(std::move($3)); - $$ = std::move(query); + auto q = std::make_unique(); + q->Add(std::move($1)); + q->Add(std::move($3)); + $$ = std::move(q); }; clause @@ -141,12 +141,12 @@ basic_filter YYERROR; } std::string text = SearchDriver::Unescape($1.text_); - $$ = driver.AnalyzeAndBuildQueryNode(field, std::move(text), $1.from_quoted_); + $$ = driver.AnalyzeAndBuildQueryNode(field, text, $1.from_quoted_); } | STRING OP_COLON STRING { std::string field = SearchDriver::Unescape($1.text_); std::string text = SearchDriver::Unescape($3.text_); - $$ = driver.AnalyzeAndBuildQueryNode(std::move(field), std::move(text), $3.from_quoted_); + $$ = driver.AnalyzeAndBuildQueryNode(field, text, $3.from_quoted_); }; | STRING TILDE { const std::string &field = default_field; @@ -155,12 +155,12 @@ basic_filter YYERROR; } std::string text = SearchDriver::Unescape($1.text_); - $$ = driver.AnalyzeAndBuildQueryNode(field, std::move(text), $1.from_quoted_, $2); + $$ = driver.AnalyzeAndBuildQueryNode(field, text, $1.from_quoted_, $2); } | STRING OP_COLON STRING TILDE { std::string field = SearchDriver::Unescape($1.text_); std::string text = SearchDriver::Unescape($3.text_); - $$ = driver.AnalyzeAndBuildQueryNode(std::move(field), std::move(text), $3.from_quoted_, $4); + $$ = driver.AnalyzeAndBuildQueryNode(field, text, $3.from_quoted_, $4); }; %% diff --git a/src/storage/invertedindex/search/doc_iterator.cppm b/src/storage/invertedindex/search/doc_iterator.cppm index 59359db6c0..60771a4fb2 100644 --- a/src/storage/invertedindex/search/doc_iterator.cppm +++ b/src/storage/invertedindex/search/doc_iterator.cppm @@ -36,6 +36,7 @@ export enum class DocIteratorType : u8 { kBMWIterator, kFilterIterator, kScoreThresholdIterator, + kKeywordIterator, }; export struct DocIteratorEstimateIterateCost { diff --git a/src/storage/invertedindex/search/keyword_iterator.cppm b/src/storage/invertedindex/search/keyword_iterator.cppm new file mode 100644 index 0000000000..527fc26ae6 --- /dev/null +++ b/src/storage/invertedindex/search/keyword_iterator.cppm @@ -0,0 +1,71 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +export module keyword_iterator; + +import stl; +import index_defines; +import doc_iterator; +import multi_doc_iterator; +import or_iterator; +import internal_types; + +namespace infinity { + +export class KeywordIterator final : public MultiDocIterator { +public: + KeywordIterator(Vector> iterators, const float weight) : MultiDocIterator(std::move(iterators)), weight_(weight) {} + + DocIteratorType GetType() const override { return DocIteratorType::kKeywordIterator; } + + String Name() const override { return "KeywordIterator"; } + + /* pure virtual methods implementation */ + bool Next(const RowID doc_id) override { + if (doc_id_ == INVALID_ROWID) { + for (u32 i = 0; i < children_.size(); ++i) { + children_[i]->Next(); + DocIteratorEntry entry = {children_[i]->DocID(), i}; + heap_.AddEntry(entry); + } + heap_.BuildHeap(); + doc_id_ = heap_.TopEntry().doc_id_; + } + if (doc_id_ != INVALID_ROWID && doc_id_ >= doc_id) { + return true; + } + while (doc_id > heap_.TopEntry().doc_id_) { + DocIterator *top = children_[heap_.TopEntry().entry_id_].get(); + top->Next(doc_id); + heap_.TopEntry().doc_id_ = top->DocID(); + heap_.AdjustDown(1); + } + doc_id_ = heap_.TopEntry().doc_id_; + return doc_id_ != INVALID_ROWID; + } + + float Score() override { return weight_; } + + void UpdateScoreThreshold(float threshold) override { /* do nothing */ } + + u32 MatchCount() const override { return 0; } + +private: + const float weight_ = 1.0f; + DocIteratorHeap heap_{}; +}; + +} // namespace infinity diff --git a/src/storage/invertedindex/search/query_node.cpp b/src/storage/invertedindex/search/query_node.cpp index aff687e9e6..bd19c7bdad 100644 --- a/src/storage/invertedindex/search/query_node.cpp +++ b/src/storage/invertedindex/search/query_node.cpp @@ -22,6 +22,7 @@ import phrase_doc_iterator; import blockmax_wand_iterator; import minimum_should_match_iterator; import parse_fulltext_options; +import keyword_iterator; namespace infinity { @@ -34,8 +35,7 @@ namespace infinity { // 4. "and_not" does not exist in parser output, it is generated during optimization // "and_not": first child can be term, "and", "or", other children form a list of "not" void QueryNode::FilterOptimizeQueryTree() { - String error_message = "Should not reach here!"; - UnrecoverableError(error_message); + UnrecoverableError("Should not reach here!"); } std::unique_ptr QueryNode::GetOptimizedQueryTree(std::unique_ptr root) { @@ -43,48 +43,51 @@ std::unique_ptr QueryNode::GetOptimizedQueryTree(std::unique_ptrFilterOptimizeQueryTree(); return root; } - auto start_time = std::chrono::high_resolution_clock::now(); + std::chrono::time_point start_time{}; + if (SHOULD_LOG_DEBUG()) { + start_time = std::chrono::high_resolution_clock::now(); + OStringStream oss; + oss << "Query tree before optimization:\n"; + if (root) { + root->PrintTree(oss); + } else { + oss << "Empty query tree!\n"; + } + LOG_DEBUG(std::move(oss).str()); + } std::unique_ptr optimized_root; if (!root) { - Status status = Status::SyntaxError("Invalid query statement: Empty query tree"); - RecoverableError(status); + RecoverableError(Status::SyntaxError("Invalid query statement: Empty query tree")); } // push down the weight to the leaf term node root->PushDownWeight(); // optimize the query tree switch (root->GetType()) { - case QueryNodeType::TERM: { - // no need to optimize - optimized_root = std::move(root); - break; - } - case QueryNodeType::PHRASE: { + case QueryNodeType::TERM: + case QueryNodeType::PHRASE: + case QueryNodeType::KEYWORD: { // no need to optimize optimized_root = std::move(root); break; } case QueryNodeType::NOT: { - Status status = Status::SyntaxError("Invalid query statement: NotQueryNode should not be on the top level"); - RecoverableError(status); + RecoverableError(Status::SyntaxError("Invalid query statement: NotQueryNode should not be on the top level")); break; } case QueryNodeType::AND: case QueryNodeType::OR: { optimized_root = static_cast(root.get())->GetNewOptimizedQueryTree(); if (optimized_root->GetType() == QueryNodeType::NOT) { - Status status = Status::SyntaxError("Invalid query statement: NotQueryNode should not be on the top level"); - RecoverableError(status); + RecoverableError(Status::SyntaxError("Invalid query statement: NotQueryNode should not be on the top level")); } break; } case QueryNodeType::AND_NOT: { - String error_message = "Unexpected AndNotQueryNode from parser output"; - UnrecoverableError(error_message); + UnrecoverableError("Unexpected AndNotQueryNode from parser output"); break; } default: { - String error_message = "GetOptimizedQueryTree: Unexpected case!"; - UnrecoverableError(error_message); + UnrecoverableError("GetOptimizedQueryTree: Unexpected case!"); break; } } @@ -107,16 +110,13 @@ std::unique_ptr QueryNode::GetOptimizedQueryTree(std::unique_ptr MultiQueryNode::GetNewOptimizedQueryTree() { for (auto &child : children_) { switch (child->GetType()) { - case QueryNodeType::TERM: { - // no need to optimize - break; - } + case QueryNodeType::TERM: case QueryNodeType::PHRASE: { + // no need to optimize break; } case QueryNodeType::AND_NOT: { - String error_message = "GetNewOptimizedQueryTree: Unexpected case! AndNotQueryNode should not exist in parser output"; - UnrecoverableError(error_message); + UnrecoverableError("GetNewOptimizedQueryTree: Unexpected case! AndNotQueryNode should not exist in parser output"); break; } case QueryNodeType::NOT: @@ -127,8 +127,7 @@ std::unique_ptr MultiQueryNode::GetNewOptimizedQueryTree() { break; } default: { - String error_message = "GetNewOptimizedQueryTree: Unexpected case!"; - UnrecoverableError(error_message); + UnrecoverableError("GetNewOptimizedQueryTree: Unexpected case!"); break; } } @@ -152,16 +151,14 @@ std::unique_ptr MultiQueryNode::GetNewOptimizedQueryTree() { std::unique_ptr NotQueryNode::InnerGetNewOptimizedQueryTree() { if (children_.empty()) { - String error_message = "Invalid query statement: NotQueryNode should have at least 1 children"; - UnrecoverableError(error_message); + UnrecoverableError("Invalid query statement: NotQueryNode should have at least 1 children"); } auto new_not_node = std::make_unique(); // new node, weight is reset to 1.0 auto &new_not_list = new_not_node->children_; for (auto &child : children_) { switch (child->GetType()) { case QueryNodeType::NOT: { - Status status = Status::SyntaxError("Invalid query statement: NotQueryNode should not have not child"); - RecoverableError(status); + RecoverableError(Status::SyntaxError("Invalid query statement: NotQueryNode should not have not child")); break; } case QueryNodeType::TERM: @@ -179,8 +176,7 @@ std::unique_ptr NotQueryNode::InnerGetNewOptimizedQueryTree() { break; } default: { - String error_message = "OptimizeInPlaceInner: Unexpected case!"; - UnrecoverableError(error_message); + UnrecoverableError("OptimizeInPlaceInner: Unexpected case!"); break; } } @@ -203,8 +199,7 @@ std::unique_ptr NotQueryNode::InnerGetNewOptimizedQueryTree() { std::unique_ptr AndQueryNode::InnerGetNewOptimizedQueryTree() { if (children_.size() < 2) { - String error_message = "Invalid query statement: AndQueryNode should have at least 2 children"; - UnrecoverableError(error_message); + UnrecoverableError("Invalid query statement: AndQueryNode should have at least 2 children"); } std::vector> and_list; std::vector> not_list; @@ -247,8 +242,7 @@ std::unique_ptr AndQueryNode::InnerGetNewOptimizedQueryTree() { break; } default: { - String error_message = "OptimizeInPlaceInner: Unexpected case!"; - UnrecoverableError(error_message); + UnrecoverableError("OptimizeInPlaceInner: Unexpected case!"); break; } } @@ -296,8 +290,7 @@ std::unique_ptr AndQueryNode::InnerGetNewOptimizedQueryTree() { std::unique_ptr OrQueryNode::InnerGetNewOptimizedQueryTree() { if (children_.size() < 2) { - String error_message = "OptimizeInPlaceInner: Unexpected case! AndNotQueryNode should not exist in parser output"; - UnrecoverableError(error_message); + UnrecoverableError("Invalid query statement: OrQueryNode should have at least 2 children"); } std::vector> or_list; std::vector> not_list; @@ -321,8 +314,7 @@ std::unique_ptr OrQueryNode::InnerGetNewOptimizedQueryTree() { break; } default: { - String error_message = "OptimizeInPlaceInner: Unexpected case!"; - UnrecoverableError(error_message); + UnrecoverableError("OptimizeInPlaceInner: Unexpected case!"); break; } } @@ -387,8 +379,7 @@ std::unique_ptr OrQueryNode::InnerGetNewOptimizedQueryTree() { or_node->children_ = std::move(or_list); return or_node; } else { - Status status = Status::SyntaxError("Invalid query statement: OrQueryNode should not have both not child and non-not child"); - RecoverableError(status); + RecoverableError(Status::SyntaxError("Invalid query statement: OrQueryNode should not have both not child and non-not child")); return nullptr; } } @@ -397,8 +388,12 @@ std::unique_ptr OrQueryNode::InnerGetNewOptimizedQueryTree() { // "and_not" does not exist in parser output, it is generated during optimization std::unique_ptr AndNotQueryNode::InnerGetNewOptimizedQueryTree() { - String error_message = "OptimizeInPlaceInner: Unexpected case! AndNotQueryNode should not exist in parser output"; - UnrecoverableError(error_message); + UnrecoverableError("OptimizeInPlaceInner: Unexpected case! AndNotQueryNode should not exist in parser output"); + return nullptr; +} + +std::unique_ptr KeywordQueryNode::InnerGetNewOptimizedQueryTree() { + UnrecoverableError(std::format("{}: Should not reach here!", __func__)); return nullptr; } @@ -556,6 +551,23 @@ std::unique_ptr OrQueryNode::CreateSearch(const CreateSearchParams } } +std::unique_ptr KeywordQueryNode::CreateSearch(const CreateSearchParams params) const { + Vector> sub_doc_iters; + sub_doc_iters.reserve(children_.size()); + for (const auto &child : children_) { + if (child->GetType() != QueryNodeType::TERM) { + UnrecoverableError("KeywordQueryNode should only contain term children"); + } + if (auto iter = child->CreateSearch(params); iter) { + sub_doc_iters.emplace_back(std::move(iter)); + } + } + if (sub_doc_iters.empty()) { + return nullptr; + } + return std::make_unique(std::move(sub_doc_iters), GetWeight()); +} + std::unique_ptr NotQueryNode::CreateSearch(CreateSearchParams) const { UnrecoverableError("NOT query node should be optimized into AND_NOT query node"); return nullptr; @@ -563,7 +575,7 @@ std::unique_ptr NotQueryNode::CreateSearch(CreateSearchParams) cons // print tree -std::string QueryNodeTypeToString(QueryNodeType type) { +std::string QueryNodeTypeToString(const QueryNodeType type) { switch (type) { case QueryNodeType::INVALID: return "INVALID"; @@ -571,6 +583,8 @@ std::string QueryNodeTypeToString(QueryNodeType type) { return "FILTER"; case QueryNodeType::TERM: return "TERM"; + case QueryNodeType::KEYWORD: + return "KEYWORD"; case QueryNodeType::AND: return "AND"; case QueryNodeType::AND_NOT: @@ -590,7 +604,7 @@ std::string QueryNodeTypeToString(QueryNodeType type) { } } -void TermQueryNode::PrintTree(std::ostream &os, const std::string &prefix, bool is_final) const { +void TermQueryNode::PrintTree(std::ostream &os, const std::string &prefix, const bool is_final) const { os << prefix; os << (is_final ? "└──" : "├──"); os << QueryNodeTypeToString(type_); @@ -605,7 +619,7 @@ void TermQueryNode::GetQueryColumnsTerms(std::vector &columns, std: terms.push_back(term_); } -void PhraseQueryNode::PrintTree(std::ostream &os, const std::string &prefix, bool is_final) const { +void PhraseQueryNode::PrintTree(std::ostream &os, const std::string &prefix, const bool is_final) const { os << prefix; os << (is_final ? "└──" : "├──"); os << QueryNodeTypeToString(type_); @@ -627,14 +641,14 @@ void PhraseQueryNode::GetQueryColumnsTerms(std::vector &columns, st } } -void MultiQueryNode::PrintTree(std::ostream &os, const std::string &prefix, bool is_final) const { +void MultiQueryNode::PrintTree(std::ostream &os, const std::string &prefix, const bool is_final) const { os << prefix; os << (is_final ? "└──" : "├──"); os << QueryNodeTypeToString(type_); os << " (weight: " << weight_ << ")"; os << " (children count: " << children_.size() << ")"; os << '\n'; - std::string next_prefix = prefix + (is_final ? " " : "│ "); + const std::string next_prefix = prefix + (is_final ? " " : "│ "); for (u32 i = 0; i + 1 < children_.size(); ++i) { children_[i]->PrintTree(os, next_prefix, false); } @@ -648,13 +662,24 @@ void MultiQueryNode::GetQueryColumnsTerms(std::vector &columns, std } uint32_t MultiQueryNode::LeafCount() const { - if (GetType() != QueryNodeType::OR && GetType() != QueryNodeType::AND) { - UnrecoverableError("LeafCount: Unexpected case!"); + switch (GetType()) { + case QueryNodeType::OR: + case QueryNodeType::AND: { + return std::accumulate(children_.begin(), children_.end(), static_cast(0), [](const u32 cnt, const auto &it) { + return cnt + it->LeafCount(); + }); + } + case QueryNodeType::AND_NOT: { + return children_.front()->LeafCount(); + } + case QueryNodeType::KEYWORD: { + return 0; + } + default: { + UnrecoverableError("LeafCount: Unexpected case!"); + return {}; + } } - return std::accumulate(children_.begin(), children_.end(), static_cast(0), [](const u32 cnt, const auto &it) { - return cnt + it->LeafCount(); - }); } - } // namespace infinity diff --git a/src/storage/invertedindex/search/query_node.h b/src/storage/invertedindex/search/query_node.h index b5a6429f7a..e053441ac4 100644 --- a/src/storage/invertedindex/search/query_node.h +++ b/src/storage/invertedindex/search/query_node.h @@ -37,6 +37,7 @@ enum class QueryNodeType : char { AND, AND_NOT, OR, + KEYWORD, // unimplemented: PREFIX_TERM, SUFFIX_TERM, @@ -137,7 +138,7 @@ struct MultiQueryNode : public QueryNode { void Add(std::unique_ptr &&node) { children_.emplace_back(std::move(node)); } uint32_t LeafCount() const override; - void PushDownWeight(float factor) final { + void PushDownWeight(float factor) override { // no need to update weight for MultiQueryNode, because it will be reset to 1.0 factor *= GetWeight(); for (auto &child : children_) { @@ -180,9 +181,14 @@ struct OrQueryNode final : public MultiQueryNode { std::unique_ptr CreateSearch(CreateSearchParams params) const override; }; +struct KeywordQueryNode final : public MultiQueryNode { + KeywordQueryNode() : MultiQueryNode(QueryNodeType::KEYWORD) {} + void PushDownWeight(float factor) override { MultiplyWeight(factor); } + std::unique_ptr InnerGetNewOptimizedQueryTree() override; + std::unique_ptr CreateSearch(CreateSearchParams params) const override; +}; + // unimplemented -struct WandQueryNode; -// struct PhraseQueryNode; struct PrefixTermQueryNode; struct SuffixTermQueryNode; struct SubstringTermQueryNode; diff --git a/src/storage/invertedindex/search/search_driver.cpp b/src/storage/invertedindex/search/search_driver.cpp index d924fd1c5b..7c5e0b3c4f 100644 --- a/src/storage/invertedindex/search/search_driver.cpp +++ b/src/storage/invertedindex/search/search_driver.cpp @@ -23,9 +23,9 @@ #define SearchScannerSuffix InfinitySyntax #include "search_scanner_derived_helper.h" #undef SearchScannerSuffix -#define SearchScannerSuffix Plain -#include "search_scanner_derived_helper.h" -#undef SearchScannerSuffix +// #define SearchScannerSuffix Plain +// #include "search_scanner_derived_helper.h" +// #undef SearchScannerSuffix import stl; import term; @@ -113,65 +113,14 @@ std::unique_ptr SearchDriver::ParseSingleWithFields(const std::string return parsed_query_tree; } -std::unique_ptr SearchDriver::ParseSingle(const std::string &query, const std::string *default_field_ptr) const { - std::istringstream iss(query); - if (!iss.good()) { - return nullptr; - } - if (!default_field_ptr) { - default_field_ptr = &default_field_; - } - std::unique_ptr scanner; - std::unique_ptr parser; - std::unique_ptr result; - try { - switch (operator_option_) { - case FulltextQueryOperatorOption::kInfinitySyntax: - scanner = std::make_unique(&iss); - break; - case FulltextQueryOperatorOption::kAnd: - case FulltextQueryOperatorOption::kOr: - scanner = std::make_unique(&iss); - break; - } - parser = std::make_unique(*scanner, *this, *default_field_ptr, result); - } catch (std::bad_alloc &ba) { - std::cerr << "Failed to allocate: (" << ba.what() << "), exiting!!\n"; - return nullptr; - } - constexpr int accept = 0; - if (parser->parse() != accept) { - return nullptr; - } - return result; -} - -std::unique_ptr -SearchDriver::AnalyzeAndBuildQueryNode(const std::string &field, std::string &&text, bool from_quoted, unsigned long slop) const { - if (text.empty()) { - Status status = Status::SyntaxError("Empty query text"); - RecoverableError(status); - return nullptr; - } +inline TermList GetTermListFromAnalyzer(const std::string &analyzer_name, Analyzer *analyzer, const std::string &query_str) { + TermList result; Term input_term; - input_term.text_ = std::move(text); - TermList terms; - - // 1. analyze - std::string analyzer_name = "standard"; - if (!field.empty()) { - if (auto it = field2analyzer_.find(field); it != field2analyzer_.end()) { - analyzer_name = it->second; - } - } - auto [analyzer, status] = AnalyzerPool::instance().GetAnalyzer(analyzer_name); - if (!status.ok()) { - RecoverableError(status); - } + input_term.text_ = query_str; TermList temp_output_terms; analyzer->Analyze(input_term, temp_output_terms); // remove duplicates and only keep the root words for query - const u32 INVALID_TERM_OFFSET = -1; + constexpr u32 INVALID_TERM_OFFSET = -1; Term last_term; last_term.word_offset_ = INVALID_TERM_OFFSET; for (const Term &term : temp_output_terms) { @@ -180,7 +129,7 @@ SearchDriver::AnalyzeAndBuildQueryNode(const std::string &field, std::string &&t } if (last_term.word_offset_ != term.word_offset_) { if (last_term.word_offset_ != INVALID_TERM_OFFSET) { - terms.emplace_back(last_term); + result.emplace_back(last_term); } last_term.text_ = term.text_; last_term.word_offset_ = term.word_offset_; @@ -193,62 +142,116 @@ SearchDriver::AnalyzeAndBuildQueryNode(const std::string &field, std::string &&t } } if (last_term.word_offset_ != INVALID_TERM_OFFSET) { - terms.emplace_back(last_term); + result.emplace_back(last_term); } - if (terms.empty()) { - std::cerr << "Analyzer " << analyzer_name << " analyzes following text as empty terms: " << input_term.text_ << std::endl; + if (result.empty()) { + std::cerr << std::format("Analyzer {} analyzes following text as empty terms: {}\n", analyzer_name, query_str); } + return result; +} + +inline std::string GetAnalyzerName(const std::string &field, const std::map &field2analyzer) { + std::string analyzer_name = "standard"; + if (!field.empty()) { + if (const auto it = field2analyzer.find(field); it != field2analyzer.end()) { + analyzer_name = it->second; + } + } + return analyzer_name; +} + +std::unique_ptr SearchDriver::ParseSingle(const std::string &query, const std::string *default_field_ptr) const { + std::istringstream iss(query); + if (!iss.good()) { + return nullptr; + } + if (!default_field_ptr) { + default_field_ptr = &default_field_; + } + const auto &default_field = *default_field_ptr; + const auto default_analyzer_name = GetAnalyzerName(default_field, field2analyzer_); + if (default_analyzer_name != "keyword" && operator_option_ == FulltextQueryOperatorOption::kInfinitySyntax) { + // use parser + std::unique_ptr result; + const auto scanner = std::make_unique(&iss); + const auto parser = std::make_unique(*scanner, *this, *default_field_ptr, result); + if (constexpr int accept = 0; parser->parse() != accept) { + return nullptr; + } + return result; + } else { + // use analyzer for the whole query string + auto [analyzer, status] = AnalyzerPool::instance().GetAnalyzer(default_analyzer_name); + if (!status.ok()) { + RecoverableError(std::move(status)); + } + TermList terms = GetTermListFromAnalyzer(default_analyzer_name, analyzer.get(), query); + std::unique_ptr multi_query; + if (default_analyzer_name == "keyword") { + multi_query = std::make_unique(); + } else if (operator_option_ == FulltextQueryOperatorOption::kOr) { + multi_query = std::make_unique(); + } else if (operator_option_ == FulltextQueryOperatorOption::kAnd) { + multi_query = std::make_unique(); + } + for (const auto &term : terms) { + auto subquery = std::make_unique(); + subquery->term_ = term.text_; + subquery->column_ = default_field; + multi_query->Add(std::move(subquery)); + } + return multi_query; + } +} + +std::unique_ptr +SearchDriver::AnalyzeAndBuildQueryNode(const std::string &field, const std::string &text, const bool from_quoted, const unsigned long slop) const { + assert(operator_option_ == FulltextQueryOperatorOption::kInfinitySyntax); + if (text.empty()) { + RecoverableError(Status::SyntaxError("Empty query text")); + return nullptr; + } + // 1. analyze + const auto analyzer_name = GetAnalyzerName(field, field2analyzer_); + auto [analyzer, status] = AnalyzerPool::instance().GetAnalyzer(analyzer_name); + if (!status.ok()) { + RecoverableError(std::move(status)); + } + TermList terms = GetTermListFromAnalyzer(analyzer_name, analyzer.get(), text); // 2. build query node if (terms.empty()) { auto result = std::make_unique(); - result->term_ = std::move(input_term.text_); + result->term_ = text; result->column_ = field; return result; } else if (terms.size() == 1) { auto result = std::make_unique(); - result->term_ = std::move(terms.front().text_); + result->term_ = terms.front().text_; result->column_ = field; return result; } else { if (from_quoted) { auto result = std::make_unique(); - for (auto term : terms) { - result->AddTerm(term.Text()); + for (const auto &term : terms) { + result->AddTerm(term.text_); } result->column_ = field; result->slop_ = slop; return result; } else { - auto result = GetMultiQueryNodeByOperatorOption(); - auto *multi_query_ptr = dynamic_cast(result.get()); - for (auto &term : terms) { + auto result = std::make_unique(); + for (const auto &term : terms) { auto subquery = std::make_unique(); - subquery->term_ = std::move(term.text_); + subquery->term_ = term.text_; subquery->column_ = field; - multi_query_ptr->Add(std::move(subquery)); + result->Add(std::move(subquery)); } return result; } } } -std::unique_ptr SearchDriver::GetMultiQueryNodeByOperatorOption() const { - switch (operator_option_) { - case FulltextQueryOperatorOption::kInfinitySyntax: // treat it as OR - case FulltextQueryOperatorOption::kOr: { - return std::make_unique(); - break; - } - case FulltextQueryOperatorOption::kAnd: { - return std::make_unique(); - break; - } - } - UnrecoverableError("Invalid switch case!"); - return {}; -} - // Unescape reserved characters per https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html // Shall keep sync with ESCAPEABLE in search_lexer.l // [\x20+\-=&|!(){}\[\]^"~*?:\\/] diff --git a/src/storage/invertedindex/search/search_driver.h b/src/storage/invertedindex/search/search_driver.h index e4cdfbf976..03a941ae23 100644 --- a/src/storage/invertedindex/search/search_driver.h +++ b/src/storage/invertedindex/search/search_driver.h @@ -46,10 +46,7 @@ class SearchDriver { // used in SearchParser in ParseSingle. Assumes field and text are both unescaped. [[nodiscard]] std::unique_ptr - AnalyzeAndBuildQueryNode(const std::string &field, std::string &&text, bool from_quoted, unsigned long slop = 0) const; - - // helper function for building query tree, used in search_parser.y and AnalyzeAndBuildQueryNode - [[nodiscard]] std::unique_ptr GetMultiQueryNodeByOperatorOption() const; + AnalyzeAndBuildQueryNode(const std::string &field, const std::string &text, bool from_quoted, unsigned long slop = 0) const; [[nodiscard]] static std::string Unescape(const std::string &text); diff --git a/test/sql/dql/fulltext/fulltext_keyword.slt b/test/sql/dql/fulltext/fulltext_keyword.slt new file mode 100644 index 0000000000..f951282393 --- /dev/null +++ b/test/sql/dql/fulltext/fulltext_keyword.slt @@ -0,0 +1,44 @@ + +statement ok +DROP TABLE IF EXISTS ft_keyword; + +statement ok +CREATE TABLE ft_keyword(num int, doc varchar); + +# copy data from csv file +statement ok +COPY ft_keyword FROM '/var/infinity/test_data/fulltext_delete.csv' WITH ( DELIMITER '\t', FORMAT CSV ); + +statement ok +INSERT INTO ft_keyword VALUES (4, 'another second text xxx'), (5, 'another multiple'); + +statement ok +CREATE INDEX ft_index ON ft_keyword(doc) USING FULLTEXT WITH (analyzer = keyword); + +query I +SELECT * FROM ft_keyword; +---- +1 first text +2 second text multiple +3 third text many words +4 another second text xxx +5 another multiple + +query I +SELECT *, SCORE() FROM ft_keyword SEARCH MATCH TEXT ('doc', 'second text', 'topn=10'); +---- +1 first text 1.000000 +2 second text multiple 1.000000 +3 third text many words 1.000000 +4 another second text xxx 1.000000 + +query I +SELECT *, SCORE() FROM ft_keyword SEARCH MATCH TEXT ('doc^4.5', 'multiple another', 'topn=10'); +---- +2 second text multiple 4.500000 +4 another second text xxx 4.500000 +5 another multiple 4.500000 + +# Clean up +statement ok +DROP TABLE ft_keyword;