Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement prefix completion the new way #283

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions keyvi/include/keyvi/dictionary/dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "keyvi/dictionary/match_iterator.h"
#include "keyvi/dictionary/matching/fuzzy_matching.h"
#include "keyvi/dictionary/matching/near_matching.h"
#include "keyvi/dictionary/matching/prefix_completion_matching.h"

// #define ENABLE_TRACING
#include "keyvi/dictionary/util/trace.h"
Expand Down Expand Up @@ -324,6 +325,14 @@ class Dictionary final {
return MatchIterator::MakeIteratorPair(func, data->FirstMatch());
}

/**
 * Complete a prefix: iterate over the keys of this dictionary that start with `query`.
 *
 * The matcher is kept alive by the generator closure handed to the iterator,
 * so it lives as long as any copy of that closure does.
 *
 * @param query the prefix to complete
 * @return iterator pair seeded with the matcher's first match and advanced via NextMatch()
 */
MatchIterator::MatchIteratorPair GetPrefixCompletion(const std::string& query) const {
  using matcher_t = matching::PrefixCompletionMatching<>;

  // shared ownership: the closure below captures the matcher by value (shared_ptr copy)
  std::shared_ptr<matcher_t> matcher = std::make_shared<matcher_t>(matcher_t::FromSingleFsa(fsa_, query));

  auto next_match = [matcher]() { return matcher->NextMatch(); };
  return MatchIterator::MakeIteratorPair(next_match, matcher->FirstMatch());
}

/**
 * Get the manifest of the underlying fsa.
 *
 * @return the value of fsa_->GetManifest()
 */
std::string GetManifest() const {
  return fsa_->GetManifest();
}

private:
Expand Down
223 changes: 223 additions & 0 deletions keyvi/include/keyvi/dictionary/matching/prefix_completion_matching.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
/* keyvi - A key value store.
*
* Copyright 2024 Hendrik Muhs<hendrik.muhs@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* prefix_completion_matching.h
*/

#ifndef KEYVI_DICTIONARY_MATCHING_PREFIX_COMPLETION_MATCHING_H_
#define KEYVI_DICTIONARY_MATCHING_PREFIX_COMPLETION_MATCHING_H_

#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "keyvi/dictionary/fsa/automata.h"
#include "keyvi/dictionary/fsa/codepoint_state_traverser.h"
#include "keyvi/dictionary/fsa/traverser_types.h"
#include "keyvi/dictionary/fsa/zip_state_traverser.h"
#include "keyvi/dictionary/match.h"
#include "keyvi/dictionary/util/utf8_utils.h"
#include "keyvi/stringdistance/levenshtein.h"
#include "utf8.h"

// #define ENABLE_TRACING
#include "keyvi/dictionary/util/trace.h"

namespace keyvi {
// Forward declarations of the index-side filter helpers. They are declared here
// (without definitions) so that PrefixCompletionMatching can name them in its
// friend declarations further down in this header.
namespace index {
namespace internal {
template <class MatcherT, class DeletedT>
keyvi::dictionary::Match NextFilteredMatchSingle(const MatcherT&, const DeletedT&);
template <class MatcherT, class DeletedT>
keyvi::dictionary::Match NextFilteredMatch(const MatcherT&, const DeletedT&);
} // namespace internal
} // namespace index
namespace dictionary {
namespace matching {

/**
 * Prefix completion matcher.
 *
 * Walks the query byte by byte through one or more fsas and, if the whole
 * query can be followed, enumerates every final state below the reached
 * state. The exact match for the query itself (if the reached state is final)
 * is exposed through FirstMatch(); all further completions are produced one
 * at a time by NextMatch(), in the order the inner traverser visits them.
 *
 * A default-constructed ("empty") matcher has no traverser and yields only
 * default-constructed matches.
 *
 * @tparam innerTraverserType traverser used to enumerate the states below the prefix
 */
template <class innerTraverserType = fsa::WeightedStateTraverser>
class PrefixCompletionMatching final {
 public:
  /**
   * Create a prefix completer from a single Fsa
   *
   * @param fsa the fsa
   * @param query the query
   */
  static PrefixCompletionMatching FromSingleFsa(const fsa::automata_t& fsa, const std::string& query) {
    return FromSingleFsa(fsa, fsa->GetStartState(), query);
  }

  /**
   * Create a prefix completer from a single Fsa
   *
   * @param fsa the fsa
   * @param start_state the state to start from
   * @param query the query
   * @return the matcher; an empty matcher if start_state is 0 or the query
   *         cannot be followed completely
   */
  static PrefixCompletionMatching FromSingleFsa(const fsa::automata_t& fsa, const uint64_t start_state,
                                                const std::string& query) {
    if (start_state == 0) {
      return PrefixCompletionMatching();
    }

    TRACE("start state %d", start_state);

    const uint64_t state = WalkPrefix(fsa, start_state, query);

    TRACE("state %d", state);

    // Decide on the reached state, not on the number of consumed bytes: when
    // the transition for the last query byte is missing, all bytes have been
    // consumed but the walk still ended in the invalid state 0.
    if (state == 0) {
      return PrefixCompletionMatching();
    }

    const size_t query_length = query.size();

    TRACE("matched prefix, length %d", query_length);

    // the query itself is a key: report it as the first match
    Match first_match;
    if (fsa->IsFinalState(state)) {
      first_match = Match(0, query_length, query, 0, fsa, fsa->GetStateValue(state));
    }

    std::unique_ptr<innerTraverserType> traverser(new innerTraverserType(fsa, state));

    TRACE("create matcher");
    return PrefixCompletionMatching(std::move(traverser), std::move(first_match), MakeTraversalStack(query),
                                    query_length);
  }

  /**
   * Create a prefix completer from multiple Fsas
   *
   * Only fsas in which the complete query can be followed take part in the
   * traversal. The first such fsa whose reached state is final provides the
   * first match.
   *
   * Note: the spelling of the method name is kept as is, so that existing
   * call sites keep compiling.
   *
   * @param fsas a vector of fsas
   * @param query the query
   * @return the matcher; an empty matcher if no fsa contains the prefix
   */
  static PrefixCompletionMatching FromMulipleFsas(const std::vector<fsa::automata_t>& fsas, const std::string& query) {
    const size_t query_length = query.size();
    std::vector<std::pair<fsa::automata_t, uint64_t>> fsa_start_state_pairs;

    for (const fsa::automata_t& fsa : fsas) {
      const uint64_t state = WalkPrefix(fsa, fsa->GetStartState(), query);

      // skip fsas that do not contain the prefix (state 0), including the case
      // where only the transition for the very last byte is missing
      if (state != 0) {
        fsa_start_state_pairs.emplace_back(fsa, state);
      }
    }

    if (fsa_start_state_pairs.empty()) {
      return PrefixCompletionMatching();
    }

    Match first_match;
    // check for a match given the exact prefix
    for (const auto& fsa_state : fsa_start_state_pairs) {
      if (fsa_state.first->IsFinalState(fsa_state.second)) {
        first_match =
            Match(0, query_length, query, 0, fsa_state.first, fsa_state.first->GetStateValue(fsa_state.second));
        break;
      }
    }

    std::unique_ptr<innerTraverserType> traverser(new innerTraverserType(fsa_start_state_pairs));

    return PrefixCompletionMatching(std::move(traverser), std::move(first_match), MakeTraversalStack(query),
                                    query_length);
  }

  /**
   * Get the match for the exact query.
   *
   * @return a copy of the match found at construction time; a
   *         default-constructed Match if the query itself is not a key or the
   *         matcher is empty
   */
  Match FirstMatch() const { return first_match_; }

  /**
   * Advance the traversal to the next final state and return it as a match.
   *
   * @return the next completion; a default-constructed Match once the
   *         traverser is exhausted or if the matcher is empty
   */
  Match NextMatch() {
    for (; traverser_ptr_ && *traverser_ptr_; (*traverser_ptr_)++) {
      // cut the stack back to the parent of the current state, then append the
      // label that leads to the current state
      traversal_stack_->resize(prefix_length_ + traverser_ptr_->GetDepth() - 1);
      traversal_stack_->push_back(traverser_ptr_->GetStateLabel());
      TRACE("Current depth %d (%d)", prefix_length_ + traverser_ptr_->GetDepth() - 1, traversal_stack_->size());

      if (traverser_ptr_->IsFinalState()) {
        std::string match_str = std::string(traversal_stack_->begin(), traversal_stack_->end());

        TRACE("found final state at depth %d %s", prefix_length_ + traverser_ptr_->GetDepth(), match_str.c_str());
        Match m(0, prefix_length_ + traverser_ptr_->GetDepth(), match_str, 0, traverser_ptr_->GetFsa(),
                traverser_ptr_->GetStateValue());

        // step past the reported state, so the next call resumes behind it
        (*traverser_ptr_)++;
        return m;
      }
    }

    return Match();
  }

 private:
  PrefixCompletionMatching(std::unique_ptr<innerTraverserType>&& traverser, Match&& first_match,
                           std::unique_ptr<std::vector<unsigned char>>&& traversal_stack, const size_t prefix_length)
      : traverser_ptr_(std::move(traverser)),
        first_match_(std::move(first_match)),
        traversal_stack_(std::move(traversal_stack)),
        prefix_length_(prefix_length) {}

  // empty matcher: no traverser, no traversal stack, default first match
  PrefixCompletionMatching() {}

  /**
   * Follow the query byte by byte, starting at the given state.
   *
   * @return the state reached after consuming the whole query, 0 if a
   *         transition is missing (or if the given state is already 0)
   */
  static uint64_t WalkPrefix(const fsa::automata_t& fsa, uint64_t state, const std::string& query) {
    const size_t query_length = query.size();

    for (size_t depth = 0; state != 0 && depth != query_length; ++depth) {
      state = fsa->TryWalkTransition(state, query[depth]);
    }

    return state;
  }

  /**
   * Create the traversal stack, pre-filled with the bytes of the query.
   */
  static std::unique_ptr<std::vector<unsigned char>> MakeTraversalStack(const std::string& query) {
    std::unique_ptr<std::vector<unsigned char>> traversal_stack(new std::vector<unsigned char>());
    traversal_stack->reserve(1024);
    traversal_stack->insert(traversal_stack->end(), query.begin(), query.end());
    return traversal_stack;
  }

 private:
  std::unique_ptr<innerTraverserType> traverser_ptr_;
  const Match first_match_;
  // bytes of the key currently visited: the query followed by the labels below it
  std::unique_ptr<std::vector<unsigned char>> traversal_stack_;
  const size_t prefix_length_ = 0;

  // reset method for the index in the special case the match is deleted
  template <class MatcherT, class DeletedT>
  friend Match index::internal::NextFilteredMatchSingle(const MatcherT&, const DeletedT&);
  template <class MatcherT, class DeletedT>
  friend Match index::internal::NextFilteredMatch(const MatcherT&, const DeletedT&);

  // intentionally a no-op for this matcher
  void ResetLastMatch() {}
};

} /* namespace matching */
} /* namespace dictionary */
} /* namespace keyvi */
#endif // KEYVI_DICTIONARY_MATCHING_PREFIX_COMPLETION_MATCHING_H_
Loading
Loading