diff --git a/doc/aho_corasick.qbk b/doc/aho_corasick.qbk new file mode 100644 index 000000000..a7c0e83cb --- /dev/null +++ b/doc/aho_corasick.qbk @@ -0,0 +1,143 @@ +[/ QuickBook Document version 1.5 ] + +[section:AhoCorasick Aho-Corasick Search] + +[/license + +Copyright (c) 2016 Alexander Zaitsev + +Distributed under the Boost Software License, Version 1.0. +(See accompanying file LICENSE_1_0.txt or copy at +http://www.boost.org/LICENSE_1_0.txt) +] + + +[heading Overview] + +The header file 'aho_corasick.hpp' contains an implementation of the Aho-Corasick algorithm for searching sequences of values. It is primarily used to search for multiple patterns within a corpus. + +The Aho-Corasick algorithm works by building a trie (a tree with each node corresponding to an object) of the pattern sequences and traversing the trie to search for the patterns in a given corpus sequence. Additionally, the Aho-Corasick algorithm introduces the concept of a "failure pointer/failure node", which is the node to move to when there is a mismatch. + +The algorithm was conceived in 1975 by Alfred V. Aho and Margaret J. Corasick. Their paper "Efficient string matching: An aid to bibliographic search" was published in the Communications of the ACM. + +Nomenclature: The nomenclature is similar to that of the Knuth-Morris-Pratt implementation in Boost.Algorithm. The sequence being searched for is referred to as the "pattern", and the sequence being searched in is referred to as the "corpus". + +See more in "Set Matching and Aho–Corasick Algorithm", lecture slides by Pekka Kilpeläinen (http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf). + +[heading Interface] + +For flexibility, the Aho-Corasick algorithm has two interfaces: an object-based interface and a procedural one. The object-based interface builds the trie in the constructor, and uses 'find()' to build the suffix links and perform the search. 
The procedural interface builds the trie (including the suffix links) and does the search all in one step. If you are going to be searching for the same patterns in multiple corpora, then you should use the object interface and build the trie only once. + +The header file 'aho_corasick.hpp' contains two versions of Aho-Corasick: one based on std::map and one based on std::unordered_map. There is also a class AhoCorasick, which you can customize. For every version this header file provides procedural and object-based interfaces. + +Procedural interfaces: + +The procedural interfaces are based on iterators. + +For Aho-Corasick based on std::map: + +`` +template , typename RAIterator, + typename ForwardIterator, typename Callback> +bool aho_corasick_map ( RAIterator corpus_first, RAIterator corpus_last, + ForwardIterator pat_first, ForwardIterator pat_last, + Callback cb); +`` + +For Aho-Corasick based on std::unordered_map: +`` +template , typename Comp = std::equal_to, typename RAIterator, + typename ForwardIterator, typename Callback> +bool aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, + ForwardIterator pat_first, ForwardIterator pat_last, + Callback cb); +`` + + + +Object interface (typedefs): +`` +template > +using Aho_Corasick_Map = AhoCorasick; + +template , typename Comp = std::equal_to> +using Aho_Corasick_HashMap = AhoCorasick; +`` + +The interface (constructors, etc.) is the same for Aho_Corasick_Map, Aho_Corasick_HashMap and the basic AhoCorasick: +`` +AhoCorasick(); + +template +explicit AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd); + +template +explicit AhoCorasick(const Range& range); + +template +void insert(ForwardIterator begin, ForwardIterator end); + +template +void insert(const Range& range); + +template +bool find(RAIterator begin, RAIterator end, Callback cb); +`` + +[heading Return value] + +The 'find' method returns true if every call to Callback returns true; otherwise it returns false. 
+ +[heading Requirements] + +A C++11-compatible compiler is required. + +For Aho_Corasick_HashMap and aho_corasick_hashmap: by default, std::hash is used as Hash and std::equal_to as Comparator. If your type does not support them, you must supply your own functions. Without a Hash and a Comparator the algorithm does not work. + +For Aho_Corasick_Map and aho_corasick_map: by default, std::less is used as Predicate. If your type does not support it, you must supply your own function. Without a Predicate the algorithm does not work. + +[heading Performance] + +The performance of Aho_Corasick_Map and Aho_Corasick_HashMap is similar on small alphabets. On large alphabets Aho_Corasick_HashMap is faster than Aho_Corasick_Map. Remember that computing the hash of an element is a slow operation. Also, if you use Aho_Corasick_HashMap, std::unordered_map can sometimes rehash, which costs O(Alphabet). + +[heading Memory Use] + +Every node of the trie consists of a container of your choice (std::map, std::unordered_map or maybe something else) of std::shared_ptr to trie nodes, two std::shared_ptr to trie nodes, and a std::vector holding the lengths of the patterns that end in this node. The number of nodes is linear in the sum of the lengths of the patterns. + +[heading Complexity] + +Nomenclature: M - sum of the pattern lengths, N - length of the corpus, K - alphabet size, T - number of matches + +std::unordered_map-based version: +Time: O(M + N + T), Memory: O(M) +std::map-based version: +Time: O((M + N)log(K) + T), Memory: O(M). + +[heading Exception Safety] + +Both the object-oriented and procedural versions of the Aho-Corasick algorithm take all their parameters by value (except the output container, which is taken by non-const reference). Therefore, both interfaces provide the strong exception guarantee. + +[heading Notes] + +* When using the object-based interface, the pattern must remain unchanged while it is being inserted. + +* The Aho-Corasick algorithm requires forward iterators for patterns and random-access iterators for the corpus. 
+ +[heading Customization points] + +To use the Aho-Corasick algorithms you must supply your own Callback(RAIterator, RAIterator) -> bool. This Callback must return true if all is fine, otherwise false. + +In Aho_Corasick_HashMap and aho_corasick_hashmap() you can customize: value type, hash and compare functions. + +In Aho_Corasick_Map and aho_corasick_map() you can customize: value type and predicate. + +In AhoCorasick you can customize: value type, type of container and any other template parameters. This container will be used in the nodes of the trie. The container is defined as: Container, Args...>. So your other template parameters will be used as Args... . Also, your container must support the 'find' method. + +[endsect] + +[/ File aho_corasick.qbk +Copyright 2016 Alexander Zaitsev +Distributed under the Boost Software License, Version 1.0. +(See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt). +] + diff --git a/doc/algorithm.qbk b/doc/algorithm.qbk index 1568fb50e..014740ec8 100644 --- a/doc/algorithm.qbk +++ b/doc/algorithm.qbk @@ -41,6 +41,7 @@ Thanks to all the people who have reviewed this library and made suggestions for [section:Searching Searching Algorithms] +[include aho_corasick.qbk] [include boyer_moore.qbk] [include boyer_moore_horspool.qbk] [include knuth_morris_pratt.qbk] diff --git a/example/Jamfile.v2 b/example/Jamfile.v2 index ce067cfeb..b1d937d8f 100644 --- a/example/Jamfile.v2 +++ b/example/Jamfile.v2 @@ -20,5 +20,6 @@ project /boost/algorithm/example exe clamp_example : clamp_example.cpp ; exe search_example : search_example.cpp ; -exe is_palindrome_example : is_palindrome_example.cpp; +exe is_palindrome_example : is_palindrome_example.cpp ; +exe aho_corasick_example : aho_corasick_example.cpp ; diff --git a/example/aho_corasick_example.cpp b/example/aho_corasick_example.cpp new file mode 100644 index 000000000..d2bc6e283 --- /dev/null +++ b/example/aho_corasick_example.cpp @@ -0,0 +1,41 @@ +/* + Copyright (c) 
Alexander Zaitsev , 2016 + + Distributed under the Boost Software License, Version 1.0. (See accompanying + file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + + For more information, see http://www.boost.org +*/ + +#include +#include +#include + +#include + + +/* Example: searches the corpus "hisher" for all patterns in pat, prints the flag returned by the search, then prints every reported match on its own line. NOTE(review): aho_corasick_map is not declared in the aho_corasick.hpp added by this same diff (that header declares aho_corasick_search) - confirm which name is intended. */ +int main() +{ + std::vector pat({"228", "he", "is", "1488", "she", "his", "322", "her", + "h", "hishera", "azaza"}); + std::string corp = "hisher"; + std::vector> out; + + /* The callback stores each reported iterator pair in out and always returns true. */ + bool result = boost::algorithm::aho_corasick_map(corp.begin(), corp.end(), pat.begin(), pat.end(), + [&out](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { out.push_back({begin, end}); return true; }); + + std::cout << result << std::endl; + for(const auto& val: out) + { + auto begin = val.first; + auto end = val.second; + while (begin != end) + { + std::cout << *begin; + ++begin; + } + std::cout << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/include/boost/algorithm/is_palindrome.hpp b/include/boost/algorithm/is_palindrome.hpp index 61acbae2e..217657013 100644 --- a/include/boost/algorithm/is_palindrome.hpp +++ b/include/boost/algorithm/is_palindrome.hpp @@ -61,7 +61,7 @@ bool is_palindrome(BidirectionalIterator begin, BidirectionalIterator end, Predi /// \return true if the entire sequence is palindrome /// /// \param begin The start of the input sequence -/// \param end One past the end of the input sequence +/// \param end One past the end of the input sequence /// /// \note This function will return true for empty sequences and for palindromes. /// For other sequences function will return false. diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp new file mode 100644 index 000000000..fb0b7ef19 --- /dev/null +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -0,0 +1,258 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. 
(See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_AHO_CORASICK_HPP +#define BOOST_ALGORITHM_AHO_CORASICK_HPP + +#include +#include +#include + +#include +#include +#include + + +namespace boost { namespace algorithm { + +/* Trie-based multi-pattern matcher. T is the symbol type; every node keeps its outgoing edges in a Container looked up by symbol. Patterns are added with insert(); find() walks a corpus and reports each match through a callback. NOTE(review): nodes hold raw pointers to root and to each other and no copy or move operations are declared, so a copied object would still point into the source object's trie - confirm whether copying should be disabled. */ +template class Container, typename ...Args> +class aho_corasick_base +{ +private: + class node + { + public: + /* links: child nodes by symbol; fail: node to fall back to when no edge matches; term: next node reachable through fail that ends a pattern (assigned in init); pat: lengths of the patterns that end at this node. */ + Container links; + node *fail, *term; + std::vector pat; + + node(node* fail_node = nullptr) + : fail(fail_node), term(nullptr) + { } + + node* getLink(const T& c) + { + auto iter = links.find(c); + return iter != links.end() ? &iter->second : nullptr; + } + + bool isTerminal() const + { + return !pat.empty(); + } + }; +public: + using value_type = T; + using node_type = node; +private: + node_type root; + node_type* current_state; + bool isInited = false; +public: + aho_corasick_base(){} + + /* Inserts every element of [patBegin, patEnd) as one pattern; each element is passed to insert(const Range&), so it must itself be a range of symbols. */ + template + explicit aho_corasick_base(ForwardIterator patBegin, ForwardIterator patEnd) + { + while(patBegin != patEnd) + { + insert(*patBegin); + ++patBegin; + } + } + + template + explicit aho_corasick_base(const Range& range) : aho_corasick_base(boost::begin(range), boost::end(range)) {} + + /// \fn insert(const Range& range) + /// \brief Insert pattern in trie + /// + /// \param range The pattern range + /// + template + void insert(const Range& range) + { + insert(boost::begin(range), boost::end(range)); + } + + /// \fn insert(ForwardIterator begin, ForwardIterator end) + /// \brief Insert pattern in trie + /// + /// \param begin The start of the pattern + /// \param end One past the end of the pattern + /// + template + void insert(ForwardIterator begin, ForwardIterator end) + { + isInited = false; + size_t patLen = 0; + node_type* current_node = &root; + for(auto it = begin; it != end; ++it) + { + ++patLen; + node_type* child_node = current_node->getLink(*it); + if (!child_node) + { + /* new nodes start with fail = root; init() refines it before the next search */ + current_node->links[*it] = node(&root); + 
child_node = &current_node->links[*it]; + } + current_node = child_node; + } + current_node->pat.push_back(patLen); + } + + /// \fn find ( const Range& range, Callback cb) + /// \brief Searches patterns in the corpus + /// \return true if all callback callings return true, else false + /// + /// \param range The range of the data to search + /// \param cb Callback for matches + /// + template + bool find(const Range& range, Callback cb) + { + return find(boost::begin(range), boost::end(range), cb); + } + + /// \fn find ( RAIterator begin, RAIterator end, Callback cb) + /// \brief Searches patterns in the corpus + /// \return true if all callback callings return true, else false + /// + /// \param begin The start of the data to search (Random Access Iterator) + /// \param end One past the end of the data to search (Random Access Iterator) + /// \param cb Callback for matches + /// + template + bool find(RAIterator begin, RAIterator end, Callback cb) + { + if(!isInited) + { + init(); + } + current_state = &root; + for(auto it = begin; it != end; ++it) + { + step(*it); + if(!getTermsForCurrentState(it, cb)) + { + return false; + } + } + return true; + } +private: + /* Breadth-first walk over the trie that assigns fail and term to every non-root node and then sets isInited; find() calls it whenever isInited is false (insert() clears the flag). */ + void init() + { + std::queue q; + q.push(&root); + while (!q.empty()) + { + node_type* current_node = q.front(); + q.pop(); + for (auto iter = current_node->links.begin(); + iter != current_node->links.end(); ++iter) + { + const value_type& symbol = iter->first; + node_type* child = &iter->second; + + // Defining .fail for the childnode + node_type* temp_node = current_node->fail; + while (temp_node) + { + node_type* fail_candidate = temp_node->getLink(symbol); + if (fail_candidate) + { + child->fail = fail_candidate; + break; + } + temp_node = temp_node->fail; + } + + // Defining .term for the childnode using .term of current node + child->term = (child->fail == nullptr || child->fail->isTerminal()) ? 
child->fail : child->fail->term; + q.push(child); + } + } + isInited = true; + } + + /* Moves current_state along the edge for symbol c, following fail links until a node with such an edge is found; if the chain runs out, current_state is reset to root. */ + void step(const value_type& c) + { + while (current_state) + { + node_type* candidate = current_state->getLink(c); + if (candidate) + { + current_state = candidate; + return; + } + current_state = current_state->fail; + } + current_state = &root; + } + + /* Invokes cb(pos + 1 - len, pos + 1) for each pattern length len stored at current_state and at every node on its term chain; returns false as soon as cb does, true otherwise. */ + template + bool getTermsForCurrentState(RAIterator pos, Callback cb) + { + if (current_state->isTerminal()) + { + for (const auto value : current_state->pat) + { + if(!cb(1 + pos - value, pos + 1)) + { + return false; + } + } + } + node_type* temp_node = current_state->term; + while (temp_node) + { + for (const auto value : temp_node->pat) + { + if(!cb(1 + pos - value, pos + 1)) + { + return false; + } + } + temp_node = temp_node->term; + } + return true; + } +}; + +//Object interface +template > +using aho_corasick = aho_corasick_base; + + +//Functional interface + +/// \fn aho_corasick_search ( RAIterator corpus_begin, RAIterator corpus_end, +/// ForwardIterator pat_begin, ForwardIterator pat_end, +/// Callback cb) +/// \return true if all callback callings return true, else false +/// +/// \param corpus_begin The start of the corpus sequence +/// \param corpus_end One past the end of the corpus sequence +/// \param pat_begin The start of the patterns sequence +/// \param pat_end One past the end of the patterns sequence +/// \param cb Callback for matches +/// +template , typename RAIterator, + typename ForwardIterator, typename Callback> +bool aho_corasick_search ( RAIterator corpus_begin, RAIterator corpus_end, + ForwardIterator pat_begin, ForwardIterator pat_end, + Callback cb) +{ + aho_corasick_base obj(pat_begin, pat_end); + return obj.find(corpus_begin, corpus_end, cb); +} + +}} + +#endif //BOOST_ALGORITHM_AHO_CORASICK_HPP diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index fb00843ad..8360cabdb 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -26,6 +26,7 @@ alias unit_test_framework [ compile-fail search_fail1.cpp : 
: : : ] [ compile-fail search_fail2.cpp : : : : ] [ compile-fail search_fail3.cpp : : : : ] + [ run aho_corasick_test.cpp unit_test_framework : : : : aho_corasick_test ] # Misc tests [ run clamp_test.cpp unit_test_framework : : : : clamp_test ] diff --git a/test/aho_corasick_test.cpp b/test/aho_corasick_test.cpp new file mode 100644 index 000000000..98c92cc59 --- /dev/null +++ b/test/aho_corasick_test.cpp @@ -0,0 +1,122 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#include +#include + +#define BOOST_TEST_MAIN +#include + +#include +#include +#include +#include +#include + + +namespace ba = boost::algorithm; +const std::vector> patterns({std::vector({"he", "is", "she", "his", "her", "h", "hishera", "azaza"}), + std::vector({"he", "she", "his", "her", "he", "usher", "d sh", "she hi"}), + std::vector({"he", "she", "his", "her", "he", "usher", "d sh", "she hi"})}); + +const std::vector corpus({"hisher", + "usher and she he her", + ""}); + +const std::vector> rightResults({std::vector({"h", "his", "is", "h", "she", "he", "her"}), + std::vector({"she", "he", "he", "usher", "her", "d sh", "she", + "he", "he", "he", "he", "he", "he", "her"}), + std::vector()}); +/* Converts each (first, second) iterator pair in from into a T built by appending the elements in between, and appends that T to to. */ +template +void fromIteratorsToContainer(const Cont1& from, Cont2& to) +{ + for (const auto &val: from) + { + T str; + auto begin = val.first; + auto end = val.second; + while (begin != end) + { + str += *begin; + ++begin; + } + to.push_back(std::move(str)); + } +} + +/* Runs pattern set i against corpus i through each interface variant and compares the reported matches, in order, with rightResults[i]. NOTE(review): the names used below (aho_corasick_map, aho_corasick_hashmap, Aho_Corasick_Map, Aho_Corasick_HashMap, AhoCorasick and the call operator on obj) do not appear in the aho_corasick.hpp added by this same diff, which declares aho_corasick_base, aho_corasick and aho_corasick_search with a find() member - confirm which API this test targets. */ +void test_aho_corasick() +{ + BOOST_CHECK(patterns.size() == corpus.size()); + //aho_corasick_map + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::aho_corasick_map(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), + [&res](std::string::const_iterator begin, 
std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); + fromIteratorsToContainer(res, localResult); + BOOST_CHECK(localResult == rightResults[i]); + } + + //aho_corasick_hashmap + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::aho_corasick_hashmap(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); + fromIteratorsToContainer(res, localResult); + BOOST_CHECK(localResult == rightResults[i]); + } + + //Aho_Corasick_Map + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::Aho_Corasick_Map obj(patterns[i].begin(), patterns[i].end()); + obj(corpus[i].begin(), corpus[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); + fromIteratorsToContainer(res, localResult); + BOOST_CHECK(localResult == rightResults[i]); + } + + //Aho_Corasick_HashMap + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::Aho_Corasick_HashMap obj(patterns[i].begin(), patterns[i].end()); + obj(corpus[i].begin(), corpus[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); + fromIteratorsToContainer(res, localResult); + BOOST_CHECK(localResult == rightResults[i]); + } + + //General AhoCorasick + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::AhoCorasick, std::equal_to> obj(patterns[i].begin(), patterns[i].end()); + obj(corpus[i].begin(), corpus[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); + fromIteratorsToContainer(res, 
localResult); + BOOST_CHECK(localResult == rightResults[i]); + } +} + +BOOST_AUTO_TEST_CASE( test_main ) +{ + test_aho_corasick(); +} \ No newline at end of file