diff --git a/AhoCorasick.Tests.cs b/AhoCorasick.Tests.cs index 3e92ad4..16c1ebe 100644 --- a/AhoCorasick.Tests.cs +++ b/AhoCorasick.Tests.cs @@ -1,89 +1,109 @@ -// Copyright (c) 2013 Pēteris Ņikiforovs -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -using System.Linq; - -using NUnit.Framework; - -namespace AhoCorasick -{ - public class Tests - { - [Test] - public void HelloWorld() - { - string text = "hello and welcome to this beautiful world!"; - - AhoCorasick.Trie trie = new AhoCorasick.Trie(); - trie.Add("hello"); - trie.Add("world"); - trie.Build(); - - string[] matches = trie.Find(text).ToArray(); - - Assert.AreEqual(2, matches.Length); - Assert.AreEqual("hello", matches[0]); - Assert.AreEqual("world", matches[1]); - } - - [Test] - public void Contains() - { - string text = "hello and welcome to this beautiful world!"; - - AhoCorasick.Trie trie = new AhoCorasick.Trie(); - trie.Add("hello"); - trie.Add("world"); - trie.Build(); - - Assert.IsTrue(trie.Find(text).Any()); - } - - [Test] - public void LineNumbers() - { - string text = "world, i hello you!"; - string[] words = new[] { "hello", "world" }; - - AhoCorasick.Trie trie = new AhoCorasick.Trie(); - for (int i = 0; i < words.Length; i++) - trie.Add(words[i], i); - trie.Build(); - - int[] lines = trie.Find(text).ToArray(); - - Assert.AreEqual(2, lines.Length); - Assert.AreEqual(1, lines[0]); - Assert.AreEqual(0, lines[1]); - } - - [Test] - public void Words() - { - string[] text = "one two three four".Split(' '); - - AhoCorasick.Trie trie = new AhoCorasick.Trie(); - trie.Add(new[] { "three", "four" }, true); - trie.Build(); - - Assert.IsTrue(trie.Find(text).Any()); - } - } -} +// Copyright (c) 2013 Pēteris Ņikiforovs +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in 
+// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +using System; +using System.Linq; + +using NUnit.Framework; + +namespace AhoCorasick +{ + public class Tests + { + [Test] + public void HelloWorld() + { + string text = "hello and welcome to this beautiful world!"; + + var trie = new AhoCorasick.Trie(); + trie.Add("hello"); + trie.Add("world"); + trie.Build(); + + var matches = trie.Find(text).ToArray(); + + Assert.AreEqual(2, matches.Length); + Assert.AreEqual(Tuple.Create("hello", 4), matches[0]); + Assert.AreEqual(Tuple.Create("world", 40), matches[1]); + } + + [Test] + public void Contains() + { + string text = "hello and welcome to this beautiful world!"; + + var trie = new AhoCorasick.Trie(); + trie.Add("hello"); + trie.Add("world"); + trie.Build(); + + Assert.IsTrue(trie.Find(text).Any()); + } + + [Test] + public void Ids() + { + string text = "hello and welcome to this beautiful world!"; + + var trie = new AhoCorasick.Trie(); + trie.Add("hello", 123); + trie.Add("world", 456); + + trie.Build(); + + var matches = trie.Find(text).ToArray(); + + Assert.AreEqual(2, matches.Length); + Assert.AreEqual(Tuple.Create(123, 4), matches[0]); + Assert.AreEqual(Tuple.Create(456, 40), matches[1]); + } + + [Test] + public void WordsAndIds() + { + string text = "hello and welcome to this beautiful world!"; + + var trie = new AhoCorasick.Trie>(); + + trie.Add("hello", Tuple.Create("hello", 123)); + trie.Add("world", Tuple.Create("world", 456)); + + trie.Build(); + + var matches = 
trie.Find(text).ToArray(); + + Assert.AreEqual(2, matches.Length); + Assert.AreEqual(Tuple.Create(Tuple.Create("hello", 123), 4), matches[0]); + Assert.AreEqual(Tuple.Create(Tuple.Create("world", 456), 40), matches[1]); + } + + [Test] + public void Words() + { + string[] text = "one two three four".Split(' '); + + var trie = new AhoCorasick.Trie(); + trie.Add(new[] { "three", "four" }, true); + trie.Build(); + + Assert.IsTrue(trie.Find(text).Any()); + } + } +} diff --git a/AhoCorasick.cs b/AhoCorasick.cs index e2e2705..0cd20b2 100644 --- a/AhoCorasick.cs +++ b/AhoCorasick.cs @@ -1,265 +1,254 @@ -// Copyright (c) 2013 Pēteris Ņikiforovs -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -using System.Collections; -using System.Collections.Generic; - -namespace AhoCorasick -{ - /// - /// Trie that will find and return strings found in a text. - /// - public class Trie : Trie - { - /// - /// Adds a string. - /// - /// The string to add. 
- public void Add(string s) - { - Add(s, s); - } - - /// - /// Adds multiple strings. - /// - /// The strings to add. - public void Add(IEnumerable strings) - { - foreach (string s in strings) - { - Add(s); - } - } - } - - /// - /// Trie that will find strings in a text and return values of type - /// for each string found. - /// - /// Value type. - public class Trie : Trie - { - } - - /// - /// Trie that will find strings or phrases and return values of type - /// for each string or phrase found. - /// - /// - /// will typically be a char for finding strings - /// or a string for finding phrases or whole words. - /// - /// The type of a letter in a word. - /// The type of the value that will be returned when the word is found. - public class Trie - { - /// - /// Root of the trie. It has no value and no parent. - /// - private readonly Node root = new Node(); - - /// - /// Adds a word to the tree. - /// - /// - /// A word consists of letters. A node is built for each letter. - /// If the letter type is char, then the word will be a string, since it consists of letters. - /// But a letter could also be a string which means that a node will be added - /// for each word and so the word is actually a phrase. - /// - /// The word that will be searched. - /// The value that will be returned when the word is found. - public void Add(IEnumerable word, TValue value) - { - // start at the root - var node = root; - - // build a branch for the word, one letter at a time - // if a letter node doesn't exist, add it - foreach (T c in word) - { - var child = node[c]; - - if (child == null) - child = node[c] = new Node(c, node); - - node = child; - } - - // mark the end of the branch - // by adding a value that will be returned when this word is found in a text - node.Values.Add(value); - } - - - /// - /// Constructs fail or fall links. 
- /// - public void Build() - { - // construction is done using breadth-first-search - var queue = new Queue>(); - queue.Enqueue(root); - - while (queue.Count > 0) - { - var node = queue.Dequeue(); - - // visit children - foreach (var child in node) - queue.Enqueue(child); - - // fail link of root is root - if (node == root) - { - root.Fail = root; - continue; - } - - var fail = node.Parent.Fail; - - while (fail[node.Word] == null && fail != root) - fail = fail.Fail; - - node.Fail = fail[node.Word] ?? root; - if (node.Fail == node) - node.Fail = root; - } - } - - /// - /// Finds all added words in a text. - /// - /// The text to search in. - /// The values that were added for the found words. - public IEnumerable Find(IEnumerable text) - { - var node = root; - - foreach (T c in text) - { - while (node[c] == null && node != root) - node = node.Fail; - - node = node[c] ?? root; - - for (var t = node; t != root; t = t.Fail) - { - foreach (TValue value in t.Values) - yield return value; - } - } - } - - /// - /// Node in a trie. - /// - /// The same as the parent type. - /// The same as the parent value type. - private class Node : IEnumerable> - { - private readonly TNode word; - private readonly Node parent; - private readonly Dictionary> children = new Dictionary>(); - private readonly List values = new List(); - - /// - /// Constructor for the root node. - /// - public Node() - { - } - - /// - /// Constructor for a node with a word - /// - /// - /// - public Node(TNode word, Node parent) - { - this.word = word; - this.parent = parent; - } - - /// - /// Word (or letter) for this node. - /// - public TNode Word - { - get { return word; } - } - - /// - /// Parent node. - /// - public Node Parent - { - get { return parent; } - } - - /// - /// Fail or fall node. - /// - public Node Fail - { - get; - set; - } - - /// - /// Children for this node. - /// - /// Child word. - /// Child node. - public Node this[TNode c] - { - get { return children.ContainsKey(c) ? 
children[c] : null; } - set { children[c] = value; } - } - - /// - /// Values for words that end at this node. - /// - public List Values - { - get { return values; } - } - - /// - public IEnumerator> GetEnumerator() - { - return children.Values.GetEnumerator(); - } - - /// - IEnumerator IEnumerable.GetEnumerator() - { - return GetEnumerator(); - } - - /// - public override string ToString() - { - return Word.ToString(); - } - } - } -} +// Copyright (c) 2013 Pēteris Ņikiforovs +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +using System; +using System.Collections; +using System.Collections.Generic; + +namespace AhoCorasick +{ + /// + /// Trie that will find and return strings found in a text. + /// + public class Trie : Trie + { + /// + /// Adds a string. + /// + /// The string to add. + public void Add(string s) + { + Add(s, s); + } + + /// + /// Adds multiple strings. + /// + /// The strings to add. 
+ public void Add(IEnumerable strings) + { + foreach (string s in strings) + { + Add(s); + } + } + } + + /// + /// Trie that will find strings in a text and return values of type + /// for each string found. + /// + /// Value type. + public class Trie : Trie + { + } + + /// + /// Trie that will find strings or phrases and return values of type + /// for each string or phrase found. + /// + /// + /// will typically be a char for finding strings + /// or a string for finding phrases or whole words. + /// + /// The type of a letter in a word. + /// The type of the value that will be returned when the word is found. + public class Trie + { + public int Count { get; private set; } = 0; + + /// + /// Root of the trie. It has no value and no parent. + /// + private readonly Node _root = new Node(); + + /// + /// Adds a word to the tree. + /// + /// + /// A word consists of letters. A node is built for each letter. + /// If the letter type is char, then the word will be a string, since it consists of letters. + /// But a letter could also be a string which means that a node will be added + /// for each word and so the word is actually a phrase. + /// + /// The word that will be searched. + /// The value that will be returned when the word is found. + public void Add(IEnumerable word, TValue value) + { + // start at the root + var node = _root; + + // build a branch for the word, one letter at a time + // if a letter node doesn't exist, add it + foreach (T c in word) + { + var child = node[c] ?? (node[c] = new Node(c, node)); + + node = child; + } + + // mark the end of the branch + // by adding a value that will be returned when this word is found in a text + node.Values.Add(value); + + ++Count; + } + + /// + /// Constructs fail or fall links. 
+ /// + public void Build() + { + // construction is done using breadth-first-search + var queue = new Queue>(); + queue.Enqueue(_root); + + while (queue.Count > 0) + { + var node = queue.Dequeue(); + + // visit children + foreach (var child in node) + queue.Enqueue(child); + + // fail link of root is root + if (node == _root) + { + _root.Fail = _root; + continue; + } + + var fail = node.Parent.Fail; + + while (fail[node.Word] == null && fail != _root) + fail = fail.Fail; + + node.Fail = fail[node.Word] ?? _root; + if (node.Fail == node) + node.Fail = _root; + } + } + + /// + /// Finds all added words in a text. + /// + /// The text to search in. + /// The values, endIndexs that were added for the found words. + public IEnumerable> Find(IEnumerable text) + { + var node = _root; + + int endIndex = 0; + foreach (T c in text) + { + while (node[c] == null && node != _root) + node = node.Fail; + + node = node[c] ?? _root; + + for (var t = node; t != _root; t = t.Fail) + { + foreach (TValue value in t.Values) + yield return new Tuple(value, endIndex); + } + + ++endIndex; + } + } + + /// + /// Node in a trie. + /// + /// The same as the parent type. + /// The same as the parent value type. + private class Node : IEnumerable> + { + private readonly Dictionary> _children = + new Dictionary>(); + + /// + /// Constructor for the root node. + /// + public Node() + { + } + + /// + /// Constructor for a node with a word + /// + /// + /// + public Node(TNode word, Node parent) + { + this.Word = word; + this.Parent = parent; + } + + /// + /// Word (or letter) for this node. + /// + public TNode Word { get; } + + /// + /// Parent node. + /// + public Node Parent { get; } + + /// + /// Fail or fall node. + /// + public Node Fail { get; set; } + + /// + /// Children for this node. + /// + /// Child word. + /// Child node. + public Node this[TNode c] + { + get { return _children.ContainsKey(c) ? 
_children[c] : null; } + set { _children[c] = value; } + } + + /// + /// Values for words that end at this node. + /// + public List Values { get; } = new List(); + + /// + public IEnumerator> GetEnumerator() + { + return _children.Values.GetEnumerator(); + } + + /// + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + /// + public override string ToString() + { + return Word.ToString(); + } + } + } +} diff --git a/README.md b/README.md index 1e86ca0..6c63305 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,85 @@ -Aho–Corasick string matching algorithm in C# -============================================ - -The [Aho–Corasick string matching algorithm](http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) is a string searching algorithm. It's useful in NLP when you have a dictionary with words and you need to tell if a text contains any of the words. - -```csharp -AhoCorasick.Trie trie = new AhoCorasick.Trie(); - -// add words -trie.Add("hello"); -trie.Add("world"); - -// build search tree -trie.Build(); - -string text = "hello and welcome to this beautiful world!"; - -// find words -foreach (string word in trie.Find(text)) { - Console.WriteLine(word); -} -``` - -You can associate other data with the words (like an ID or line number). - -```csharp -AhoCorasick.Trie trie = new AhoCorasick.Trie(); - -// add words -trie.Add("hello", 123); -trie.Add("world", 456); - -// build search tree -trie.Build(); - -// retrieve IDs -foreach (int id in trie.Find(text)) { - Console.WriteLine(id); -} -``` - -Use `IEnumerable.Any()` to check if the text contains a match without retrieving all of them. - -If you want to match whole words, you can use `Trie`. 
- -```csharp -string[] text = "hello world i say to you".Split(' '); - -AhoCorasick.Trie<string, bool> trie = new AhoCorasick.Trie<string, bool>(); -trie.Add("hello world".Split(' '), true); -trie.Build(); -bool containsHelloWorld = trie.Find(text).Any(); -``` - -License -------- - +Aho–Corasick string matching algorithm in C# +============================================ + +The [Aho–Corasick string matching algorithm](http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) is a string searching algorithm. It's useful in NLP when you have a dictionary with words and you need to tell if a text contains any of the words. + +```csharp +AhoCorasick.Trie trie = new AhoCorasick.Trie(); + +// add words +trie.Add("hello"); +trie.Add("world"); + +// build search tree +trie.Build(); + +string text = "hello and welcome to this beautiful world!"; + +// find words and wordEndIndices +foreach (Tuple<string, int> tuple in trie.Find(text)) { + var word = tuple.Item1; + var wordEndIndex = tuple.Item2; + Console.WriteLine("{0}, {1}", word, wordEndIndex); +} +``` + +You can associate other data with the words (like an ID or line number). 
+ +```csharp +AhoCorasick.Trie<int> trie = new AhoCorasick.Trie<int>(); + +// add words +trie.Add("hello", 123); +trie.Add("world", 456); + +// build search tree +trie.Build(); + +// retrieve IDs and wordEndIndices +foreach (Tuple<int, int> tuple in trie.Find(text)) +{ + var id = tuple.Item1; + var wordEndIndex = tuple.Item2; + Console.WriteLine("{0}, {1}", id, wordEndIndex); +} +``` + +You can also retrieve the matched strings together with their associated data (like an ID or line number). + +```csharp +AhoCorasick.Trie<Tuple<string, int>> trie = new AhoCorasick.Trie<Tuple<string, int>>(); + +// add words +trie.Add("hello", new Tuple<string, int>("hello", 123)); +trie.Add("world", new Tuple<string, int>("world", 456)); + +// build search tree +trie.Build(); + +// find words, IDs and wordEndIndices +foreach (Tuple<Tuple<string, int>, int> tuple in trie.Find(text)) +{ + var word = tuple.Item1.Item1; + var id = tuple.Item1.Item2; + var wordEndIndex = tuple.Item2; + Console.WriteLine("{0}, {1}, {2}", word, id, wordEndIndex); +} +``` + +Use `IEnumerable.Any()` to check if the text contains a match without retrieving all of them. + +If you want to match whole words, you can use `Trie<string, TValue>`. + +```csharp +string[] text = "hello world i say to you".Split(' '); + +AhoCorasick.Trie<string, bool> trie = new AhoCorasick.Trie<string, bool>(); +trie.Add("hello world".Split(' '), true); +trie.Build(); +bool containsHelloWorld = trie.Find(text).Any(); +``` + +License +------- + MIT \ No newline at end of file