-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implemented Ukkonens Approximate String Matching algorithm
- Loading branch information
0 parents
commit 03b3a1e
Showing
11 changed files
with
588 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
; top-most EditorConfig file | ||
root = true | ||
|
||
[*] | ||
charset = utf-8 | ||
|
||
trim_trailing_whitespace = true | ||
|
||
; Unix style line endings | ||
end_of_line = lf | ||
|
||
; Always end file on newline | ||
insert_final_newline = true | ||
|
||
; Indentation | ||
indent_style = space | ||
indent_size = 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
node_modules |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
save-prefix='' | ||
package-lock=false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
Copyright (c) 2017 Sune Simonsen <sune@we-knowhow.dk> | ||
|
||
Permission is hereby granted, free of charge, to any person | ||
obtaining a copy of this software and associated documentation | ||
files (the 'Software'), to deal in the Software without | ||
restriction, including without limitation the rights to use, copy, | ||
modify, merge, publish, distribute, sublicense, and/or sell copies | ||
of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be | ||
included in all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, | ||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Ukkonen - Approximate String Matching | ||
|
||
This project implements the [Approximate String Matching algorithm by Esko Ukkonen](https://www.sciencedirect.com/science/article/pii/S0019995885800462) extended with ideas from [An Extension of Ukkonen's Enhanced Dynamic Programming ASM Algorithm by Hal Berghel and David Roach](http://berghel.net/publications/asm/asm.pdf). | ||
|
||
Ukkonen's algorithm is very competitive with the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) and for longer strings it is much more performant than Levenshtein distance. | ||
|
||
In addition to being a competitive alternative to Levenshtein distance, Ukkonen's algorithm also allows you to provide a threshold for the distance which increases the performance even more for texts that are longer than the threshold. | ||
|
||
## Install | ||
|
||
```sh | ||
npm install --save ukkonen | ||
``` | ||
|
||
## Usage | ||
|
||
```js | ||
var ukkonen = require('ukkonen') | ||
|
||
assert.equal(ukkonen('Ukkonen', 'Levenshtein'), 8) | ||
``` | ||
|
||
## Platform support | ||
|
||
The library is ES5 and will work with any JavaScript bundler in the browser as well as Node versions with ES5 support. | ||
|
||
## Benchmark | ||
|
||
I have benchmarked the library against [the fastest Levenshtein distance implementation on NPM](https://github.com/sindresorhus/leven). | ||
|
||
``` | ||
Edit distance one word | ||
245,499 op/s » ukkonen | ||
502,333 op/s » leven | ||
Edit distance on sentence with small differences | ||
767,359 op/s » ukkonen | ||
139,628 op/s » leven | ||
Edit distance on paragraphs with small differences | ||
237,857 op/s » ukkonen | ||
2,670 op/s » leven | ||
Edit distance on longer texts with small differences | ||
112,547 op/s » ukkonen | ||
683 op/s » leven | ||
Edit distance on longer texts with many differences | ||
372 op/s » ukkonen | ||
416 op/s » leven | ||
Edit distance on longer texts with small differences and a threshold of 20 | ||
127,725 op/s » ukkonen | ||
678 op/s » leven | ||
Edit distance on longer texts with many differences and a threshold of 40 | ||
84,959 op/s » ukkonen | ||
425 op/s » leven | ||
``` | ||
|
||
## Acknowledgements | ||
|
||
Obviously, thanks go to the authors of the papers describing the algorithm: Esko Ukkonen, Hal Berghel and David Roach. | ||
|
||
I stole a lot of ideas from [Sindre Sorhus](https://github.com/sindresorhus)'s [leven](https://github.com/sindresorhus/leven) library and I also used it to test my implementation against. | ||
|
||
## License | ||
|
||
[MIT © Sune Simonsen](./LICENSE) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
/* globals bench suite */ | ||
'use strict'; | ||
|
||
var ukkonen = require('./'); | ||
var leven = require('leven'); | ||
|
||
// Fixture texts shared by several suites below. They are defined once so the
// suites that are meant to compare the same inputs cannot drift apart.
var LONG_TEXT = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Cras tellus sapien, rhoncus sed bibendum in, facilisis non urna. Cras non mattis tellus, nec facilisis nisi. Proin vel purus eros. Morbi ultrices egestas mi vitae laoreet. Ut feugiat est lorem, a rhoncus mi lacinia vel. Aenean et velit neque. Quisque accumsan mi ligula, eu placerat lorem elementum ac. Nunc congue, eros eu aliquam commodo, leo orci tristique nulla, eu tempus quam justo eu neque. Nulla purus elit, porttitor ut sollicitudin sed, dictum vel justo. Mauris orci nisi, lacinia dictum augue nec, condimentum suscipit metus. Etiam lacinia pretium luctus. Mauris nulla turpis, suscipit vitae lobortis quis, tempor sed ex. Sed elementum enim eget venenatis mollis. Etiam sed congue neque, id tristique ex. Duis vitae ipsum nec ligula vulputate ullamcorper. Phasellus fringilla odio turpis, eu condimentum turpis scelerisque quis.';

// LONG_TEXT with a handful of small edits (changed case, swapped words, a
// dropped word).
var LONG_TEXT_SMALL_EDITS = 'Lorem Ipsum dolor sit amet, consectetur elit adipiscing. Cras tellus sapien, rhoncus sed bibendum in, facilisis non urna. Cras non mattis tellus, nec facilisis nisi. Proin vel purus eros. Morbi ultrices egestas mi vitae laoreet. Ut feugiat est lorem, a rhoncus mi lacinia vel. Aenean et velit neque. Quisque accumsan mi ligula, placerat lorem elementum ac. Nunc congue, eros eu aliquam commodo, leo orci tristique nulla, eu tempus quam justo eu neque. Nulla purus elit, porttitor ut sollicitudin sed, dictum vel justo. Mauris orci nisi, lacinia dictum augue nec, condimentum suscipit metus. Etiam lacinia pretium luctus. Mauris nulla turpis, suscipit vitae lobortis quis, tempor sed ex. Sed elementum enim eget venenatis mollis. Etiam sed congue neque, id tristique ex. Duis Vitae ipsum nec ligula vulputate ullamcorper. Phasellus fringilla odio turpis, eu condimentum turpis scelerisque quis.';

// A text that has little in common with LONG_TEXT.
var UNRELATED_TEXT = 'Curabitur fringilla eros lacus, et placerat magna pretium in. Suspendisse ut egestas dui. Nam quis sapien eget enim interdum interdum. Phasellus metus ligula, lacinia at tellus eu, iaculis blandit libero. Proin risus sem, ornare a orci et, aliquam rutrum elit. Aenean ac posuere justo, a maximus orci. In molestie nibh quis libero elementum, vel pellentesque metus volutpat. Maecenas non quam felis. Proin congue aliquet mauris laoreet viverra. Fusce auctor sapien a neque varius pellentesque. Nam ut sem neque. Pellentesque bibendum aliquet consectetur. Nam finibus diam non vestibulum maximus. Integer aliquet mattis elit, vitae vehicula erat pulvinar at. Ut placerat viverra aliquam. Nulla vehicula hendrerit justo.';

// Inputs of the "threshold of 40" suite: a shortened variant of LONG_TEXT and
// UNRELATED_TEXT with one extra sentence. Built once here so no string
// concatenation happens inside the measured function.
var SHORTENED_LONG_TEXT = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Cras tellus sapien, rhoncus sed bibendum in, facilisis non urna. Cras non mattis tellus, nec facilisis nisi. Proin vel purus eros. Morbi ultrices egestas mi vitae laoreet. Ut feugiat est lorem, a rhoncus mi lacinia vel. Aenean et velit neque. Quisque accumsan mi ligula, eu placerat lorem elementum ac. Nunc congue, eros eu aliquam commodo, leo orci tristique nulla, eu tempus quam justo eu neque. Nulla purus elit, porttitor ut sollicitudin sed, dictum vel justo. Mauris orci nisi, lacinia dictum augue nec, condimentum suscipit metus. Sed elementum enim eget venenatis mollis. Etiam sed congue neque, id tristique ex. Duis vitae ipsum nec ligula vulputate ullamcorper. Phasellus fringilla odio turpis, eu condimentum turpis scelerisque quis.';
var EXTENDED_UNRELATED_TEXT = UNRELATED_TEXT + ' Contrary to popular belief, Lorem Ipsum is not simply random text.';

/**
 * Registers one benchmark per implementation ('ukkonen' first, then 'leven')
 * in the suite that is currently being defined.
 *
 * @param {function(Function): void} run - Workload; it receives the distance
 *   function to measure and calls it with the suite's inputs.
 */
function benchBoth(run) {
  bench('ukkonen', function () {
    run(ukkonen);
  });

  bench('leven', function () {
    run(leven);
  });
}

suite('Edit distance one word', function () {
  benchBoth(function (fn) {
    fn('a', 'b');
    fn('ab', 'ac');
    fn('ac', 'bc');
    fn('abc', 'axc');
    fn('kitten', 'sitting');
    fn('xabxcdxxefxgx', '1ab2cd34ef5g6');
    fn('cat', 'cow');
    fn('xabxcdxxefxgx', 'abcdefg');
    fn('javawasneat', 'scalaisgreat');
    fn('example', 'samples');
    fn('sturgeon', 'urgently');
    fn('levenshtein', 'frankenstein');
    fn('distance', 'difference');
    fn('因為我是中國人所以我會說中文', '因為我是英國人所以我會說英文');
  });
});

suite('Edit distance on sentence with small differences', function () {
  benchBoth(function (fn) {
    fn(
      'Lorem Ipsum is simply dummy text of the printing and typesetting industry.',
      'Lorem Ipsum is simply clever text of the printing and typesetting industries.'
    );
  });
});

suite('Edit distance on paragraphs with small differences', function () {
  benchBoth(function (fn) {
    fn(
      'Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industrys standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.',
      'Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industrys standard dummy text ever since the 1600s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1970s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker excluding versions of Lorem Ipsum.'
    );
  });
});

suite('Edit distance on longer texts with small differences', function () {
  benchBoth(function (fn) {
    fn(LONG_TEXT, LONG_TEXT_SMALL_EDITS);
  });
});

suite('Edit distance on longer texts with many differences', function () {
  benchBoth(function (fn) {
    fn(LONG_TEXT, UNRELATED_TEXT);
  });
});

// The title states the threshold that is actually passed (20).
// NOTE(review): the threshold is handed to both implementations; whether
// `leven` honours a third argument is not visible from this file.
suite('Edit distance on longer texts with small differences and a threshold of 20', function () {
  benchBoth(function (fn) {
    fn(LONG_TEXT, LONG_TEXT_SMALL_EDITS, 20);
  });
});

suite('Edit distance on longer texts with many differences and a threshold of 40', function () {
  benchBoth(function (fn) {
    fn(SHORTENED_LONG_TEXT, EXTENDED_UNRELATED_TEXT, 40);
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
/**
 * Computes the edit distance between two strings with Ukkonen's banded
 * approximate string matching scheme: instead of filling a full distance
 * matrix it tracks, per diagonal, how far into the shorter string a given
 * number of edits can reach.
 *
 * Strings are compared by UTF-16 code unit (`charCodeAt`).
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @param {number} [threshold] - Optional upper bound for the result. When
 *   given, the result is capped at this bound, which lets the search stop
 *   early. Fractional bounds are rounded up so that a result below the bound
 *   always is a real distance; negative bounds are treated as 0; a
 *   non-number or NaN means "no bound".
 * @returns {number} The edit distance, capped at the (normalized) threshold.
 */
function ukkonen(a, b, threshold) {
  if (a === b) {
    return 0;
  }

  // Normalize the bound to a non-negative integer (or Infinity). The main
  // loop counts whole edits, so a fractional bound would otherwise make it
  // stop one edit too early and report a value below the bound.
  var limit = Infinity;
  if (typeof threshold === "number" && !isNaN(threshold)) {
    limit = Math.max(0, Math.ceil(threshold));
  }

  // Make `shorter` the string with fewer code units; the distance is symmetric.
  var shorter = a;
  var longer = b;
  if (a.length > b.length) {
    shorter = b;
    longer = a;
  }

  var aLen = shorter.length;
  var bLen = longer.length;

  // Suffix trimming: a suffix common to both strings does not add to the
  // distance, so it is dropped.
  while (aLen > 0 && shorter.charCodeAt(aLen - 1) === longer.charCodeAt(bLen - 1)) {
    aLen--;
    bLen--;
  }

  if (aLen === 0) {
    return bLen < limit ? bLen : limit;
  }

  // Prefix trimming: same argument for a common prefix.
  var tStart = 0;
  while (tStart < aLen && shorter.charCodeAt(tStart) === longer.charCodeAt(tStart)) {
    tStart++;
  }

  aLen -= tStart;
  bLen -= tStart;

  if (aLen === 0) {
    return bLen < limit ? bLen : limit;
  }

  // The distance can never exceed the length of the longer remainder.
  limit = bLen < limit ? bLen : limit;

  var dLen = bLen - aLen;

  // At least `dLen` edits are needed just to even out the lengths.
  if (limit < dLen) {
    return limit;
  }

  // floor(min(limit, aLen) / 2) + 2: offset that maps diagonal k to array
  // index k + ZERO_K.
  var ZERO_K = ((aLen < limit ? aLen : limit) >> 1) + 2;

  var arrayLength = dLen + ZERO_K * 2 + 2;
  var currentRow = new Array(arrayLength);
  var nextRow = new Array(arrayLength);
  var n;
  for (n = 0; n < arrayLength; n++) {
    currentRow[n] = -Infinity;
    nextRow[n] = -Infinity;
  }

  // Cache the code units of the trimmed strings for the inner loop.
  var aCharCodes = new Array(aLen);
  var bCharCodes = new Array(bLen);
  for (n = 0; n < aLen; n++) {
    aCharCodes[n] = shorter.charCodeAt(tStart + n);
  }
  for (n = 0; n < bLen; n++) {
    bCharCodes[n] = longer.charCodeAt(tStart + n);
  }

  // Row `row` holds, for each diagonal k, the furthest index into
  // `aCharCodes` reached with `row - 1` edits.
  var row = 0;
  var conditionRow = dLen + ZERO_K; // array index of diagonal dLen
  var endMax = conditionRow << 1;
  var swap;
  var start;
  var end;
  var k;
  var rowIndex;
  var previousCell;
  var currentCell;
  var nextCell;
  var t;

  do {
    row++;

    swap = currentRow;
    currentRow = nextRow;
    nextRow = swap;

    currentCell = -Infinity;

    if (row <= ZERO_K) {
      start = -row + 1;
      nextCell = row - 2;
    } else {
      start = row - (ZERO_K << 1) + 1;
      nextCell = currentRow[ZERO_K + start];
    }

    if (row <= conditionRow) {
      end = row;
      nextRow[ZERO_K + row] = -1;
    } else {
      end = endMax - row;
    }

    for (k = start, rowIndex = start + ZERO_K; k < end; k++, rowIndex++) {
      previousCell = currentCell;
      currentCell = nextCell;
      nextCell = currentRow[rowIndex + 1];

      // max(currentCell + 1, previousCell, nextCell + 1)
      t = currentCell + 1;
      t = t < previousCell ? previousCell : t;
      t = t < nextCell + 1 ? nextCell + 1 : t;

      // Slide along diagonal k for free while the code units match.
      while (t < aLen && t + k < bLen && aCharCodes[t] === bCharCodes[t + k]) {
        t++;
      }

      nextRow[rowIndex] = t;
    }
    // Stop once diagonal dLen covers all of the shorter string, or once the
    // number of edits tried exceeds the bound.
  } while (nextRow[conditionRow] < aLen && row <= limit);

  return row - 1;
}
|
||
module.exports = ukkonen; |
Oops, something went wrong.