diff --git a/packages/orama/src/components/levenshtein.ts b/packages/orama/src/components/levenshtein.ts index cf3bf56d0..0ca7c1e12 100644 --- a/packages/orama/src/components/levenshtein.ts +++ b/packages/orama/src/components/levenshtein.ts @@ -23,6 +23,19 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { let lenA = a.length let lenB = b.length + // ignore common prefix + let startIdx = 0 + while (startIdx < lenA && a.charCodeAt(startIdx) === b.charCodeAt(startIdx)) { + startIdx++ + } + + // if string A is a prefix of B, we consider the distance 0 + // because we search for prefix! + // fix https://github.com/oramasearch/orama/issues/544 + if (startIdx === lenA) { + return 0 + } + // ignore common suffix // note: `~-` decreases by a unit in a bitwise fashion while (lenA > 0 && a.charCodeAt(~-lenA) === b.charCodeAt(~-lenB)) { @@ -35,17 +48,14 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { return lenB > tolerance ? -1 : lenB } - // ignore common prefix - let startIdx = 0 - while (startIdx < lenA && a.charCodeAt(startIdx) === b.charCodeAt(startIdx)) { - startIdx++ - } lenA -= startIdx lenB -= startIdx - // early return when the smallest string is empty - if (lenA === 0) { - return lenB > tolerance ? -1 : lenB + // If both strings are smaller than the tolerance, we accept any distance + // Probably the result distance is wrong, but we don't care: + // It is never greater than the tolerance! + if (lenA <= tolerance && lenB <= tolerance) { + return lenA > lenB ? 
lenA : lenB } const delta = lenB - lenA diff --git a/packages/orama/src/trees/radix.ts b/packages/orama/src/trees/radix.ts index 8b08ddb1f..139a48707 100644 --- a/packages/orama/src/trees/radix.ts +++ b/packages/orama/src/trees/radix.ts @@ -217,8 +217,7 @@ function _findLevenshtein( if (node.e) { const { w, d: docIDs } = node if (w) { - const difference = Math.abs(term.length - w.length) - if (difference <= originalTolerance && syncBoundedLevenshtein(term, w, originalTolerance).isBounded) { + if (syncBoundedLevenshtein(term, w, originalTolerance).isBounded) { output[w] = [] } if (getOwnProperty(output, w) != null && docIDs.length > 0) { @@ -268,6 +267,7 @@ export function find(root: Node, { term, exact, tolerance }: FindParams): FindRe if (tolerance && !exact) { const output: FindResult = {} tolerance = tolerance || 0 + _findLevenshtein(root, term, 0, tolerance || 0, tolerance, output) return output } else { diff --git a/packages/orama/tests/levenshtein.test.ts b/packages/orama/tests/levenshtein.test.ts index b5db10ae7..a9e2240cc 100644 --- a/packages/orama/tests/levenshtein.test.ts +++ b/packages/orama/tests/levenshtein.test.ts @@ -29,7 +29,7 @@ t.test('levenshtein', (t) => { }) t.test('boundedLevenshtein', (t) => { - t.plan(4) + t.plan(3) t.test('should be 0 when both inputs are empty', async (t) => { t.plan(2) @@ -39,31 +39,12 @@ t.test('boundedLevenshtein', (t) => { }) t.test('should be the max input length when either strings are empty', async (t) => { - t.plan(2) + t.plan(3) - t.match(await boundedLevenshtein('', 'some', 4), { distance: 4, isBounded: true }) - t.match(await boundedLevenshtein('body', '', 4), { distance: 4, isBounded: true }) - }) + t.match(await boundedLevenshtein('', 'some', 0), { distance: 0, isBounded: true }) - t.test('distance should be the same as levenshtein, when tolerance is high enough', async (t) => { - t.plan(5) - - const tol = 15 - - t.equal(levenshtein('aa', 'b'), (await boundedLevenshtein('aa', 'b', tol)).distance) - 
t.equal(levenshtein('b', 'aa'), (await boundedLevenshtein('bb', 'a', tol)).distance) - t.equal( - levenshtein('somebody once', 'told me'), - (await boundedLevenshtein('somebody once', 'told me', tol)).distance - ) - t.equal( - levenshtein('the world is gonna', 'roll me'), - (await boundedLevenshtein('the world is gonna', 'roll me', tol)).distance - ) - t.equal( - levenshtein('kaushuk chadhui', 'caushik chakrabar'), - (await boundedLevenshtein('kaushuk chadhui', 'caushik chakrabar', tol)).distance - ) + t.match(await boundedLevenshtein('', 'some', 4), { distance: 0, isBounded: true }) + t.match(await boundedLevenshtein('body', '', 4), { distance: 0, isBounded: true }) }) t.test('should tell whether the Levenshtein distance is upperbounded by a given tolerance', async (t) => { @@ -73,3 +54,65 @@ t.test('boundedLevenshtein', (t) => { t.match(await boundedLevenshtein('somebody once', 'told me', 8), { isBounded: false }) }) }) + +t.test('syncBoundedLevenshtein substrings are ok even if with tolerance pppppp', async (t) => { + t.match(await boundedLevenshtein('Dhris', 'Chris', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Chris', 1), { isBounded: true, distance: 1 }) + t.match(await boundedLevenshtein('Dhris', 'Cgris', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Cgris', 2), { isBounded: true, distance: 2 }) + t.match(await boundedLevenshtein('Dhris', 'Cgris', 3), { isBounded: true, distance: 2 }) + + t.match(await boundedLevenshtein('Dhris', 'Cris', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Cris', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Cris', 2), { isBounded: true, distance: 2 }) + + t.match(await boundedLevenshtein('Dhris', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Caig', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 
'Caig', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Caig', 3), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Caig', 4), { isBounded: true, distance: 4 }) + + t.match(await boundedLevenshtein('Chris', 'Chris', 0), { isBounded: true, distance: 0 }) + t.match(await boundedLevenshtein('Chris', 'Chris', 1), { isBounded: true, distance: 0 }) + t.match(await boundedLevenshtein('Chris', 'Chris', 2), { isBounded: true, distance: 0 }) + + t.match(await boundedLevenshtein('Chris', 'Cris', 0), { isBounded: false, distance: -1 }) + + t.match(await boundedLevenshtein('Chris', 'Cris', 1), { isBounded: true, distance: 1 }) + t.match(await boundedLevenshtein('Chris', 'Cris', 2), { isBounded: true, distance: 1 }) + + t.match(await boundedLevenshtein('Chris', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chris', 'Caig', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chris', 'Caig', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chris', 'Caig', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Craig', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Craig', 'Caig', 1), { isBounded: true, distance: 1 }) + t.match(await boundedLevenshtein('Craig', 'Caig', 2), { isBounded: true, distance: 1 }) + + t.match(await boundedLevenshtein('Chxy', 'Cris', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Cris', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Cris', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Cris', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Chxy', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Caig', 1), { isBounded: false, distance: -1 }) + t.match(await 
boundedLevenshtein('Chxy', 'Caig', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Caig', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Crxy', 'Cris', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Cris', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Cris', 2), { isBounded: true, distance: 2 }) + + t.match(await boundedLevenshtein('Crxy', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Caig', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Caig', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Caig', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Crxy', 'Caig', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Chris', 'Christopher', 0), { isBounded: true, distance: 0 }) + t.match(await boundedLevenshtein('Chris', 'Christopher', 1), { isBounded: true, distance: 0 }) + + t.end() +}) diff --git a/packages/orama/tests/search.test.ts b/packages/orama/tests/search.test.ts index 64d8124f4..a1191c47f 100644 --- a/packages/orama/tests/search.test.ts +++ b/packages/orama/tests/search.test.ts @@ -8,7 +8,7 @@ t.test('search method', (t) => { //https://github.com/oramasearch/orama/issues/480 //following testcase pass only if issue 480 is fixed. - t.test('should correctly match with tolerance . even if prefix doesnt match.', async (t) => { + t.test('should correctly match with tolerance. even if prefix doesnt match.', async (t) => { t.plan(5) const db = await create({ @@ -38,10 +38,10 @@ t.test('search method', (t) => { //issue 480 says following will not match because the prefix "Cr" exists so prefix Ch is not searched. const result4 = await search(db, { term: 'Cris', tolerance: 1 }) + t.equal(result4.count, 1) + //should match "Craig" even if prefix "Ca" exists. 
const result5 = await search(db, { term: 'Caig', tolerance: 1 }) - - t.equal(result4.count, 1) t.equal(result5.count, 1) }) @@ -736,6 +736,34 @@ t.test('search method', (t) => { t.end() }) +t.test('fix-544', async t => { + const db = await create({ + schema: { + name: 'string', + } as const, + components: { + tokenizer: { + stemming: true, + stopWords: englishStopwords, + }, + }, + }) + + await insert(db, { name: "Christopher" }) + let result + + result = await search(db, { term: 'Chris', tolerance: 0 }) + t.equal(result.count, 1) + + result = await search(db, { term: 'Chris', tolerance: 1 }) + t.equal(result.count, 1) + + result = await search(db, { term: 'Chris', tolerance: 2 }) + t.equal(result.count, 1) + + t.end() +}) + async function createSimpleDB() { let i = 0 const db = await create({ diff --git a/packages/plugin-data-persistence/test/index.test.ts b/packages/plugin-data-persistence/test/index.test.ts index 928cf0aa0..8990ef4c3 100644 --- a/packages/plugin-data-persistence/test/index.test.ts +++ b/packages/plugin-data-persistence/test/index.test.ts @@ -60,7 +60,7 @@ async function generateTestDBInstance() { } t.test('binary persistence', (t) => { - t.plan(6) + t.plan(5) t.test('should generate a persistence file on the disk with random name', async (t) => { t.plan(2) @@ -75,6 +75,7 @@ t.test('binary persistence', (t) => { // Persist database on disk in binary format const path = await persistToFile(db, 'binary') + t.teardown(rmTeardown(path)) // Load database from disk in binary format const db2 = await restoreFromFile('binary') @@ -90,10 +91,6 @@ t.test('binary persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk with a given name', async (t) => { @@ -109,6 +106,7 @@ t.test('binary persistence', (t) => { // Persist database on disk in binary format const path = await 
persistToFile(db, 'binary', 'test.dpack') + t.teardown(rmTeardown(path)) // Load database from disk in binary format const db2 = await restoreFromFile('binary', 'test.dpack') @@ -124,10 +122,6 @@ t.test('binary persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk using ORAMA_DB_NAME env', async (t) => { @@ -157,6 +151,7 @@ t.test('binary persistence', (t) => { // Persist database on disk in binary format const path = await persistToFile(db, 'binary') + t.teardown(rmTeardown(path)) t.match(path, 'example_db_dump') // Load database from disk in binary format @@ -174,9 +169,6 @@ t.test('binary persistence', (t) => { t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - // Clean up - await rm(path) - if (currentOramaDBNameValue) { // @ts-expect-error Deno is only available in Deno if (typeof Deno !== 'undefined') { @@ -186,7 +178,6 @@ t.test('binary persistence', (t) => { process.env.ORAMA_DB_NAME = currentOramaDBNameValue } } - t.end() }) t.test('should continue to work with `enum`', async (t) => { @@ -199,6 +190,8 @@ t.test('binary persistence', (t) => { }) const path = await persistToFile(db, 'binary', 'test.dpack') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('binary', 'test.dpack') const qp1 = await search(db2, { @@ -208,9 +201,6 @@ t.test('binary persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) t.test('should continue to work with `enum[]`', async (t) => { @@ -223,6 +213,8 @@ t.test('binary persistence', (t) => { }) const path = await persistToFile(db, 'binary', 'test.dpack') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('binary', 'test.dpack') const qp1 = await search(db2, { @@ -232,9 +224,6 @@ t.test('binary persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) }) @@ 
-254,6 +243,7 @@ t.test('json persistence', (t) => { // Persist database on disk in json format const path = await persistToFile(db, 'json') + t.teardown(rmTeardown(path)) // Load database from disk in json format const db2 = await restoreFromFile('json') @@ -269,10 +259,6 @@ t.test('json persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk with support for vectors', async (t) => { @@ -290,6 +276,7 @@ t.test('json persistence', (t) => { // Persist database on disk in json format const path = await persistToFile(db1, 'json', 'test.json') + t.teardown(rmTeardown(path)) // Load database from disk in json format const db2 = await restoreFromFile('json', 'test.json') @@ -306,10 +293,6 @@ t.test('json persistence', (t) => { // Queries on the loaded database should match the original database t.same(qp1.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk with a given name and json format', async (t) => { @@ -325,6 +308,7 @@ t.test('json persistence', (t) => { // Persist database on disk in json format const path = await persistToFile(db, 'json', 'test.json') + t.teardown(rmTeardown(path)) // Load database from disk in json format const db2 = await restoreFromFile('json', 'test.json') @@ -340,10 +324,6 @@ t.test('json persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should continue to work with `enum`', async (t) => { @@ -356,6 +336,8 @@ t.test('json persistence', (t) => { }) const path = await persistToFile(db, 'json', 'test.json') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('json', 'test.json') const qp1 = await search(db2, { @@ -365,9 +347,6 @@ 
t.test('json persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) t.test('should continue to work with `enum[]`', async (t) => { @@ -381,6 +360,8 @@ t.test('json persistence', (t) => { }) const path = await persistToFile(db, 'json', 'test.json') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('json', 'test.json') const qp1 = await search(db2, { @@ -390,9 +371,6 @@ t.test('json persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) }) @@ -413,6 +391,7 @@ t.test('dpack persistence', (t) => { // Persist database on disk in dpack format const path = await persistToFile(db, 'dpack') + t.teardown(rmTeardown(path)) // Load database from disk in dpack format const db2 = await restoreFromFile('dpack') @@ -428,10 +407,6 @@ t.test('dpack persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk with a given name and dpack format', async (t) => { @@ -448,6 +423,7 @@ t.test('dpack persistence', (t) => { // Persist database on disk in json format const path = await persistToFile(db, 'dpack', 'test.dpack') + t.teardown(rmTeardown(path)) // Load database from disk in json format const db2 = await restoreFromFile('dpack', 'test.dpack') @@ -463,10 +439,6 @@ t.test('dpack persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should continue to work with `enum`', async (t) => { @@ -480,6 +452,8 @@ t.test('dpack persistence', (t) => { }) const path = await persistToFile(db, 'dpack', 'test.dpack') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('dpack', 'test.dpack') const qp1 = await search(db2, { @@ -489,9 +463,6 @@ t.test('dpack persistence', (t) => { }) 
t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) t.test('should continue to work with `enum[]`', async (t) => { @@ -505,6 +476,8 @@ t.test('dpack persistence', (t) => { }) const path = await persistToFile(db, 'dpack', 'test.dpack') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('dpack', 'test.dpack') const qp1 = await search(db2, { @@ -514,9 +487,6 @@ t.test('dpack persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) }) @@ -563,11 +533,14 @@ t.test('should persist data in-memory', async (t) => { t.same(q2.hits, qp2.hits) t.same(q1.hits, qp3.hits) t.same(q2.hits, qp4.hits) - t.end() }) t.test('errors', (t) => { + t.plan(2) + t.test('should throw an error when trying to persist a database in an unsupported format', async (t) => { + t.plan(1) + const db = await generateTestDBInstance() try { // @ts-expect-error - 'unsupported' is not a supported format @@ -578,21 +551,24 @@ t.test('errors', (t) => { }) t.test('should throw an error when trying to restoreFromFile a database from an unsupported format', async (t) => { + t.plan(1) + const format = 'unsupported' const db = await generateTestDBInstance() const path = await persistToFile(db, 'binary', 'supported') + t.teardown(rmTeardown(path)) + try { // @ts-expect-error - 'unsupported' is not a supported format await restoreFromFile(format, path) } catch ({ message }) { t.match(message, UNSUPPORTED_FORMAT(format)) - await rm(path) } }) - t.end() }) t.test('should throw an error when trying to use a deprecated method', async (t) => { + t.plan(2) const db = await generateTestDBInstance() try { @@ -606,6 +582,12 @@ t.test('should throw an error when trying to use a deprecated method', async (t) } catch ({ message }) { t.match(message, METHOD_MOVED('restoreFromFile')) } - - t.end() }) + +function rmTeardown(p: string) { + return async () => { + try { + await rm(p) + } catch (e) {} + } +}