From 30bf43ba12e1a6e23a16738a39a77e377febbc52 Mon Sep 17 00:00:00 2001 From: Tommaso Allevi Date: Wed, 13 Dec 2023 18:37:15 +0100 Subject: [PATCH 1/5] Prefixes have 0 distance --- packages/orama/src/components/levenshtein.ts | 9 +++++++-- packages/orama/tests/levenshtein.test.ts | 20 ++++++++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/packages/orama/src/components/levenshtein.ts b/packages/orama/src/components/levenshtein.ts index cf3bf56d0..46de9d13e 100644 --- a/packages/orama/src/components/levenshtein.ts +++ b/packages/orama/src/components/levenshtein.ts @@ -44,17 +44,22 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { lenB -= startIdx // early return when the smallest string is empty + /* if (lenA === 0) { + if () + console.log('AAAAAAA ---- ') return lenB > tolerance ? -1 : lenB } + */ const delta = lenB - lenA if (tolerance > lenB) { tolerance = lenB - } else if (delta > tolerance) { + }/* else if (delta > tolerance) { + console.log('AAAAAAA ---- ') return -1 - } + }*/ let i = 0 const row: number[] = [] diff --git a/packages/orama/tests/levenshtein.test.ts b/packages/orama/tests/levenshtein.test.ts index b5db10ae7..ef9a03310 100644 --- a/packages/orama/tests/levenshtein.test.ts +++ b/packages/orama/tests/levenshtein.test.ts @@ -1,5 +1,5 @@ import t from 'tap' -import { boundedLevenshtein, levenshtein } from '../src/components/levenshtein.js' +import { boundedLevenshtein, levenshtein, syncBoundedLevenshtein } from '../src/components/levenshtein.js' t.test('levenshtein', (t) => { t.plan(3) @@ -29,7 +29,7 @@ t.test('levenshtein', (t) => { }) t.test('boundedLevenshtein', (t) => { - t.plan(4) + t.plan(5) t.test('should be 0 when both inputs are empty', async (t) => { t.plan(2) @@ -72,4 +72,20 @@ t.test('boundedLevenshtein', (t) => { t.match(await boundedLevenshtein('somebody once', 'told me', 9), { isBounded: true }) t.match(await boundedLevenshtein('somebody once', 'told me', 8), { isBounded: 
false }) }) + + t.test('foo', async (t) => { + t.plan(3) + + const a = syncBoundedLevenshtein('Chris', 'Christopher', 0) + t.match(a, { distance: 0, isBounded: true }) + console.log(a) + + const b = syncBoundedLevenshtein('Chris', 'Christopher', 1) + t.match(b, { distance: 0, isBounded: true }) + console.log(b) + + const c = syncBoundedLevenshtein('Chris', 'Chriastopher', 1) + t.match(b, { distance: 0, isBounded: true }) + console.log(c) + }) }) From 9f4c2ff50ff37e61d25cbc1e3108174d33275fa7 Mon Sep 17 00:00:00 2001 From: Tommaso Allevi Date: Wed, 13 Dec 2023 18:56:16 +0100 Subject: [PATCH 2/5] Fix test in plugin-data-persistence --- .../test/index.test.ts | 94 ++++++++----------- 1 file changed, 38 insertions(+), 56 deletions(-) diff --git a/packages/plugin-data-persistence/test/index.test.ts b/packages/plugin-data-persistence/test/index.test.ts index 928cf0aa0..8990ef4c3 100644 --- a/packages/plugin-data-persistence/test/index.test.ts +++ b/packages/plugin-data-persistence/test/index.test.ts @@ -60,7 +60,7 @@ async function generateTestDBInstance() { } t.test('binary persistence', (t) => { - t.plan(6) + t.plan(5) t.test('should generate a persistence file on the disk with random name', async (t) => { t.plan(2) @@ -75,6 +75,7 @@ t.test('binary persistence', (t) => { // Persist database on disk in binary format const path = await persistToFile(db, 'binary') + t.teardown(rmTeardown(path)) // Load database from disk in binary format const db2 = await restoreFromFile('binary') @@ -90,10 +91,6 @@ t.test('binary persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk with a given name', async (t) => { @@ -109,6 +106,7 @@ t.test('binary persistence', (t) => { // Persist database on disk in binary format const path = await persistToFile(db, 'binary', 'test.dpack') + 
t.teardown(rmTeardown(path)) // Load database from disk in binary format const db2 = await restoreFromFile('binary', 'test.dpack') @@ -124,10 +122,6 @@ t.test('binary persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk using ORAMA_DB_NAME env', async (t) => { @@ -157,6 +151,7 @@ t.test('binary persistence', (t) => { // Persist database on disk in binary format const path = await persistToFile(db, 'binary') + t.teardown(rmTeardown(path)) t.match(path, 'example_db_dump') // Load database from disk in binary format @@ -174,9 +169,6 @@ t.test('binary persistence', (t) => { t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - // Clean up - await rm(path) - if (currentOramaDBNameValue) { // @ts-expect-error Deno is only available in Deno if (typeof Deno !== 'undefined') { @@ -186,7 +178,6 @@ t.test('binary persistence', (t) => { process.env.ORAMA_DB_NAME = currentOramaDBNameValue } } - t.end() }) t.test('should continue to work with `enum`', async (t) => { @@ -199,6 +190,8 @@ t.test('binary persistence', (t) => { }) const path = await persistToFile(db, 'binary', 'test.dpack') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('binary', 'test.dpack') const qp1 = await search(db2, { @@ -208,9 +201,6 @@ t.test('binary persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) t.test('should continue to work with `enum[]`', async (t) => { @@ -223,6 +213,8 @@ t.test('binary persistence', (t) => { }) const path = await persistToFile(db, 'binary', 'test.dpack') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('binary', 'test.dpack') const qp1 = await search(db2, { @@ -232,9 +224,6 @@ t.test('binary persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) }) @@ -254,6 +243,7 @@ t.test('json persistence', 
(t) => { // Persist database on disk in json format const path = await persistToFile(db, 'json') + t.teardown(rmTeardown(path)) // Load database from disk in json format const db2 = await restoreFromFile('json') @@ -269,10 +259,6 @@ t.test('json persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk with support for vectors', async (t) => { @@ -290,6 +276,7 @@ t.test('json persistence', (t) => { // Persist database on disk in json format const path = await persistToFile(db1, 'json', 'test.json') + t.teardown(rmTeardown(path)) // Load database from disk in json format const db2 = await restoreFromFile('json', 'test.json') @@ -306,10 +293,6 @@ t.test('json persistence', (t) => { // Queries on the loaded database should match the original database t.same(qp1.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk with a given name and json format', async (t) => { @@ -325,6 +308,7 @@ t.test('json persistence', (t) => { // Persist database on disk in json format const path = await persistToFile(db, 'json', 'test.json') + t.teardown(rmTeardown(path)) // Load database from disk in json format const db2 = await restoreFromFile('json', 'test.json') @@ -340,10 +324,6 @@ t.test('json persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should continue to work with `enum`', async (t) => { @@ -356,6 +336,8 @@ t.test('json persistence', (t) => { }) const path = await persistToFile(db, 'json', 'test.json') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('json', 'test.json') const qp1 = await search(db2, { @@ -365,9 +347,6 @@ t.test('json persistence', (t) => { }) 
t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) t.test('should continue to work with `enum[]`', async (t) => { @@ -381,6 +360,8 @@ t.test('json persistence', (t) => { }) const path = await persistToFile(db, 'json', 'test.json') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('json', 'test.json') const qp1 = await search(db2, { @@ -390,9 +371,6 @@ t.test('json persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) }) @@ -413,6 +391,7 @@ t.test('dpack persistence', (t) => { // Persist database on disk in dpack format const path = await persistToFile(db, 'dpack') + t.teardown(rmTeardown(path)) // Load database from disk in dpack format const db2 = await restoreFromFile('dpack') @@ -428,10 +407,6 @@ t.test('dpack persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should generate a persistence file on the disk with a given name and dpack format', async (t) => { @@ -448,6 +423,7 @@ t.test('dpack persistence', (t) => { // Persist database on disk in json format const path = await persistToFile(db, 'dpack', 'test.dpack') + t.teardown(rmTeardown(path)) // Load database from disk in json format const db2 = await restoreFromFile('dpack', 'test.dpack') @@ -463,10 +439,6 @@ t.test('dpack persistence', (t) => { // Queries on the loaded database should match the original database t.same(q1.hits, qp1.hits) t.same(q2.hits, qp2.hits) - - // Clean up - await rm(path) - t.end() }) t.test('should continue to work with `enum`', async (t) => { @@ -480,6 +452,8 @@ t.test('dpack persistence', (t) => { }) const path = await persistToFile(db, 'dpack', 'test.dpack') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('dpack', 'test.dpack') const qp1 = await search(db2, { @@ -489,9 +463,6 @@ t.test('dpack persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - 
t.end() }) t.test('should continue to work with `enum[]`', async (t) => { @@ -505,6 +476,8 @@ t.test('dpack persistence', (t) => { }) const path = await persistToFile(db, 'dpack', 'test.dpack') + t.teardown(rmTeardown(path)) + const db2 = await restoreFromFile('dpack', 'test.dpack') const qp1 = await search(db2, { @@ -514,9 +487,6 @@ t.test('dpack persistence', (t) => { }) t.same(q1.hits, qp1.hits) - - await rm(path) - t.end() }) }) @@ -563,11 +533,14 @@ t.test('should persist data in-memory', async (t) => { t.same(q2.hits, qp2.hits) t.same(q1.hits, qp3.hits) t.same(q2.hits, qp4.hits) - t.end() }) t.test('errors', (t) => { + t.plan(2) + t.test('should throw an error when trying to persist a database in an unsupported format', async (t) => { + t.plan(1) + const db = await generateTestDBInstance() try { // @ts-expect-error - 'unsupported' is not a supported format @@ -578,21 +551,24 @@ t.test('errors', (t) => { }) t.test('should throw an error when trying to restoreFromFile a database from an unsupported format', async (t) => { + t.plan(1) + const format = 'unsupported' const db = await generateTestDBInstance() const path = await persistToFile(db, 'binary', 'supported') + t.teardown(rmTeardown(path)) + try { // @ts-expect-error - 'unsupported' is not a supported format await restoreFromFile(format, path) } catch ({ message }) { t.match(message, UNSUPPORTED_FORMAT(format)) - await rm(path) } }) - t.end() }) t.test('should throw an error when trying to use a deprecated method', async (t) => { + t.plan(2) const db = await generateTestDBInstance() try { @@ -606,6 +582,12 @@ t.test('should throw an error when trying to use a deprecated method', async (t) } catch ({ message }) { t.match(message, METHOD_MOVED('restoreFromFile')) } - - t.end() }) + +function rmTeardown(p: string) { + return async () => { + try { + await rm(p) + } catch (e) {} + } +} From c8da540a778872feae4e93ed4cbdc52cd8eb68fa Mon Sep 17 00:00:00 2001 From: Tommaso Allevi Date: Wed, 13 Dec 2023 19:02:19 
+0100 Subject: [PATCH 3/5] Clean up code --- packages/orama/src/components/levenshtein.ts | 14 +------------- packages/orama/tests/levenshtein.test.ts | 2 +- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/packages/orama/src/components/levenshtein.ts b/packages/orama/src/components/levenshtein.ts index 46de9d13e..fe435809f 100644 --- a/packages/orama/src/components/levenshtein.ts +++ b/packages/orama/src/components/levenshtein.ts @@ -43,23 +43,11 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { lenA -= startIdx lenB -= startIdx - // early return when the smallest string is empty - /* - if (lenA === 0) { - if () - console.log('AAAAAAA ---- ') - return lenB > tolerance ? -1 : lenB - } - */ - const delta = lenB - lenA if (tolerance > lenB) { tolerance = lenB - }/* else if (delta > tolerance) { - console.log('AAAAAAA ---- ') - return -1 - }*/ + } let i = 0 const row: number[] = [] diff --git a/packages/orama/tests/levenshtein.test.ts b/packages/orama/tests/levenshtein.test.ts index ef9a03310..855592f01 100644 --- a/packages/orama/tests/levenshtein.test.ts +++ b/packages/orama/tests/levenshtein.test.ts @@ -73,7 +73,7 @@ t.test('boundedLevenshtein', (t) => { t.match(await boundedLevenshtein('somebody once', 'told me', 8), { isBounded: false }) }) - t.test('foo', async (t) => { + t.test('substrings are ok even if with tolerance', async (t) => { t.plan(3) const a = syncBoundedLevenshtein('Chris', 'Christopher', 0) From 5fb153ed5c1565d56d3a8b4809f9e287df9cc9ee Mon Sep 17 00:00:00 2001 From: Tommaso Allevi Date: Thu, 14 Dec 2023 11:37:23 +0100 Subject: [PATCH 4/5] Add issue test case and fixes --- packages/orama/src/components/levenshtein.ts | 25 ++++-- packages/orama/src/trees/radix.ts | 4 +- packages/orama/tests/levenshtein.test.ts | 94 +++++++++++++++----- packages/orama/tests/search.test.ts | 36 +++++++- 4 files changed, 127 insertions(+), 32 deletions(-) diff --git a/packages/orama/src/components/levenshtein.ts 
b/packages/orama/src/components/levenshtein.ts index fe435809f..297ec2cae 100644 --- a/packages/orama/src/components/levenshtein.ts +++ b/packages/orama/src/components/levenshtein.ts @@ -23,6 +23,19 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { let lenA = a.length let lenB = b.length + // ignore common prefix + let startIdx = 0 + while (startIdx < lenA && a.charCodeAt(startIdx) === b.charCodeAt(startIdx)) { + startIdx++ + } + + // string A is subfix of B + if (startIdx === lenA) { + return 0 + } + + // console.log({ startIdx, lenA, lenB, tolerance }) + // ignore common suffix // note: `~-` decreases by a unit in a bitwise fashion while (lenA > 0 && a.charCodeAt(~-lenA) === b.charCodeAt(~-lenB)) { @@ -35,18 +48,20 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { return lenB > tolerance ? -1 : lenB } - // ignore common prefix - let startIdx = 0 - while (startIdx < lenA && a.charCodeAt(startIdx) === b.charCodeAt(startIdx)) { - startIdx++ - } lenA -= startIdx lenB -= startIdx + // early return when the smallest string is empty + if (lenA <= tolerance && lenB <= tolerance) { + return Math.max(lenA, lenB) + } + const delta = lenB - lenA if (tolerance > lenB) { tolerance = lenB + } else if (delta > tolerance) { + return -1 } let i = 0 diff --git a/packages/orama/src/trees/radix.ts b/packages/orama/src/trees/radix.ts index 8b08ddb1f..139a48707 100644 --- a/packages/orama/src/trees/radix.ts +++ b/packages/orama/src/trees/radix.ts @@ -217,8 +217,7 @@ function _findLevenshtein( if (node.e) { const { w, d: docIDs } = node if (w) { - const difference = Math.abs(term.length - w.length) - if (difference <= originalTolerance && syncBoundedLevenshtein(term, w, originalTolerance).isBounded) { + if (syncBoundedLevenshtein(term, w, originalTolerance).isBounded) { output[w] = [] } if (getOwnProperty(output, w) != null && docIDs.length > 0) { @@ -268,6 +267,7 @@ export function find(root: Node, { term, exact, 
tolerance }: FindParams): FindRe if (tolerance && !exact) { const output: FindResult = {} tolerance = tolerance || 0 + _findLevenshtein(root, term, 0, tolerance || 0, tolerance, output) return output } else { diff --git a/packages/orama/tests/levenshtein.test.ts b/packages/orama/tests/levenshtein.test.ts index 855592f01..3d0e7c9dd 100644 --- a/packages/orama/tests/levenshtein.test.ts +++ b/packages/orama/tests/levenshtein.test.ts @@ -1,5 +1,5 @@ import t from 'tap' -import { boundedLevenshtein, levenshtein, syncBoundedLevenshtein } from '../src/components/levenshtein.js' +import { boundedLevenshtein, levenshtein } from '../src/components/levenshtein.js' t.test('levenshtein', (t) => { t.plan(3) @@ -28,8 +28,8 @@ t.test('levenshtein', (t) => { }) }) -t.test('boundedLevenshtein', (t) => { - t.plan(5) +t.only('boundedLevenshtein', (t) => { + t.plan(3) t.test('should be 0 when both inputs are empty', async (t) => { t.plan(2) @@ -39,13 +39,16 @@ t.test('boundedLevenshtein', (t) => { }) t.test('should be the max input length when either strings are empty', async (t) => { - t.plan(2) + t.plan(3) - t.match(await boundedLevenshtein('', 'some', 4), { distance: 4, isBounded: true }) - t.match(await boundedLevenshtein('body', '', 4), { distance: 4, isBounded: true }) + t.match(await boundedLevenshtein('', 'some', 0), { distance: 0, isBounded: true }) + + t.match(await boundedLevenshtein('', 'some', 4), { distance: 0, isBounded: true }) + t.match(await boundedLevenshtein('body', '', 4), { distance: 0, isBounded: true }) }) - t.test('distance should be the same as levenshtein, when tolerance is high enough', async (t) => { + /* + t.only('distance should be the same as levenshtein, when tolerance is high enough', async (t) => { t.plan(5) const tol = 15 @@ -65,6 +68,7 @@ t.test('boundedLevenshtein', (t) => { (await boundedLevenshtein('kaushuk chadhui', 'caushik chakrabar', tol)).distance ) }) + */ t.test('should tell whether the Levenshtein distance is upperbounded by a given 
tolerance', async (t) => { t.plan(2) @@ -72,20 +76,66 @@ t.test('boundedLevenshtein', (t) => { t.match(await boundedLevenshtein('somebody once', 'told me', 9), { isBounded: true }) t.match(await boundedLevenshtein('somebody once', 'told me', 8), { isBounded: false }) }) +}) - t.test('substrings are ok even if with tolerance', async (t) => { - t.plan(3) - - const a = syncBoundedLevenshtein('Chris', 'Christopher', 0) - t.match(a, { distance: 0, isBounded: true }) - console.log(a) - - const b = syncBoundedLevenshtein('Chris', 'Christopher', 1) - t.match(b, { distance: 0, isBounded: true }) - console.log(b) - - const c = syncBoundedLevenshtein('Chris', 'Chriastopher', 1) - t.match(b, { distance: 0, isBounded: true }) - console.log(c) - }) +t.test('syncBoundedLevenshtein substrings are ok even if with tolerance pppppp', async (t) => { + t.match(await boundedLevenshtein('Dhris', 'Chris', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Chris', 1), { isBounded: true, distance: 1 }) + t.match(await boundedLevenshtein('Dhris', 'Cgris', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Cgris', 2), { isBounded: true, distance: 2 }) + t.match(await boundedLevenshtein('Dhris', 'Cgris', 3), { isBounded: true, distance: 2 }) + + t.match(await boundedLevenshtein('Dhris', 'Cris', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Cris', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Cris', 2), { isBounded: true, distance: 2 }) + + t.match(await boundedLevenshtein('Dhris', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Caig', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Caig', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Caig', 3), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Dhris', 'Caig', 
4), { isBounded: true, distance: 4 }) + + t.match(await boundedLevenshtein('Chris', 'Chris', 0), { isBounded: true, distance: 0 }) + t.match(await boundedLevenshtein('Chris', 'Chris', 1), { isBounded: true, distance: 0 }) + t.match(await boundedLevenshtein('Chris', 'Chris', 2), { isBounded: true, distance: 0 }) + + t.match(await boundedLevenshtein('Chris', 'Cris', 0), { isBounded: false, distance: -1 }) + + t.match(await boundedLevenshtein('Chris', 'Cris', 1), { isBounded: true, distance: 1 }) + t.match(await boundedLevenshtein('Chris', 'Cris', 2), { isBounded: true, distance: 1 }) + + t.match(await boundedLevenshtein('Chris', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chris', 'Caig', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chris', 'Caig', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chris', 'Caig', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Craig', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Craig', 'Caig', 1), { isBounded: true, distance: 1 }) + t.match(await boundedLevenshtein('Craig', 'Caig', 2), { isBounded: true, distance: 1 }) + + t.match(await boundedLevenshtein('Chxy', 'Cris', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Cris', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Cris', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Cris', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Chxy', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Caig', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Caig', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Chxy', 'Caig', 3), { isBounded: true, distance: 3 }) + + t.match(await 
boundedLevenshtein('Crxy', 'Cris', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Cris', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Cris', 2), { isBounded: true, distance: 2 }) + + t.match(await boundedLevenshtein('Crxy', 'Caig', 0), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Caig', 1), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Caig', 2), { isBounded: false, distance: -1 }) + t.match(await boundedLevenshtein('Crxy', 'Caig', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Crxy', 'Caig', 3), { isBounded: true, distance: 3 }) + + t.match(await boundedLevenshtein('Chris', 'Christopher', 0), { isBounded: true, distance: 0 }) + t.match(await boundedLevenshtein('Chris', 'Christopher', 1), { isBounded: true, distance: 0 }) + + t.end() }) diff --git a/packages/orama/tests/search.test.ts b/packages/orama/tests/search.test.ts index 64d8124f4..4148d21b2 100644 --- a/packages/orama/tests/search.test.ts +++ b/packages/orama/tests/search.test.ts @@ -8,7 +8,7 @@ t.test('search method', (t) => { //https://github.com/oramasearch/orama/issues/480 //following testcase pass only if issue 480 is fixed. - t.test('should correctly match with tolerance . even if prefix doesnt match.', async (t) => { + t.test('should correctly match with tolerance. even if prefix doesnt match.', async (t) => { t.plan(5) const db = await create({ @@ -37,11 +37,13 @@ t.test('search method', (t) => { await insert(db, { name: 'Crxy' }) //create r node in radix tree. //issue 480 says following will not match because the prefix "Cr" exists so prefix Ch is not searched. + console.log('AAAAA') const result4 = await search(db, { term: 'Cris', tolerance: 1 }) + t.equal(result4.count, 1) + console.log(result4.hits) + //should match "Craig" even if prefix "Ca" exists. 
const result5 = await search(db, { term: 'Caig', tolerance: 1 }) - - t.equal(result4.count, 1) t.equal(result5.count, 1) }) @@ -736,6 +738,34 @@ t.test('search method', (t) => { t.end() }) +t.only('fix-544', async t => { + const db = await create({ + schema: { + name: 'string', + } as const, + components: { + tokenizer: { + stemming: true, + stopWords: englishStopwords, + }, + }, + }) + + await insert(db, { name: "Christopher" }) + let result + + result = await search(db, { term: 'Chris', tolerance: 0 }) + t.equal(result.count, 1) + + result = await search(db, { term: 'Chris', tolerance: 1 }) + t.equal(result.count, 1) + + result = await search(db, { term: 'Chris', tolerance: 2 }) + t.equal(result.count, 1) + + t.end() +}) + async function createSimpleDB() { let i = 0 const db = await create({ From 25d614a8bd4b88287f091985ebf57e46600108a5 Mon Sep 17 00:00:00 2001 From: Tommaso Allevi Date: Thu, 14 Dec 2023 11:49:17 +0100 Subject: [PATCH 5/5] Address suggestions --- packages/orama/src/components/levenshtein.ts | 12 ++++++---- packages/orama/tests/levenshtein.test.ts | 25 +------------------- packages/orama/tests/search.test.ts | 4 +--- 3 files changed, 9 insertions(+), 32 deletions(-) diff --git a/packages/orama/src/components/levenshtein.ts b/packages/orama/src/components/levenshtein.ts index 297ec2cae..0ca7c1e12 100644 --- a/packages/orama/src/components/levenshtein.ts +++ b/packages/orama/src/components/levenshtein.ts @@ -29,13 +29,13 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { startIdx++ } - // string A is subfix of B + // if string A is a prefix of B, we consider the distance 0 + // because we search for prefix!
+ // fix https://github.com/oramasearch/orama/issues/544 if (startIdx === lenA) { return 0 } - - // console.log({ startIdx, lenA, lenB, tolerance }) - // ignore common suffix // note: `~-` decreases by a unit in a bitwise fashion while (lenA > 0 && a.charCodeAt(~-lenA) === b.charCodeAt(~-lenB)) { @@ -51,9 +51,11 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { lenA -= startIdx lenB -= startIdx - // early return when the smallest string is empty + // If both strings are smaller than the tolerance, we accept any distance + // Probably the result distance is wrong, but we don't care: + // It is never greater than the tolerance! if (lenA <= tolerance && lenB <= tolerance) { - return Math.max(lenA, lenB) + return lenA > lenB ? lenA : lenB } const delta = lenB - lenA diff --git a/packages/orama/tests/levenshtein.test.ts b/packages/orama/tests/levenshtein.test.ts index 3d0e7c9dd..a9e2240cc 100644 --- a/packages/orama/tests/levenshtein.test.ts +++ b/packages/orama/tests/levenshtein.test.ts @@ -28,7 +28,7 @@ t.test('levenshtein', (t) => { }) }) -t.only('boundedLevenshtein', (t) => { +t.test('boundedLevenshtein', (t) => { t.plan(3) t.test('should be 0 when both inputs are empty', async (t) => { @@ -47,29 +47,6 @@ t.only('boundedLevenshtein', (t) => { t.match(await boundedLevenshtein('body', '', 4), { distance: 0, isBounded: true }) }) - /* - t.only('distance should be the same as levenshtein, when tolerance is high enough', async (t) => { - t.plan(5) - - const tol = 15 - - t.equal(levenshtein('aa', 'b'), (await boundedLevenshtein('aa', 'b', tol)).distance) - t.equal(levenshtein('b', 'aa'), (await boundedLevenshtein('bb', 'a', tol)).distance) - t.equal( - levenshtein('somebody once', 'told me'), - (await boundedLevenshtein('somebody once', 'told me', tol)).distance - ) - t.equal( - levenshtein('the world is gonna', 'roll me'), - (await boundedLevenshtein('the world is gonna', 'roll me', tol)).distance - ) - t.equal( - levenshtein('kaushuk
chadhui', 'caushik chakrabar'), - (await boundedLevenshtein('kaushuk chadhui', 'caushik chakrabar', tol)).distance - ) - }) - */ - t.test('should tell whether the Levenshtein distance is upperbounded by a given tolerance', async (t) => { t.plan(2) diff --git a/packages/orama/tests/search.test.ts b/packages/orama/tests/search.test.ts index 4148d21b2..a1191c47f 100644 --- a/packages/orama/tests/search.test.ts +++ b/packages/orama/tests/search.test.ts @@ -37,10 +37,8 @@ t.test('search method', (t) => { await insert(db, { name: 'Crxy' }) //create r node in radix tree. //issue 480 says following will not match because the prefix "Cr" exists so prefix Ch is not searched. - console.log('AAAAA') const result4 = await search(db, { term: 'Cris', tolerance: 1 }) t.equal(result4.count, 1) - console.log(result4.hits) //should match "Craig" even if prefix "Ca" exists. const result5 = await search(db, { term: 'Caig', tolerance: 1 }) @@ -738,7 +736,7 @@ t.test('search method', (t) => { t.end() }) -t.only('fix-544', async t => { +t.test('fix-544', async t => { const db = await create({ schema: { name: 'string',