Skip to content

Commit

Permalink
Prefixes have 0 distance (#580)
Browse files Browse the repository at this point in the history
  • Loading branch information
allevo authored Dec 14, 2023
1 parent 960ff9c commit 5d56319
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 93 deletions.
26 changes: 18 additions & 8 deletions packages/orama/src/components/levenshtein.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
let lenA = a.length
let lenB = b.length

// ignore common prefix
let startIdx = 0
while (startIdx < lenA && a.charCodeAt(startIdx) === b.charCodeAt(startIdx)) {
startIdx++
}

// if string A is a prefix of B, we consider the distance 0
// because we search by prefix!
// fix https://github.com/oramasearch/orama/issues/544
if (startIdx === lenA) {
return 0
}

// ignore common suffix
// note: `~-` decreases by a unit in a bitwise fashion
while (lenA > 0 && a.charCodeAt(~-lenA) === b.charCodeAt(~-lenB)) {
Expand All @@ -35,17 +48,14 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
return lenB > tolerance ? -1 : lenB
}

// ignore common prefix
let startIdx = 0
while (startIdx < lenA && a.charCodeAt(startIdx) === b.charCodeAt(startIdx)) {
startIdx++
}
lenA -= startIdx
lenB -= startIdx

// early return when the smallest string is empty
if (lenA === 0) {
return lenB > tolerance ? -1 : lenB
// If both strings are smaller than the tolerance, we accept any distance
// Probably the result distance is wrong, but we don't care:
// It is never greater than the tolerance!
if (lenA <= tolerance && lenB <= tolerance) {
return lenA > lenB ? lenA : lenB
}

const delta = lenB - lenA
Expand Down
4 changes: 2 additions & 2 deletions packages/orama/src/trees/radix.ts
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,7 @@ function _findLevenshtein(
if (node.e) {
const { w, d: docIDs } = node
if (w) {
const difference = Math.abs(term.length - w.length)
if (difference <= originalTolerance && syncBoundedLevenshtein(term, w, originalTolerance).isBounded) {
if (syncBoundedLevenshtein(term, w, originalTolerance).isBounded) {
output[w] = []
}
if (getOwnProperty(output, w) != null && docIDs.length > 0) {
Expand Down Expand Up @@ -268,6 +267,7 @@ export function find(root: Node, { term, exact, tolerance }: FindParams): FindRe
if (tolerance && !exact) {
const output: FindResult = {}
tolerance = tolerance || 0

_findLevenshtein(root, term, 0, tolerance || 0, tolerance, output)
return output
} else {
Expand Down
91 changes: 67 additions & 24 deletions packages/orama/tests/levenshtein.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ t.test('levenshtein', (t) => {
})

t.test('boundedLevenshtein', (t) => {
t.plan(4)
t.plan(3)

t.test('should be 0 when both inputs are empty', async (t) => {
t.plan(2)
Expand All @@ -39,31 +39,12 @@ t.test('boundedLevenshtein', (t) => {
})

t.test('should be the max input length when either strings are empty', async (t) => {
t.plan(2)
t.plan(3)

t.match(await boundedLevenshtein('', 'some', 4), { distance: 4, isBounded: true })
t.match(await boundedLevenshtein('body', '', 4), { distance: 4, isBounded: true })
})
t.match(await boundedLevenshtein('', 'some', 0), { distance: 0, isBounded: true })

t.test('distance should be the same as levenshtein, when tolerance is high enough', async (t) => {
t.plan(5)

const tol = 15

t.equal(levenshtein('aa', 'b'), (await boundedLevenshtein('aa', 'b', tol)).distance)
t.equal(levenshtein('b', 'aa'), (await boundedLevenshtein('bb', 'a', tol)).distance)
t.equal(
levenshtein('somebody once', 'told me'),
(await boundedLevenshtein('somebody once', 'told me', tol)).distance
)
t.equal(
levenshtein('the world is gonna', 'roll me'),
(await boundedLevenshtein('the world is gonna', 'roll me', tol)).distance
)
t.equal(
levenshtein('kaushuk chadhui', 'caushik chakrabar'),
(await boundedLevenshtein('kaushuk chadhui', 'caushik chakrabar', tol)).distance
)
t.match(await boundedLevenshtein('', 'some', 4), { distance: 0, isBounded: true })
t.match(await boundedLevenshtein('body', '', 4), { distance: 0, isBounded: true })
})

t.test('should tell whether the Levenshtein distance is upperbounded by a given tolerance', async (t) => {
Expand All @@ -73,3 +54,65 @@ t.test('boundedLevenshtein', (t) => {
t.match(await boundedLevenshtein('somebody once', 'told me', 8), { isBounded: false })
})
})

// Table-driven expectations for boundedLevenshtein across word pairs and tolerances.
// In every row below, an unbounded result is expected to report distance -1.
t.test('syncBoundedLevenshtein substrings are ok even if with tolerance pppppp', async (t) => {
  // Row layout: [term, word, tolerance, expected isBounded, expected distance]
  const expectations: ReadonlyArray<readonly [string, string, number, boolean, number]> = [
    ['Dhris', 'Chris', 0, false, -1],
    ['Dhris', 'Chris', 1, true, 1],
    ['Dhris', 'Cgris', 1, false, -1],
    ['Dhris', 'Cgris', 2, true, 2],
    ['Dhris', 'Cgris', 3, true, 2],

    ['Dhris', 'Cris', 0, false, -1],
    ['Dhris', 'Cris', 1, false, -1],
    ['Dhris', 'Cris', 2, true, 2],

    ['Dhris', 'Caig', 0, false, -1],
    ['Dhris', 'Caig', 1, false, -1],
    ['Dhris', 'Caig', 2, false, -1],
    ['Dhris', 'Caig', 3, false, -1],
    ['Dhris', 'Caig', 4, true, 4],

    // identical strings are always at distance 0
    ['Chris', 'Chris', 0, true, 0],
    ['Chris', 'Chris', 1, true, 0],
    ['Chris', 'Chris', 2, true, 0],

    ['Chris', 'Cris', 0, false, -1],
    ['Chris', 'Cris', 1, true, 1],
    ['Chris', 'Cris', 2, true, 1],

    ['Chris', 'Caig', 0, false, -1],
    ['Chris', 'Caig', 1, false, -1],
    ['Chris', 'Caig', 2, false, -1],
    ['Chris', 'Caig', 3, true, 3],

    ['Craig', 'Caig', 0, false, -1],
    ['Craig', 'Caig', 1, true, 1],
    ['Craig', 'Caig', 2, true, 1],

    ['Chxy', 'Cris', 0, false, -1],
    ['Chxy', 'Cris', 1, false, -1],
    ['Chxy', 'Cris', 2, false, -1],
    ['Chxy', 'Cris', 3, true, 3],

    ['Chxy', 'Caig', 0, false, -1],
    ['Chxy', 'Caig', 1, false, -1],
    ['Chxy', 'Caig', 2, false, -1],
    ['Chxy', 'Caig', 3, true, 3],

    ['Crxy', 'Cris', 0, false, -1],
    ['Crxy', 'Cris', 1, false, -1],
    ['Crxy', 'Cris', 2, true, 2],

    ['Crxy', 'Caig', 0, false, -1],
    ['Crxy', 'Caig', 1, false, -1],
    ['Crxy', 'Caig', 2, false, -1],
    ['Crxy', 'Caig', 3, true, 3],

    // repeated on purpose to keep the assertion sequence unchanged
    ['Crxy', 'Caig', 3, true, 3],

    // the term is a prefix of the word: expected distance is 0 at any tolerance
    ['Chris', 'Christopher', 0, true, 0],
    ['Chris', 'Christopher', 1, true, 0],
  ]

  // Run the rows sequentially, in the order listed above.
  for (const [term, word, tolerance, isBounded, distance] of expectations) {
    t.match(await boundedLevenshtein(term, word, tolerance), { isBounded, distance })
  }

  t.end()
})
34 changes: 31 additions & 3 deletions packages/orama/tests/search.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ t.test('search method', (t) => {

//https://github.com/oramasearch/orama/issues/480
//following testcase pass only if issue 480 is fixed.
t.test('should correctly match with tolerance . even if prefix doesnt match.', async (t) => {
t.test('should correctly match with tolerance. even if prefix doesnt match.', async (t) => {
t.plan(5)

const db = await create({
Expand Down Expand Up @@ -38,10 +38,10 @@ t.test('search method', (t) => {

//issue 480 says following will not match because the prefix "Cr" exists so prefix Ch is not searched.
const result4 = await search(db, { term: 'Cris', tolerance: 1 })
t.equal(result4.count, 1)

//should match "Craig" even if prefix "Ca" exists.
const result5 = await search(db, { term: 'Caig', tolerance: 1 })

t.equal(result4.count, 1)
t.equal(result5.count, 1)
})

Expand Down Expand Up @@ -736,6 +736,34 @@ t.test('search method', (t) => {
t.end()
})

// Test named after issue 544: searching for the term "Chris" must return the
// single document named "Christopher" at tolerance 0, 1 and 2.
t.test('fix-544', async t => {
  const db = await create({
    schema: { name: 'string' } as const,
    components: {
      tokenizer: { stemming: true, stopWords: englishStopwords },
    },
  })

  await insert(db, { name: 'Christopher' })

  // Same term, increasing tolerance; each search is expected to find one hit.
  for (const tolerance of [0, 1, 2]) {
    const found = await search(db, { term: 'Chris', tolerance })
    t.equal(found.count, 1)
  }

  t.end()
})

async function createSimpleDB() {
let i = 0
const db = await create({
Expand Down
Loading

0 comments on commit 5d56319

Please sign in to comment.