Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prefixes have 0 distance #580

Merged
merged 5 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions packages/orama/src/components/levenshtein.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
let lenA = a.length
let lenB = b.length

// ignore common prefix
let startIdx = 0
while (startIdx < lenA && a.charCodeAt(startIdx) === b.charCodeAt(startIdx)) {
startIdx++
}

// if string A is a prefix of B, we consider the distance 0
// because we are searching by prefix!
// fix https://github.com/oramasearch/orama/issues/544
if (startIdx === lenA) {
return 0
}

// ignore common suffix
// note: `~-` decreases by a unit in a bitwise fashion
while (lenA > 0 && a.charCodeAt(~-lenA) === b.charCodeAt(~-lenB)) {
Expand All @@ -35,17 +48,14 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
return lenB > tolerance ? -1 : lenB
}

// ignore common prefix
let startIdx = 0
while (startIdx < lenA && a.charCodeAt(startIdx) === b.charCodeAt(startIdx)) {
startIdx++
}
lenA -= startIdx
lenB -= startIdx

// early return when the smallest string is empty
if (lenA === 0) {
return lenB > tolerance ? -1 : lenB
// If both remaining lengths are within the tolerance, we accept any distance.
// The returned distance is probably not the exact one, but we don't care:
// it is never greater than the tolerance!
if (lenA <= tolerance && lenB <= tolerance) {
return lenA > lenB ? lenA : lenB
}

const delta = lenB - lenA
Expand Down
4 changes: 2 additions & 2 deletions packages/orama/src/trees/radix.ts
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,7 @@ function _findLevenshtein(
if (node.e) {
const { w, d: docIDs } = node
if (w) {
const difference = Math.abs(term.length - w.length)
if (difference <= originalTolerance && syncBoundedLevenshtein(term, w, originalTolerance).isBounded) {
if (syncBoundedLevenshtein(term, w, originalTolerance).isBounded) {
output[w] = []
}
if (getOwnProperty(output, w) != null && docIDs.length > 0) {
Expand Down Expand Up @@ -268,6 +267,7 @@ export function find(root: Node, { term, exact, tolerance }: FindParams): FindRe
if (tolerance && !exact) {
const output: FindResult = {}
tolerance = tolerance || 0

_findLevenshtein(root, term, 0, tolerance || 0, tolerance, output)
return output
} else {
Expand Down
91 changes: 67 additions & 24 deletions packages/orama/tests/levenshtein.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ t.test('levenshtein', (t) => {
})

t.test('boundedLevenshtein', (t) => {
t.plan(4)
t.plan(3)

t.test('should be 0 when both inputs are empty', async (t) => {
t.plan(2)
Expand All @@ -39,31 +39,12 @@ t.test('boundedLevenshtein', (t) => {
})

t.test('should be the max input length when either strings are empty', async (t) => {
t.plan(2)
t.plan(3)

t.match(await boundedLevenshtein('', 'some', 4), { distance: 4, isBounded: true })
t.match(await boundedLevenshtein('body', '', 4), { distance: 4, isBounded: true })
})
t.match(await boundedLevenshtein('', 'some', 0), { distance: 0, isBounded: true })

t.test('distance should be the same as levenshtein, when tolerance is high enough', async (t) => {
t.plan(5)

const tol = 15

t.equal(levenshtein('aa', 'b'), (await boundedLevenshtein('aa', 'b', tol)).distance)
t.equal(levenshtein('b', 'aa'), (await boundedLevenshtein('bb', 'a', tol)).distance)
t.equal(
levenshtein('somebody once', 'told me'),
(await boundedLevenshtein('somebody once', 'told me', tol)).distance
)
t.equal(
levenshtein('the world is gonna', 'roll me'),
(await boundedLevenshtein('the world is gonna', 'roll me', tol)).distance
)
t.equal(
levenshtein('kaushuk chadhui', 'caushik chakrabar'),
(await boundedLevenshtein('kaushuk chadhui', 'caushik chakrabar', tol)).distance
)
t.match(await boundedLevenshtein('', 'some', 4), { distance: 0, isBounded: true })
t.match(await boundedLevenshtein('body', '', 4), { distance: 0, isBounded: true })
})

t.test('should tell whether the Levenshtein distance is upperbounded by a given tolerance', async (t) => {
Expand All @@ -73,3 +54,65 @@ t.test('boundedLevenshtein', (t) => {
t.match(await boundedLevenshtein('somebody once', 'told me', 8), { isBounded: false })
})
})

// Table-driven check of boundedLevenshtein: every row is asserted in order with t.match.
t.test('syncBoundedLevenshtein substrings are ok even if with tolerance pppppp', async (t) => {
  // Expected distance used by rows whose result must report isBounded: false.
  const NOT_BOUNDED = -1

  // Row layout: [left term, right term, tolerance, expected distance]
  const rows: Array<[string, string, number, number]> = [
    ['Dhris', 'Chris', 0, NOT_BOUNDED],
    ['Dhris', 'Chris', 1, 1],
    ['Dhris', 'Cgris', 1, NOT_BOUNDED],
    ['Dhris', 'Cgris', 2, 2],
    ['Dhris', 'Cgris', 3, 2],

    ['Dhris', 'Cris', 0, NOT_BOUNDED],
    ['Dhris', 'Cris', 1, NOT_BOUNDED],
    ['Dhris', 'Cris', 2, 2],

    ['Dhris', 'Caig', 0, NOT_BOUNDED],
    ['Dhris', 'Caig', 1, NOT_BOUNDED],
    ['Dhris', 'Caig', 2, NOT_BOUNDED],
    ['Dhris', 'Caig', 3, NOT_BOUNDED],
    ['Dhris', 'Caig', 4, 4],

    ['Chris', 'Chris', 0, 0],
    ['Chris', 'Chris', 1, 0],
    ['Chris', 'Chris', 2, 0],

    ['Chris', 'Cris', 0, NOT_BOUNDED],

    ['Chris', 'Cris', 1, 1],
    ['Chris', 'Cris', 2, 1],

    ['Chris', 'Caig', 0, NOT_BOUNDED],
    ['Chris', 'Caig', 1, NOT_BOUNDED],
    ['Chris', 'Caig', 2, NOT_BOUNDED],
    ['Chris', 'Caig', 3, 3],

    ['Craig', 'Caig', 0, NOT_BOUNDED],
    ['Craig', 'Caig', 1, 1],
    ['Craig', 'Caig', 2, 1],

    ['Chxy', 'Cris', 0, NOT_BOUNDED],
    ['Chxy', 'Cris', 1, NOT_BOUNDED],
    ['Chxy', 'Cris', 2, NOT_BOUNDED],
    ['Chxy', 'Cris', 3, 3],

    ['Chxy', 'Caig', 0, NOT_BOUNDED],
    ['Chxy', 'Caig', 1, NOT_BOUNDED],
    ['Chxy', 'Caig', 2, NOT_BOUNDED],
    ['Chxy', 'Caig', 3, 3],

    ['Crxy', 'Cris', 0, NOT_BOUNDED],
    ['Crxy', 'Cris', 1, NOT_BOUNDED],
    ['Crxy', 'Cris', 2, 2],

    ['Crxy', 'Caig', 0, NOT_BOUNDED],
    ['Crxy', 'Caig', 1, NOT_BOUNDED],
    ['Crxy', 'Caig', 2, NOT_BOUNDED],
    ['Crxy', 'Caig', 3, 3],

    // Same row as the one above, kept so the number of assertions is unchanged.
    ['Crxy', 'Caig', 3, 3],

    // The left term is a prefix of the right term: expected distance is 0.
    ['Chris', 'Christopher', 0, 0],
    ['Chris', 'Christopher', 1, 0],
  ]

  for (const [left, right, tolerance, distance] of rows) {
    const outcome = await boundedLevenshtein(left, right, tolerance)
    t.match(outcome, { isBounded: distance !== NOT_BOUNDED, distance })
  }

  t.end()
})
34 changes: 31 additions & 3 deletions packages/orama/tests/search.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ t.test('search method', (t) => {

//https://github.com/oramasearch/orama/issues/480
//the following test case passes only if issue 480 is fixed.
t.test('should correctly match with tolerance . even if prefix doesnt match.', async (t) => {
t.test('should correctly match with tolerance. even if prefix doesnt match.', async (t) => {
t.plan(5)

const db = await create({
Expand Down Expand Up @@ -38,10 +38,10 @@ t.test('search method', (t) => {

//issue 480 says following will not match because the prefix "Cr" exists so prefix Ch is not searched.
const result4 = await search(db, { term: 'Cris', tolerance: 1 })
t.equal(result4.count, 1)

//should match "Craig" even if prefix "Ca" exists.
const result5 = await search(db, { term: 'Caig', tolerance: 1 })

t.equal(result4.count, 1)
t.equal(result5.count, 1)
})

Expand Down Expand Up @@ -736,6 +736,34 @@ t.test('search method', (t) => {
t.end()
})

// Indexes the single name "Christopher", then searches for the term 'Chris'
// with tolerance 0, 1 and 2, expecting exactly one hit each time.
t.test('fix-544', async t => {
  const orama = await create({
    schema: {
      name: 'string',
    } as const,
    components: {
      tokenizer: {
        stemming: true,
        stopWords: englishStopwords,
      },
    },
  })

  await insert(orama, { name: "Christopher" })

  // Searches run one after another, in increasing tolerance order.
  for (const tolerance of [0, 1, 2]) {
    const hits = await search(orama, { term: 'Chris', tolerance })
    t.equal(hits.count, 1)
  }

  t.end()
})

async function createSimpleDB() {
let i = 0
const db = await create({
Expand Down
Loading