Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix to get threshold 0 working as documented #644

Merged
merged 3 commits into from
Feb 22, 2024
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 31 additions & 17 deletions packages/orama/src/components/algorithms.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ export function prioritizeTokenScores(
throw createError('INVALID_BOOST_VALUE')
}

const tokenScoresMap = new Map<InternalDocumentID, number>()
const tokenKeywordsCountMap = new Map<InternalDocumentID, number>()
const tokenScoresMap = new Map<InternalDocumentID, [number, number]>()

const mapsLength = arrays.length
for (let i = 0; i < mapsLength; i++) {
Expand All @@ -23,22 +22,22 @@ export function prioritizeTokenScores(
for (let j = 0; j < entriesLength; j++) {
const [token, score] = arr[j]
const boostScore = score * boost
const oldScore = tokenScoresMap.get(token)
const oldScore = tokenScoresMap.get(token)?.[0]

if (oldScore !== undefined) {
tokenScoresMap.set(token, oldScore * 1.5 + boostScore)
tokenKeywordsCountMap.set(token, tokenKeywordsCountMap.get(token)! + 1)
tokenScoresMap.set(token, [(oldScore * 1.5 + boostScore), (tokenScoresMap?.get(token)?.[1] || 0) + 1])

} else {
tokenScoresMap.set(token, boostScore)
tokenKeywordsCountMap.set(token, 1)
tokenScoresMap.set(token, [boostScore, 1])

}
}
}

const tokenScores: TokenScore[] = []

for (const tokenScoreEntry of tokenScoresMap.entries()) {
tokenScores.push(tokenScoreEntry)
tokenScores.push([tokenScoreEntry[0], tokenScoreEntry[1][0]])
}

const results = tokenScores.sort((a, b) => b[1] - a[1])
Expand All @@ -51,20 +50,30 @@ export function prioritizeTokenScores(

// Prepare keywords count tracking for threshold handling
const allResults = results.length
const tokenKeywordsCount: [InternalDocumentID, number][] = []
const tokenScoreWithKeywordsCount: [InternalDocumentID, number, number][] = []

for (const tokenKeywordsCountEntry of tokenKeywordsCountMap.entries()) {
tokenKeywordsCount.push(tokenKeywordsCountEntry)
for (const tokenScoreEntry of tokenScoresMap.entries()) {
tokenScoreWithKeywordsCount.push([tokenScoreEntry[0], tokenScoreEntry[1][0], tokenScoreEntry[1][1]])
}

// Find the index of the last result with all keywords.
// Note that since score is multipled by 1.5 any time the token is encountered in results it means
// that tokenScores and tokenKeywordsCount should always have the same order.
const keywordsPerToken = tokenKeywordsCount.sort((a, b) => b[1] - a[1])
// Order the documents by the number of keywords they contain, and then by the score.
const keywordsPerToken = tokenScoreWithKeywordsCount.sort((a, b) => {
// Compare by the third element, higher numbers first
if (a[2] > b[2]) return -1;
if (a[2] < b[2]) return 1;

// If the third elements are equal, compare by the second element, higher numbers first
if (a[1] > b[1]) return -1;
if (a[1] < b[1]) return 1;

// If both the second and third elements are equal, consider the elements equal
return 0;
});

let lastTokenWithAllKeywords: number | undefined = undefined
for (let i = 0; i < allResults; i++) {
if (keywordsPerToken[i][1] === keywordsCount) {
if (keywordsPerToken[i][2] === keywordsCount) {
lastTokenWithAllKeywords = i
} else {
break
Expand All @@ -80,9 +89,14 @@ export function prioritizeTokenScores(
lastTokenWithAllKeywords = 0
}

const resultsWithIdAndScore: [number, number][] = new Array(keywordsPerToken.length);
for (let i = 0; i < keywordsPerToken.length; i++) {
resultsWithIdAndScore[i] = [keywordsPerToken[i][0], keywordsPerToken[i][1]];


// If threshold is 0, it means we will only return all the results that contains ALL the search terms (exact match)
if (threshold === 0) {
return results.slice(0, lastTokenWithAllKeywords + 1)
return resultsWithIdAndScore.slice(0, lastTokenWithAllKeywords + 1)
}

// If the threshold is between 0 and 1, we will return all the results that contains at least the threshold of search terms
Expand All @@ -91,7 +105,7 @@ export function prioritizeTokenScores(
const thresholdLength =
lastTokenWithAllKeywords + Math.ceil((threshold * 100 * (results.length - lastTokenWithAllKeywords)) / 100)

return results.slice(0, results.length + thresholdLength)
return resultsWithIdAndScore.slice(0, results.length + thresholdLength)
}

export function BM25(
Expand Down
Loading