Skip to content

Commit

Permalink
Fix to get threshold 0 working as documented (#644)
Browse files Browse the repository at this point in the history
Co-authored-by: Tommaso Allevi <tomallevi@gmail.com>
  • Loading branch information
valstu and allevo authored Feb 22, 2024
1 parent 24d98c7 commit 0b69a15
Showing 1 changed file with 31 additions and 17 deletions.
48 changes: 31 additions & 17 deletions packages/orama/src/components/algorithms.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ export function prioritizeTokenScores(
throw createError('INVALID_BOOST_VALUE')
}

const tokenScoresMap = new Map<InternalDocumentID, number>()
const tokenKeywordsCountMap = new Map<InternalDocumentID, number>()
const tokenScoresMap = new Map<InternalDocumentID, [number, number]>()

const mapsLength = arrays.length
for (let i = 0; i < mapsLength; i++) {
Expand All @@ -23,22 +22,22 @@ export function prioritizeTokenScores(
for (let j = 0; j < entriesLength; j++) {
const [token, score] = arr[j]
const boostScore = score * boost
const oldScore = tokenScoresMap.get(token)
const oldScore = tokenScoresMap.get(token)?.[0]

if (oldScore !== undefined) {
tokenScoresMap.set(token, oldScore * 1.5 + boostScore)
tokenKeywordsCountMap.set(token, tokenKeywordsCountMap.get(token)! + 1)
tokenScoresMap.set(token, [(oldScore * 1.5 + boostScore), (tokenScoresMap?.get(token)?.[1] || 0) + 1])

} else {
tokenScoresMap.set(token, boostScore)
tokenKeywordsCountMap.set(token, 1)
tokenScoresMap.set(token, [boostScore, 1])

}
}
}

const tokenScores: TokenScore[] = []

for (const tokenScoreEntry of tokenScoresMap.entries()) {
tokenScores.push(tokenScoreEntry)
tokenScores.push([tokenScoreEntry[0], tokenScoreEntry[1][0]])
}

const results = tokenScores.sort((a, b) => b[1] - a[1])
Expand All @@ -51,20 +50,30 @@ export function prioritizeTokenScores(

// Prepare keywords count tracking for threshold handling
const allResults = results.length
const tokenKeywordsCount: [InternalDocumentID, number][] = []
const tokenScoreWithKeywordsCount: [InternalDocumentID, number, number][] = []

for (const tokenKeywordsCountEntry of tokenKeywordsCountMap.entries()) {
tokenKeywordsCount.push(tokenKeywordsCountEntry)
for (const tokenScoreEntry of tokenScoresMap.entries()) {
tokenScoreWithKeywordsCount.push([tokenScoreEntry[0], tokenScoreEntry[1][0], tokenScoreEntry[1][1]])
}

// Find the index of the last result with all keywords.
// Note that since score is multipled by 1.5 any time the token is encountered in results it means
// that tokenScores and tokenKeywordsCount should always have the same order.
const keywordsPerToken = tokenKeywordsCount.sort((a, b) => b[1] - a[1])
// Order the documents by the number of keywords they contain, and then by the score.
const keywordsPerToken = tokenScoreWithKeywordsCount.sort((a, b) => {
// Compare by the third element, higher numbers first
if (a[2] > b[2]) return -1;
if (a[2] < b[2]) return 1;

// If the third elements are equal, compare by the second element, higher numbers first
if (a[1] > b[1]) return -1;
if (a[1] < b[1]) return 1;

// If both the second and third elements are equal, consider the elements equal
return 0;
});

let lastTokenWithAllKeywords: number | undefined = undefined
for (let i = 0; i < allResults; i++) {
if (keywordsPerToken[i][1] === keywordsCount) {
if (keywordsPerToken[i][2] === keywordsCount) {
lastTokenWithAllKeywords = i
} else {
break
Expand All @@ -80,9 +89,14 @@ export function prioritizeTokenScores(
lastTokenWithAllKeywords = 0
}

const resultsWithIdAndScore: [number, number][] = new Array(keywordsPerToken.length);
for (let i = 0; i < keywordsPerToken.length; i++) {
resultsWithIdAndScore[i] = [keywordsPerToken[i][0], keywordsPerToken[i][1]];
}

// If threshold is 0, it means we will only return all the results that contains ALL the search terms (exact match)
if (threshold === 0) {
return results.slice(0, lastTokenWithAllKeywords + 1)
return resultsWithIdAndScore.slice(0, lastTokenWithAllKeywords + 1)
}

// If the threshold is between 0 and 1, we will return all the results that contains at least the threshold of search terms
Expand All @@ -91,7 +105,7 @@ export function prioritizeTokenScores(
const thresholdLength =
lastTokenWithAllKeywords + Math.ceil((threshold * 100 * (results.length - lastTokenWithAllKeywords)) / 100)

return results.slice(0, results.length + thresholdLength)
return resultsWithIdAndScore.slice(0, results.length + thresholdLength)
}

export function BM25(
Expand Down

0 comments on commit 0b69a15

Please sign in to comment.