From ba725ed0ba9920f551bf818867dfd98f4c8dedef Mon Sep 17 00:00:00 2001 From: Valtteri Karesto Date: Thu, 22 Feb 2024 12:46:09 +0200 Subject: [PATCH 1/3] Threshold 0 is now respected since order of scores list and token amount list is guaranteed --- packages/orama/src/components/algorithms.ts | 48 +++++++++++++-------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/packages/orama/src/components/algorithms.ts b/packages/orama/src/components/algorithms.ts index a7a7d589f..b81d40442 100644 --- a/packages/orama/src/components/algorithms.ts +++ b/packages/orama/src/components/algorithms.ts @@ -12,8 +12,7 @@ export function prioritizeTokenScores( throw createError('INVALID_BOOST_VALUE') } - const tokenScoresMap = new Map() - const tokenKeywordsCountMap = new Map() + const tokenScoresMap = new Map() const mapsLength = arrays.length for (let i = 0; i < mapsLength; i++) { @@ -23,14 +22,14 @@ export function prioritizeTokenScores( for (let j = 0; j < entriesLength; j++) { const [token, score] = arr[j] const boostScore = score * boost - const oldScore = tokenScoresMap.get(token) + const oldScore = tokenScoresMap.get(token)?.[0] if (oldScore !== undefined) { - tokenScoresMap.set(token, oldScore * 1.5 + boostScore) - tokenKeywordsCountMap.set(token, tokenKeywordsCountMap.get(token)! + 1) + tokenScoresMap.set(token, [(oldScore * 1.5 + boostScore), (tokenScoresMap?.get(token)?.[1] || 0) + 1]) + } else { - tokenScoresMap.set(token, boostScore) - tokenKeywordsCountMap.set(token, 1) + tokenScoresMap.set(token, [boostScore, 1]) + } } } @@ -38,7 +37,7 @@ export function prioritizeTokenScores( const tokenScores: TokenScore[] = [] for (const tokenScoreEntry of tokenScoresMap.entries()) { - tokenScores.push(tokenScoreEntry) + tokenScores.push([tokenScoreEntry[0], tokenScoreEntry[1][0]]) } const results = tokenScores.sort((a, b) => b[1] - a[1]) @@ -51,20 +50,30 @@ export function prioritizeTokenScores( // Prepare keywords count tracking for threshold handling const allResults = results.length - const tokenKeywordsCount: [InternalDocumentID, number][] = [] + const tokenScoreWithKeywordsCount: [InternalDocumentID, number, number][] = [] - for (const tokenKeywordsCountEntry of tokenKeywordsCountMap.entries()) { - tokenKeywordsCount.push(tokenKeywordsCountEntry) + for (const tokenScoreEntry of tokenScoresMap.entries()) { + tokenScoreWithKeywordsCount.push([tokenScoreEntry[0], tokenScoreEntry[1][0], tokenScoreEntry[1][1]]) } // Find the index of the last result with all keywords. - // Note that since score is multipled by 1.5 any time the token is encountered in results it means - // that tokenScores and tokenKeywordsCount should always have the same order. - const keywordsPerToken = tokenKeywordsCount.sort((a, b) => b[1] - a[1]) + // Order the documents by the number of keywords they contain, and then by the score. + const keywordsPerToken = tokenScoreWithKeywordsCount.sort((a, b) => { + // Compare by the third element, higher numbers first + if (a[2] > b[2]) return -1; + if (a[2] < b[2]) return 1; + + // If the third elements are equal, compare by the second element, higher numbers first + if (a[1] > b[1]) return -1; + if (a[1] < b[1]) return 1; + + // If both the second and third elements are equal, consider the elements equal + return 0; + }); let lastTokenWithAllKeywords: number | undefined = undefined for (let i = 0; i < allResults; i++) { - if (keywordsPerToken[i][1] === keywordsCount) { + if (keywordsPerToken[i][2] === keywordsCount) { lastTokenWithAllKeywords = i } else { break @@ -80,9 +89,14 @@ export function prioritizeTokenScores( lastTokenWithAllKeywords = 0 } + const resultsWithIdAndScore: [number, number][] = []; + for (let i = 0; i < keywordsPerToken.length; i++) { + resultsWithIdAndScore.push([keywordsPerToken[i][0], keywordsPerToken[i][1]]); + } + // If threshold is 0, it means we will only return all the results that contains ALL the search terms (exact match) if (threshold === 0) { - return results.slice(0, lastTokenWithAllKeywords + 1) + return resultsWithIdAndScore.slice(0, lastTokenWithAllKeywords + 1) } // If the threshold is between 0 and 1, we will return all the results that contains at least the threshold of search terms @@ -91,7 +105,7 @@ export function prioritizeTokenScores( const thresholdLength = lastTokenWithAllKeywords + Math.ceil((threshold * 100 * (results.length - lastTokenWithAllKeywords)) / 100) - return results.slice(0, results.length + thresholdLength) + return resultsWithIdAndScore.slice(0, results.length + thresholdLength) } export function BM25( From 12804e8f17209b6aa67e9a5249f3236dfbd540ee Mon Sep 17 00:00:00 2001 From: Valtteri Karesto Date: Thu, 22 Feb 2024 17:49:19 +0200 Subject: [PATCH 2/3] Update packages/orama/src/components/algorithms.ts Co-authored-by: Tommaso Allevi --- packages/orama/src/components/algorithms.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/orama/src/components/algorithms.ts b/packages/orama/src/components/algorithms.ts index b81d40442..96c375791 100644 --- a/packages/orama/src/components/algorithms.ts +++ b/packages/orama/src/components/algorithms.ts @@ -89,10 +89,10 @@ export function prioritizeTokenScores( lastTokenWithAllKeywords = 0 } - const resultsWithIdAndScore: [number, number][] = []; + const resultsWithIdAndScore: [number, number][] = new Array(keywordsPerToken.length); for (let i = 0; i < keywordsPerToken.length; i++) { - resultsWithIdAndScore.push([keywordsPerToken[i][0], keywordsPerToken[i][1]]); - } + resultsWithIdAndScore[i] = [keywordsPerToken[i][0], keywordsPerToken[i][1]]; + // If threshold is 0, it means we will only return all the results that contains ALL the search terms (exact match) if (threshold === 0) { From 023d4bdd7fe42850b7a04d18055770491e6095a6 Mon Sep 17 00:00:00 2001 From: Valtteri Karesto Date: Thu, 22 Feb 2024 17:53:11 +0200 Subject: [PATCH 3/3] Github missed a character --- packages/orama/src/components/algorithms.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/orama/src/components/algorithms.ts b/packages/orama/src/components/algorithms.ts index 96c375791..279e97791 100644 --- a/packages/orama/src/components/algorithms.ts +++ b/packages/orama/src/components/algorithms.ts @@ -92,7 +92,7 @@ export function prioritizeTokenScores( const resultsWithIdAndScore: [number, number][] = new Array(keywordsPerToken.length); for (let i = 0; i < keywordsPerToken.length; i++) { resultsWithIdAndScore[i] = [keywordsPerToken[i][0], keywordsPerToken[i][1]]; - + } // If threshold is 0, it means we will only return all the results that contains ALL the search terms (exact match) if (threshold === 0) {