ranking: add phrase boosting to BM25 (#917)

With this change we recognize boosted queries in our BM25 scoring and adjust the overall score accordingly. We need to take care of two parts: the overall BM25 score of the document, and the line score that determines the order in which we return the chunks. Co-authored-by: Julie Tibshirani <julietibs@apache.org>
sourcegraph · Feb 21, 2025 · 3d43fdf · 3d43fdf
1 parent 456196a
commit 3d43fdf
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 8 deletions.
diff --git a/index/eval.go b/index/eval.go
@@ -316,8 +316,7 @@ nextFileMatch:
 		}
 
 		if opts.UseBM25Scoring {
-			tf := cp.calculateTermFrequency(finalCands)
-			d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts)
+			d.scoreFilesUsingBM25(&fileMatch, nextDoc, finalCands, cp, opts)
 		} else {
 			// Use the standard, non-experimental scoring method by default
 			d.scoreFile(&fileMatch, nextDoc, mt, known, opts)

diff --git a/index/score.go b/index/score.go
@@ -235,6 +235,8 @@ func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (f
 			}
 		}
 	}
+
+	score = boostScore(score, ms)
 	return score, symbolInfo
 }
 
@@ -263,6 +265,25 @@ func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[st
 	return termFreqs
 }
 
+// boostScore multiplies score by the largest scoreWeight found among ms and returns the result. This
+// follows precedent in other search engines like Lucene, where boosts multiply an entire query clause's
+// final score. As a heuristic, we use the maximum boost across matches to avoid applying the same boost
+// multiple times. The maximum starts at 1.0, so weights at or below 1.0 (and an empty ms) leave the
+// score unchanged.
+func boostScore(score float64, ms []*candidateMatch) float64 {
+	maxScoreWeight := 1.0
+	for _, m := range ms {
+		if m.scoreWeight > maxScoreWeight {
+			maxScoreWeight = m.scoreWeight
+		}
+	}
+
+	if !epsilonEqualsOne(maxScoreWeight) {
+		score = score * maxScoreWeight
+	}
+	return score
+}
+
 // scoreFile computes a score for the file match using various scoring signals, like
 // whether there's an exact match on a symbol, the number of query clauses that matched, etc.
 func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) {
@@ -324,10 +345,11 @@ func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTr
 // keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how
 // frequent it appears in the corpus.
 //
-// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
-// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also
-// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
-func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) {
+// Unlike standard file scoring, this scoring strategy ignores the individual LineMatch and ChunkMatch scores, instead
+// calculating a score over all matches in the file.
+func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) {
+	tf := cp.calculateTermFrequency(cands)
+
 	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
 	k, b := 1.2, 0.75
 
@@ -343,12 +365,16 @@ func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32,
 
 	L := fileLength / averageFileLength
 
-	score := 0.0
+	bm25Score := 0.0
 	sumTF := 0 // Just for debugging
 	for _, f := range tf {
 		sumTF += f
-		score += tfScore(k, b, L, f)
+		bm25Score += tfScore(k, b, L, f)
 	}
+
+	score := boostScore(bm25Score, cands)
+	boosted := score != bm25Score
+
 	// 2 digits of precision
 	score = math.Trunc(score*100) / 100
 
@@ -370,5 +396,8 @@ func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32,
 	if opts.DebugScore {
 		// To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker
 		fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L)
+		if boosted {
+			fileMatch.Debug += fmt.Sprintf(" (boosted)")
+		}
 	}
 }
diff --git a/internal/e2e/scoring_test.go b/internal/e2e/scoring_test.go
@@ -110,6 +110,23 @@ func TestBM25(t *testing.T) {
 			wantScore: 3.33,
 			// line 59: if (System.nanoTime() > System.currentTimeMillis()) {
 			wantBestLineMatch: 59,
+		}, {
+			// phrase boosting
+			fileName: "example.java",
+			query: &query.Or{Children: []query.Q{
+				&query.Boost{Child: &query.Substring{Pattern: "public string apply"}, Boost: 20},
+				&query.And{Children: []query.Q{
+					&query.Substring{Pattern: "public"},
+					&query.Substring{Pattern: "string"},
+					&query.Substring{Pattern: "apply"},
+				}},
+			}},
+			content:  exampleJava,
+			language: "Java",
+			// sum-termFrequencies: 40, length-ratio: 1.00
+			wantScore: 140.80,
+			// public String apply(String s) {
+			wantBestLineMatch: 81,
 		},
 		{
 			// Matches only on filename