Skip to content

Commit

Permalink
ranking: add phrase boosting to BM25 (#917)
Browse files Browse the repository at this point in the history
With this change we recognize boosted queries in our bm25 scoring and
adjust the overall score accordingly.

We need to take care of 2 parts: The overall bm25 score of the document,
and the line score determining the order in which we return the chunks.

Co-authored-by: Julie Tibshirani <julietibs@apache.org>
  • Loading branch information
stefanhengl and jtibshirani authored Feb 21, 2025
1 parent 456196a commit 3d43fdf
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 8 deletions.
3 changes: 1 addition & 2 deletions index/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,7 @@ nextFileMatch:
}

if opts.UseBM25Scoring {
tf := cp.calculateTermFrequency(finalCands)
d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts)
d.scoreFilesUsingBM25(&fileMatch, nextDoc, finalCands, cp, opts)
} else {
// Use the standard, non-experimental scoring method by default
d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
Expand Down
41 changes: 35 additions & 6 deletions index/score.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (f
}
}
}

score = boostScore(score, ms)
return score, symbolInfo
}

Expand Down Expand Up @@ -263,6 +265,25 @@ func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[st
return termFreqs
}

// boostScore scales score by the largest scoreWeight carried by any match in ms and
// returns the result. Multiplying the whole score mirrors how other search engines such
// as Lucene treat boosts: the boost applies to the final score of an entire query clause.
//
// Only the single largest weight is used (a heuristic), so a boost shared by several
// matches is not applied more than once. The search starts from a weight of 1, so
// weights at or below 1 never change the score, and an empty ms returns score as is.
func boostScore(score float64, ms []*candidateMatch) float64 {
	largest := 1.0
	for i := range ms {
		if w := ms[i].scoreWeight; w > largest {
			largest = w
		}
	}

	// Leave the score untouched when the winning weight is effectively 1
	// (as decided by epsilonEqualsOne), i.e. when nothing was boosted.
	if epsilonEqualsOne(largest) {
		return score
	}
	return score * largest
}

// scoreFile computes a score for the file match using various scoring signals, like
// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) {
Expand Down Expand Up @@ -324,10 +345,11 @@ func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTr
// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how
// frequent it appears in the corpus.
//
// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also
// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) {
// Unlike standard file scoring, this scoring strategy ignores the individual LineMatch and ChunkMatch scores, instead
// calculating a score over all matches in the file.
func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) {
tf := cp.calculateTermFrequency(cands)

// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
k, b := 1.2, 0.75

Expand All @@ -343,12 +365,16 @@ func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32,

L := fileLength / averageFileLength

score := 0.0
bm25Score := 0.0
sumTF := 0 // Just for debugging
for _, f := range tf {
sumTF += f
score += tfScore(k, b, L, f)
bm25Score += tfScore(k, b, L, f)
}

score := boostScore(bm25Score, cands)
boosted := score != bm25Score

// 2 digits of precision
score = math.Trunc(score*100) / 100

Expand All @@ -370,5 +396,8 @@ func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32,
if opts.DebugScore {
// To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker
fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L)
if boosted {
fileMatch.Debug += fmt.Sprintf(" (boosted)")
}
}
}
17 changes: 17 additions & 0 deletions internal/e2e/scoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,23 @@ func TestBM25(t *testing.T) {
wantScore: 3.33,
// line 59: if (System.nanoTime() > System.currentTimeMillis()) {
wantBestLineMatch: 59,
}, {
// phrase boosting
fileName: "example.java",
query: &query.Or{Children: []query.Q{
&query.Boost{Child: &query.Substring{Pattern: "public string apply"}, Boost: 20},
&query.And{Children: []query.Q{
&query.Substring{Pattern: "public"},
&query.Substring{Pattern: "string"},
&query.Substring{Pattern: "apply"},
}},
}},
content: exampleJava,
language: "Java",
// sum-termFrequencies: 40, length-ratio: 1.00
wantScore: 140.80,
// public String apply(String s) {
wantBestLineMatch: 81,
},
{
// Matches only on filename
Expand Down

0 comments on commit 3d43fdf

Please sign in to comment.