Skip to content

Patch in exact search for meilisearch #29671

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Mar 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions modules/indexer/code/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,21 +233,21 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error {

// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var (
indexerQuery query.Query
keywordQuery query.Query
)

if isMatch {
prefixQuery := bleve.NewPrefixQuery(keyword)
prefixQuery.FieldVal = "Content"
keywordQuery = prefixQuery
} else {
if isFuzzy {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
} else {
prefixQuery := bleve.NewPrefixQuery(keyword)
prefixQuery.FieldVal = "Content"
keywordQuery = prefixQuery
}

if len(repoIDs) > 0 {
Expand Down
8 changes: 4 additions & 4 deletions modules/indexer/code/elasticsearch/elasticsearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,10 +281,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
}

// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
searchType := esMultiMatchTypeBestFields
if isMatch {
searchType = esMultiMatchTypePhrasePrefix
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
searchType := esMultiMatchTypePhrasePrefix
if isFuzzy {
searchType = esMultiMatchTypeBestFields
}

kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
Expand Down
2 changes: 1 addition & 1 deletion modules/indexer/code/indexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {

for _, kw := range keywords {
t.Run(kw.Keyword, func(t *testing.T) {
total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, false)
total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, true)
assert.NoError(t, err)
assert.Len(t, kw.IDs, int(total))
assert.Len(t, langs, kw.Langs)
Expand Down
4 changes: 2 additions & 2 deletions modules/indexer/code/internal/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ type Indexer interface {
internal.Indexer
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
Delete(ctx context.Context, repoID int64) error
Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
}

// NewDummyIndexer returns a dummy indexer
Expand All @@ -38,6 +38,6 @@ func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
return fmt.Errorf("indexer is not ready")
}

func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
// Search performs no search: it returns a zero total, nil results and nil languages
// together with an "indexer is not ready" error.
func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
	var (
		total     int64
		results   []*SearchResult
		languages []*SearchResultLanguages
	)
	return total, results, languages, fmt.Errorf("indexer is not ready")
}
5 changes: 3 additions & 2 deletions modules/indexer/code/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,13 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
}

// PerformSearch perform a search on a repository
func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
if len(keyword) == 0 {
return 0, nil, nil, nil
}

total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isMatch)
total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isFuzzy)
if err != nil {
return 0, nil, nil, err
}
Expand Down
7 changes: 7 additions & 0 deletions modules/indexer/internal/bleve/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQue
return q
}

// PrefixQuery builds a bleve prefix query for matchPrefix that is restricted to the given field.
func PrefixQuery(matchPrefix, field string) *query.PrefixQuery {
	prefixQuery := bleve.NewPrefixQuery(matchPrefix)
	prefixQuery.FieldVal = field
	return prefixQuery
}

// BoolFieldQuery generates a bool field query for the given value and field
func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
q := bleve.NewBoolFieldQuery(value)
Expand Down
17 changes: 12 additions & 5 deletions modules/indexer/issues/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,19 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
var queries []query.Query

if options.Keyword != "" {
keywordQueries := []query.Query{
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
if options.IsFuzzyKeyword {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
}...))
} else {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.PrefixQuery(options.Keyword, "title"),
inner_bleve.PrefixQuery(options.Keyword, "content"),
inner_bleve.PrefixQuery(options.Keyword, "comments"),
}...))
}
queries = append(queries, bleve.NewDisjunctionQuery(keywordQueries...))
}

if len(options.RepoIDs) > 0 || options.AllPublic {
Expand Down
12 changes: 11 additions & 1 deletion modules/indexer/issues/elasticsearch/elasticsearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ import (

const (
issueIndexerLatestVersion = 1
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
esMultiMatchTypePhrasePrefix = "phrase_prefix"
)

var _ internal.Indexer = &Indexer{}
Expand Down Expand Up @@ -141,7 +145,13 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
query := elastic.NewBoolQuery()

if options.Keyword != "" {
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments"))

searchType := esMultiMatchTypePhrasePrefix
if options.IsFuzzyKeyword {
searchType = esMultiMatchTypeBestFields
}

query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
}

if len(options.RepoIDs) > 0 {
Expand Down
2 changes: 2 additions & 0 deletions modules/indexer/issues/internal/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ type SearchResult struct {
type SearchOptions struct {
Keyword string // keyword to search

IsFuzzyKeyword bool // if false the levenshtein distance is 0

RepoIDs []int64 // repository IDs which the issues belong to
AllPublic bool // if include all public repositories

Expand Down
91 changes: 85 additions & 6 deletions modules/indexer/issues/meilisearch/meilisearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package meilisearch

import (
"context"
"errors"
"strconv"
"strings"

Expand All @@ -16,12 +17,15 @@ import (
)

const (
issueIndexerLatestVersion = 2
issueIndexerLatestVersion = 3

// TODO: make this configurable if necessary
maxTotalHits = 10000
)

// ErrMalformedResponse is never expected as we initialize the indexer ourselves and so define the types.
var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")

var _ internal.Indexer = &Indexer{}

// Indexer implements Indexer interface
Expand All @@ -47,6 +51,9 @@ func NewIndexer(url, apiKey, indexerName string) *Indexer {
},
DisplayedAttributes: []string{
"id",
"title",
"content",
"comments",
},
FilterableAttributes: []string{
"repo_id",
Expand Down Expand Up @@ -221,11 +228,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
return nil, err
}

hits := make([]internal.Match, 0, len(searchRes.Hits))
for _, hit := range searchRes.Hits {
hits = append(hits, internal.Match{
ID: int64(hit.(map[string]any)["id"].(float64)),
})
hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword)
if err != nil {
return nil, err
}

return &internal.SearchResult{
Expand All @@ -241,3 +246,77 @@ func parseSortBy(sortBy internal.SortBy) string {
}
return field + ":asc"
}

// nonFuzzyWorkaround converts the hits of a meilisearch response into matches.
// When isFuzzy is false it additionally drops every hit whose title, content and
// comments do not contain the keyword (compared case-insensitively).
//
// This is needed as meilisearch does not have an exact search
// and you can only change "typo tolerance" per index. So we have to post-filter the results
// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance
// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed
//
// ErrMalformedResponse is returned as soon as a hit does not have the expected shape.
func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) {
	// every hit is compared against the same keyword, so lowercase it only once
	if !isFuzzy {
		keyword = strings.ToLower(keyword)
	}

	hits := make([]internal.Match, 0, len(searchRes.Hits))
	for _, rawHit := range searchRes.Hits {
		hit, ok := rawHit.(map[string]any)
		if !ok {
			return nil, ErrMalformedResponse
		}

		if !isFuzzy {
			found, err := hitContainsKeyword(hit, keyword)
			if err != nil {
				return nil, err
			}
			if !found {
				continue
			}
		}

		issueID, ok := hit["id"].(float64)
		if !ok {
			return nil, ErrMalformedResponse
		}
		hits = append(hits, internal.Match{
			ID: int64(issueID),
		})
	}
	return hits, nil
}

// hitContainsKeyword reports whether the title, the content or at least one comment of hit
// contains lowerKeyword. The fields of hit are lowercased before the comparison, so
// lowerKeyword must already be lowercase.
// It returns ErrMalformedResponse if "title" or "content" is not a string, if "comments" is
// not a list, or if an inspected comment is not a string.
func hitContainsKeyword(hit map[string]any, lowerKeyword string) (bool, error) {
	// check if title match first
	title, ok := hit["title"].(string)
	if !ok {
		return false, ErrMalformedResponse
	}
	if strings.Contains(strings.ToLower(title), lowerKeyword) {
		return true, nil
	}

	// check if content has a match
	content, ok := hit["content"].(string)
	if !ok {
		return false, ErrMalformedResponse
	}
	if strings.Contains(strings.ToLower(content), lowerKeyword) {
		return true, nil
	}

	// now check the comments and report as soon as we detect one match;
	// an empty list simply falls through to "no match"
	comments, ok := hit["comments"].([]any)
	if !ok {
		return false, ErrMalformedResponse
	}
	for _, rawComment := range comments {
		comment, ok := rawComment.(string)
		if !ok {
			return false, ErrMalformedResponse
		}
		if strings.Contains(strings.ToLower(comment), lowerKeyword) {
			return true, nil
		}
	}

	// we got no match
	return false, nil
}
45 changes: 45 additions & 0 deletions modules/indexer/issues/meilisearch/meilisearch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ import (
"testing"
"time"

"code.gitea.io/gitea/modules/indexer/issues/internal"
"code.gitea.io/gitea/modules/indexer/issues/internal/tests"

"github.com/meilisearch/meilisearch-go"
"github.com/stretchr/testify/assert"
)

func TestMeilisearchIndexer(t *testing.T) {
Expand Down Expand Up @@ -48,3 +52,44 @@ func TestMeilisearchIndexer(t *testing.T) {

tests.TestIndexer(t, indexer)
}

func TestNonFuzzyWorkaround(t *testing.T) {
// get unexpected return
_, err := nonFuzzyWorkaround(&meilisearch.SearchResponse{
Hits: []any{"aa", "bb", "cc", "dd"},
}, "bowling", false)
assert.ErrorIs(t, err, ErrMalformedResponse)

validResponse := &meilisearch.SearchResponse{
Hits: []any{
map[string]any{
"id": float64(11),
"title": "a title",
"content": "issue body with no match",
"comments": []any{"hey whats up?", "I'm currently bowling", "nice"},
},
map[string]any{
"id": float64(22),
"title": "Bowling as title",
"content": "",
"comments": []any{},
},
map[string]any{
"id": float64(33),
"title": "Bowl-ing as fuzzy match",
"content": "",
"comments": []any{},
},
},
}

// nonFuzzy
hits, err := nonFuzzyWorkaround(validResponse, "bowling", false)
assert.NoError(t, err)
assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}}, hits)

// fuzzy
hits, err = nonFuzzyWorkaround(validResponse, "bowling", true)
assert.NoError(t, err)
assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits)
}
4 changes: 2 additions & 2 deletions routers/web/explore/code.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func Code(ctx *context.Context) {
keyword := ctx.FormTrim("q")

queryType := ctx.FormTrim("t")
isMatch := queryType == "match"
isFuzzy := queryType != "match"

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
Expand Down Expand Up @@ -77,7 +77,7 @@ func Code(ctx *context.Context) {
)

if (len(repoIDs) > 0) || isAdmin {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil {
if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err)
Expand Down
4 changes: 2 additions & 2 deletions routers/web/repo/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func Search(ctx *context.Context) {
keyword := ctx.FormTrim("q")

queryType := ctx.FormTrim("t")
isMatch := queryType == "match"
isFuzzy := queryType != "match"

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
Expand All @@ -43,7 +43,7 @@ func Search(ctx *context.Context) {
}

total, searchResults, searchResultLanguages, err := code_indexer.PerformSearch(ctx, []int64{ctx.Repo.Repository.ID},
language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil {
if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err)
Expand Down
4 changes: 2 additions & 2 deletions routers/web/user/code.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func CodeSearch(ctx *context.Context) {
keyword := ctx.FormTrim("q")

queryType := ctx.FormTrim("t")
isMatch := queryType == "match"
isFuzzy := queryType != "match"

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
Expand Down Expand Up @@ -75,7 +75,7 @@ func CodeSearch(ctx *context.Context) {
)

if len(repoIDs) > 0 {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil {
if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err)
Expand Down