Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Code Indexer support wikis too #29726

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions custom/conf/app.example.ini
Original file line number Diff line number Diff line change
Expand Up @@ -1406,9 +1406,9 @@ LEVEL = Info
;; repo indexer by default disabled, since it uses a lot of disk space
;REPO_INDEXER_ENABLED = false
;;
;; repo indexer units, the items to index, could be `sources`, `forks`, `mirrors`, `templates` or any combination of them separated by a comma.
;; repo indexer units, the items to index, could be `sources`, `forks`, `mirrors`, `templates`, `wikis` or any combination of them separated by a comma.
;; If empty, it defaults to `sources` only. To disable code indexing entirely, see REPO_INDEXER_ENABLED.
;REPO_INDEXER_REPO_TYPES = sources,forks,mirrors,templates
;REPO_INDEXER_REPO_TYPES = sources,forks,mirrors,templates,wikis
;;
;; Code search engine type, could be `bleve` or `elasticsearch`.
;REPO_INDEXER_TYPE = bleve
Expand Down
2 changes: 1 addition & 1 deletion docs/content/administration/config-cheat-sheet.en-us.md
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ relation to port exhaustion.
- `ISSUE_INDEXER_PATH`: **indexers/issues.bleve**: Index file used for issue search; available when ISSUE_INDEXER_TYPE is bleve or elasticsearch. Relative paths will be made absolute against _`AppWorkPath`_.

- `REPO_INDEXER_ENABLED`: **false**: Enables code search (uses a lot of disk space, about 6 times more than the repository size).
- `REPO_INDEXER_REPO_TYPES`: **sources,forks,mirrors,templates**: Repo indexer units. The items to index could be `sources`, `forks`, `mirrors`, `templates` or any combination of them separated by a comma. If empty then it defaults to `sources` only, as if you'd like to disable fully please see `REPO_INDEXER_ENABLED`.
- `REPO_INDEXER_REPO_TYPES`: **sources,forks,mirrors,templates,wikis**: Repo indexer units. The items to index could be `sources`, `forks`, `mirrors`, `templates`, `wikis` or any combination of them separated by a comma. If empty, it defaults to `sources` only. To disable code indexing entirely, see `REPO_INDEXER_ENABLED`.
- `REPO_INDEXER_TYPE`: **bleve**: Code search engine type, could be `bleve` or `elasticsearch`.
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search.
- `REPO_INDEXER_CONN_STR`: ****: Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch. i.e. http://elastic:password@localhost:9200
Expand Down
2 changes: 2 additions & 0 deletions models/repo/repo_indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ const (
RepoIndexerTypeCode RepoIndexerType = iota // 0
// RepoIndexerTypeStats repository stats indexer
RepoIndexerTypeStats // 1
// RepoIndexerTypeWiki wiki indexer
RepoIndexerTypeWiki // 2
)

// RepoIndexerStatus status of a repo's entry in the repo indexer
Expand Down
61 changes: 45 additions & 16 deletions modules/indexer/code/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/optional"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/typesniffer"
Expand Down Expand Up @@ -51,6 +52,7 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
RepoID int64
IsWiki bool
CommitID string
Content string
Language string
Expand All @@ -65,7 +67,7 @@ func (d *RepoIndexerData) Type() string {
// Identifiers for the bleve code index.
// NOTE(review): their usage is not visible in this hunk; presumably the
// analyzer and doc-type names are registered in the index mapping and the
// version marks the on-disk index format — confirm against
// generateBleveIndexMapping and NewIndexer.
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 6
repoIndexerLatestVersion = 7
)

// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
Expand All @@ -75,6 +77,10 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
numericFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)

boolFieldMapping := bleve.NewBooleanFieldMapping()
boolFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("IsWiki", boolFieldMapping)

textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
Expand Down Expand Up @@ -125,7 +131,7 @@ func NewIndexer(indexDir string) *Indexer {
}

func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
update internal.FileUpdate, repo *repo_model.Repository, isWiki bool, batch *inner_bleve.FlushingBatch,
) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
Expand All @@ -134,10 +140,15 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro

size := update.Size

repoPath := repo.RepoPath()
if isWiki {
repoPath = repo.WikiPath()
}

var err error
if !update.Sized {
var stdout string
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repoPath})
if err != nil {
return err
}
Expand All @@ -147,7 +158,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
}

if size > setting.Indexer.MaxIndexerFileSize {
return b.addDelete(update.Filename, repo, batch)
return b.addDelete(update.Filename, repo, isWiki, batch)
}

if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
Expand All @@ -170,53 +181,65 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
if _, err = batchReader.Discard(1); err != nil {
return err
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
id := internal.FilenameIndexerID(repo.ID, isWiki, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
IsWiki: isWiki,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}

// addDelete calls batch.Delete with the document ID that
// internal.FilenameIndexerID derives from the repository ID, the isWiki flag
// and filename, and returns whatever error batch.Delete reports.
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
id := internal.FilenameIndexerID(repo.ID, filename)
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, isWiki bool, batch *inner_bleve.FlushingBatch) error {
id := internal.FilenameIndexerID(repo.ID, isWiki, filename)
return batch.Delete(id)
}

// Index applies changes to the bleve index for repo at commit sha.
// When isWiki is true the wiki repository path (repo.WikiPath()) is used for
// the git operations instead of the code repository path (repo.RepoPath()).
// Every entry in changes.Updates is passed to addUpdate and every entry in
// changes.RemovedFilenames to addDelete; the batch is flushed at the end.
// The first error encountered is returned and aborts the remaining work.
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, isWiki bool, sha string, changes *internal.RepoChanges) error {
// Select which on-disk git repository the blobs are read from.
repoPath := repo.RepoPath()
if isWiki {
repoPath = repo.WikiPath()
}

batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
// The cat-file batch process is only started when there is something to read.
if len(changes.Updates) > 0 {

// Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
if err := git.EnsureValidGitRepository(ctx, repoPath); err != nil {
log.Error("Unable to open git repo: %s for %-v: %v", repoPath, repo, err)
return err
}

batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repoPath)
// The deferred cancel covers the early-return paths below.
defer cancel()

for _, update := range changes.Updates {
if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, batch); err != nil {
if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, isWiki, batch); err != nil {
return err
}
}
// Explicit cancel once all updates are read, before the deletions are processed.
// NOTE(review): presumably this stops the cat-file process early — confirm against git.CatFileBatch.
cancel()
}
for _, filename := range changes.RemovedFilenames {
if err := b.addDelete(filename, repo, batch); err != nil {
if err := b.addDelete(filename, repo, isWiki, batch); err != nil {
return err
}
}
return batch.Flush()
}

// Delete deletes indexes by ids
func (b *Indexer) Delete(_ context.Context, repoID int64) error {
query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
func (b *Indexer) Delete(_ context.Context, repoID int64, isWiki optional.Option[bool]) error {
var query query.Query
query = inner_bleve.NumericEqualityQuery(repoID, "RepoID")
if isWiki.Has() {
wikiQuery := bleve.NewBoolFieldQuery(isWiki.Value())
wikiQuery.FieldVal = "IsWiki"
query = bleve.NewConjunctionQuery(query, wikiQuery)
}
searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
result, err := b.inner.Indexer.Search(searchRequest)
if err != nil {
Expand Down Expand Up @@ -264,6 +287,12 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
indexerQuery = keywordQuery
}

if opts.IsWiki.Has() {
wikiQuery := bleve.NewBoolFieldQuery(opts.IsWiki.Value())
wikiQuery.FieldVal = "IsWiki"
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, wikiQuery)
}

// Save for reuse without language filter
facetQuery := indexerQuery
if len(opts.Language) > 0 {
Expand All @@ -279,7 +308,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int

from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.Fields = []string{"Content", "RepoID", "IsWiki", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true

if len(opts.Language) == 0 {
Expand Down
62 changes: 46 additions & 16 deletions modules/indexer/code/elasticsearch/elasticsearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
"code.gitea.io/gitea/modules/json"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/optional"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/typesniffer"
Expand All @@ -29,7 +30,7 @@ import (
)

const (
esRepoIndexerLatestVersion = 1
esRepoIndexerLatestVersion = 2
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
Expand Down Expand Up @@ -62,6 +63,10 @@ const (
"type": "long",
"index": true
},
"is_wiki": {
"type": "boolean",
"index": true
},
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
Expand All @@ -84,17 +89,22 @@ const (
}`
)

func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository, isWiki bool) ([]elastic.BulkableRequest, error) {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
return nil, nil
}

size := update.Size
repoPath := repo.RepoPath()
if isWiki {
repoPath = repo.WikiPath()
}

var err error
if !update.Sized {
var stdout string
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repoPath})
if err != nil {
return nil, err
}
Expand All @@ -104,7 +114,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
}

if size > setting.Indexer.MaxIndexerFileSize {
return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
return []elastic.BulkableRequest{b.addDelete(update.Filename, repo, isWiki)}, nil
}

if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
Expand All @@ -127,14 +137,15 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
if _, err = batchReader.Discard(1); err != nil {
return nil, err
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
id := internal.FilenameIndexerID(repo.ID, isWiki, update.Filename)

return []elastic.BulkableRequest{
elastic.NewBulkIndexRequest().
Index(b.inner.VersionedIndexName()).
Id(id).
Doc(map[string]any{
"repo_id": repo.ID,
"is_wiki": isWiki,
"content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
"commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
Expand All @@ -143,28 +154,33 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
}, nil
}

// addDelete builds a bulk delete request against the versioned index for the
// document whose ID internal.FilenameIndexerID derives from the repository ID,
// the isWiki flag and filename. The request is only constructed and returned
// here; it is not executed.
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
id := internal.FilenameIndexerID(repo.ID, filename)
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, isWiki bool) elastic.BulkableRequest {
id := internal.FilenameIndexerID(repo.ID, isWiki, filename)
return elastic.NewBulkDeleteRequest().
Index(b.inner.VersionedIndexName()).
Id(id)
}

// Index will save the index data
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, isWiki bool, sha string, changes *internal.RepoChanges) error {
repoPath := repo.RepoPath()
if isWiki {
repoPath = repo.WikiPath()
}

reqs := make([]elastic.BulkableRequest, 0)
if len(changes.Updates) > 0 {
// Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
if err := git.EnsureValidGitRepository(ctx, repoPath); err != nil {
log.Error("Unable to open git repo: %s for %-v: %v", repoPath, repo, err)
return err
}

batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repoPath)
defer cancel()

for _, update := range changes.Updates {
updateReqs, err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo)
updateReqs, err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, isWiki)
if err != nil {
return err
}
Expand All @@ -176,7 +192,7 @@ func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha st
}

for _, filename := range changes.RemovedFilenames {
reqs = append(reqs, b.addDelete(filename, repo))
reqs = append(reqs, b.addDelete(filename, repo, isWiki))
}

if len(reqs) > 0 {
Expand All @@ -196,9 +212,14 @@ func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha st
}

// Delete issues a delete-by-query against the versioned index for the
// documents whose repo_id matches repoID. When isWiki holds a value, an
// additional is_wiki term must also match, restricting the deletion to
// documents with that flag; when it is unset only repo_id is matched.
// The error from executing the request is returned as-is.
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
func (b *Indexer) Delete(ctx context.Context, repoID int64, isWiki optional.Option[bool]) error {
query := elastic.NewBoolQuery()
// Every matching document must belong to the given repository.
query = query.Must(elastic.NewTermsQuery("repo_id", repoID))
if isWiki.Has() {
query = query.Must(elastic.NewTermQuery("is_wiki", isWiki.Value()))
}
_, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()).
Query(elastic.NewTermsQuery("repo_id", repoID)).
Query(query).
Do(ctx)
return err
}
Expand Down Expand Up @@ -239,7 +260,11 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
panic(fmt.Sprintf("2===%#v", hit.Highlight))
}

repoID, fileName := internal.ParseIndexerID(hit.Id)
repoID, isWiki, fileName, err := internal.ParseIndexerID(hit.Id)
if err != nil {
return 0, nil, nil, err
}

res := make(map[string]any)
if err := json.Unmarshal(hit.Source, &res); err != nil {
return 0, nil, nil, err
Expand All @@ -249,6 +274,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)

hits = append(hits, &internal.SearchResult{
RepoID: repoID,
IsWiki: isWiki,
Filename: fileName,
CommitID: res["commit_id"].(string),
Content: res["content"].(string),
Expand Down Expand Up @@ -299,6 +325,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
query = query.Must(repoQuery)
}

if opts.IsWiki.Has() {
query = query.Must(elastic.NewTermQuery("is_wiki", opts.IsWiki.Value()))
}

var (
start, pageSize = opts.GetSkipTake()
kw = "<em>" + opts.Keyword + "</em>"
Expand Down
Loading