From 2a278b996fd6608973c3ab2a2cfb584e67d5bd8b Mon Sep 17 00:00:00 2001
From: KN4CK3R <admin@oldschoolhack.me>
Date: Fri, 23 Feb 2024 18:24:27 +0100
Subject: [PATCH] Add support for `linguist-detectable` and
 `linguist-documentation` (#29267)

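Add support for the `linguist-detectable` and `linguist-documentation`
`.gitattributes` overrides when calculating repository language statistics,
and add integration tests covering the linguist attributes.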


https://github.com/github-linguist/linguist/blob/master/docs/overrides.md#detectable

https://github.com/github-linguist/linguist/blob/master/docs/overrides.md#documentation
---
 modules/git/repo_attribute.go              |  23 +-
 modules/git/repo_language_stats_gogit.go   |  75 +++---
 modules/git/repo_language_stats_nogogit.go |  75 +++---
 tests/integration/linguist_test.go         | 259 +++++++++++++++++++++
 4 files changed, 363 insertions(+), 69 deletions(-)
 create mode 100644 tests/integration/linguist_test.go

diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go
index 2b34f117f72f7..44f13ddc2d172 100644
--- a/modules/git/repo_attribute.go
+++ b/modules/git/repo_attribute.go
@@ -11,6 +11,7 @@ import (
 	"os"
 
 	"code.gitea.io/gitea/modules/log"
+	"code.gitea.io/gitea/modules/optional"
 )
 
 // CheckAttributeOpts represents the possible options to CheckAttribute
@@ -291,7 +292,7 @@ func (repo *Repository) CheckAttributeReader(commitID string) (*CheckAttributeRe
 	}
 
 	checker := &CheckAttributeReader{
-		Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language", "gitlab-language"},
+		Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language", "gitlab-language", "linguist-documentation", "linguist-detectable"},
 		Repo:       repo,
 		IndexFile:  indexFilename,
 		WorkTree:   worktree,
@@ -316,3 +317,23 @@ func (repo *Repository) CheckAttributeReader(commitID string) (*CheckAttributeRe
 
 	return checker, deferable
 }
+
+// attributeToBool returns Some(true) for "set"/"true", Some(false) for "unset"/"false", and None for a missing, "unspecified" or any other value.
+func attributeToBool(attr map[string]string, name string) optional.Option[bool] {
+	if value, has := attr[name]; has && value != "unspecified" {
+		switch value {
+		case "set", "true":
+			return optional.Some(true)
+		case "unset", "false":
+			return optional.Some(false)
+		}
+	}
+	return optional.None[bool]()
+}
+
+func attributeToString(attr map[string]string, name string) optional.Option[string] { // Some(raw value) when the attribute is present, None otherwise
+	if value, has := attr[name]; has && value != "unspecified" { // "unspecified" is treated like a missing key; an empty value still yields Some("")
+		return optional.Some(value)
+	}
+	return optional.None[string]()
+}
diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go
index 4c6fbd6c7efbd..99c7a894d5192 100644
--- a/modules/git/repo_language_stats_gogit.go
+++ b/modules/git/repo_language_stats_gogit.go
@@ -11,6 +11,7 @@ import (
 	"strings"
 
 	"code.gitea.io/gitea/modules/analyze"
+	"code.gitea.io/gitea/modules/optional"
 
 	"github.com/go-enry/go-enry/v2"
 	"github.com/go-git/go-git/v5"
@@ -57,25 +58,47 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			return nil
 		}
 
-		notVendored := false
-		notGenerated := false
+		isVendored := optional.None[bool]()
+		isGenerated := optional.None[bool]()
+		isDocumentation := optional.None[bool]()
+		isDetectable := optional.None[bool]()
 
 		if checker != nil {
 			attrs, err := checker.CheckPath(f.Name)
 			if err == nil {
-				if vendored, has := attrs["linguist-vendored"]; has {
-					if vendored == "set" || vendored == "true" {
-						return nil
-					}
-					notVendored = vendored == "false"
+				isVendored = attributeToBool(attrs, "linguist-vendored")
+				if isVendored.ValueOrDefault(false) {
+					return nil
+				}
+
+				isGenerated = attributeToBool(attrs, "linguist-generated")
+				if isGenerated.ValueOrDefault(false) {
+					return nil
 				}
-				if generated, has := attrs["linguist-generated"]; has {
-					if generated == "set" || generated == "true" {
-						return nil
+
+				isDocumentation = attributeToBool(attrs, "linguist-documentation")
+				if isDocumentation.ValueOrDefault(false) {
+					return nil
+				}
+
+				isDetectable = attributeToBool(attrs, "linguist-detectable")
+				if !isDetectable.ValueOrDefault(true) {
+					return nil
+				}
+
+				hasLanguage := attributeToString(attrs, "linguist-language")
+				if hasLanguage.Value() == "" {
+					hasLanguage = attributeToString(attrs, "gitlab-language")
+					if hasLanguage.Has() {
+						language := hasLanguage.Value()
+						if idx := strings.IndexByte(language, '?'); idx >= 0 {
+							hasLanguage = optional.Some(language[:idx])
+						}
 					}
-					notGenerated = generated == "false"
 				}
-				if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
+				if hasLanguage.Value() != "" {
+					language := hasLanguage.Value()
+
 					// group languages, such as Pug -> HTML; SCSS -> CSS
 					group := enry.GetLanguageGroup(language)
 					if len(group) != 0 {
@@ -85,28 +108,14 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 					// this language will always be added to the size
 					sizes[language] += f.Size
 					return nil
-				} else if language, has := attrs["gitlab-language"]; has && language != "unspecified" && language != "" {
-					// strip off a ? if present
-					if idx := strings.IndexByte(language, '?'); idx >= 0 {
-						language = language[:idx]
-					}
-					if len(language) != 0 {
-						// group languages, such as Pug -> HTML; SCSS -> CSS
-						group := enry.GetLanguageGroup(language)
-						if len(group) != 0 {
-							language = group
-						}
-
-						// this language will always be added to the size
-						sizes[language] += f.Size
-						return nil
-					}
 				}
 			}
 		}
 
-		if (!notVendored && analyze.IsVendor(f.Name)) || enry.IsDotFile(f.Name) ||
-			enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
+		if (!isVendored.Has() && analyze.IsVendor(f.Name)) ||
+			enry.IsDotFile(f.Name) ||
+			(!isDocumentation.Has() && enry.IsDocumentation(f.Name)) ||
+			enry.IsConfiguration(f.Name) {
 			return nil
 		}
 
@@ -115,12 +124,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		if f.Size <= bigFileSize {
 			content, _ = readFile(f, fileSizeLimit)
 		}
-		if !notGenerated && enry.IsGenerated(f.Name, content) {
+		if !isGenerated.Has() && enry.IsGenerated(f.Name, content) {
 			return nil
 		}
 
-		// TODO: Use .gitattributes file for linguist overrides
-
 		language := analyze.GetCodeLanguage(f.Name, content)
 		if language == enry.OtherLanguage || language == "" {
 			return nil
@@ -138,7 +145,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			included = langtype == enry.Programming || langtype == enry.Markup
 			includedLanguage[language] = included
 		}
-		if included {
+		if included || isDetectable.ValueOrDefault(false) {
 			sizes[language] += f.Size
 		} else if len(sizes) == 0 && (firstExcludedLanguage == "" || firstExcludedLanguage == language) {
 			firstExcludedLanguage = language
diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go
index d68d7d210a38d..16669924d65bb 100644
--- a/modules/git/repo_language_stats_nogogit.go
+++ b/modules/git/repo_language_stats_nogogit.go
@@ -12,6 +12,7 @@ import (
 
 	"code.gitea.io/gitea/modules/analyze"
 	"code.gitea.io/gitea/modules/log"
+	"code.gitea.io/gitea/modules/optional"
 
 	"github.com/go-enry/go-enry/v2"
 )
@@ -88,25 +89,47 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			continue
 		}
 
-		notVendored := false
-		notGenerated := false
+		isVendored := optional.None[bool]()
+		isGenerated := optional.None[bool]()
+		isDocumentation := optional.None[bool]()
+		isDetectable := optional.None[bool]()
 
 		if checker != nil {
 			attrs, err := checker.CheckPath(f.Name())
 			if err == nil {
-				if vendored, has := attrs["linguist-vendored"]; has {
-					if vendored == "set" || vendored == "true" {
-						continue
-					}
-					notVendored = vendored == "false"
+				isVendored = attributeToBool(attrs, "linguist-vendored")
+				if isVendored.ValueOrDefault(false) {
+					continue
+				}
+
+				isGenerated = attributeToBool(attrs, "linguist-generated")
+				if isGenerated.ValueOrDefault(false) {
+					continue
 				}
-				if generated, has := attrs["linguist-generated"]; has {
-					if generated == "set" || generated == "true" {
-						continue
+
+				isDocumentation = attributeToBool(attrs, "linguist-documentation")
+				if isDocumentation.ValueOrDefault(false) {
+					continue
+				}
+
+				isDetectable = attributeToBool(attrs, "linguist-detectable")
+				if !isDetectable.ValueOrDefault(true) {
+					continue
+				}
+
+				hasLanguage := attributeToString(attrs, "linguist-language")
+				if hasLanguage.Value() == "" {
+					hasLanguage = attributeToString(attrs, "gitlab-language")
+					if hasLanguage.Has() {
+						language := hasLanguage.Value()
+						if idx := strings.IndexByte(language, '?'); idx >= 0 {
+							hasLanguage = optional.Some(language[:idx])
+						}
 					}
-					notGenerated = generated == "false"
 				}
-				if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
+				if hasLanguage.Value() != "" {
+					language := hasLanguage.Value()
+
 					// group languages, such as Pug -> HTML; SCSS -> CSS
 					group := enry.GetLanguageGroup(language)
 					if len(group) != 0 {
@@ -116,29 +139,14 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 					// this language will always be added to the size
 					sizes[language] += f.Size()
 					continue
-				} else if language, has := attrs["gitlab-language"]; has && language != "unspecified" && language != "" {
-					// strip off a ? if present
-					if idx := strings.IndexByte(language, '?'); idx >= 0 {
-						language = language[:idx]
-					}
-					if len(language) != 0 {
-						// group languages, such as Pug -> HTML; SCSS -> CSS
-						group := enry.GetLanguageGroup(language)
-						if len(group) != 0 {
-							language = group
-						}
-
-						// this language will always be added to the size
-						sizes[language] += f.Size()
-						continue
-					}
 				}
-
 			}
 		}
 
-		if (!notVendored && analyze.IsVendor(f.Name())) || enry.IsDotFile(f.Name()) ||
-			enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
+		if (!isVendored.Has() && analyze.IsVendor(f.Name())) ||
+			enry.IsDotFile(f.Name()) ||
+			(!isDocumentation.Has() && enry.IsDocumentation(f.Name())) ||
+			enry.IsConfiguration(f.Name()) {
 			continue
 		}
 
@@ -170,7 +178,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 				return nil, err
 			}
 		}
-		if !notGenerated && enry.IsGenerated(f.Name(), content) {
+		if !isGenerated.Has() && enry.IsGenerated(f.Name(), content) {
 			continue
 		}
 
@@ -193,13 +201,12 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			included = langType == enry.Programming || langType == enry.Markup
 			includedLanguage[language] = included
 		}
-		if included {
+		if included || isDetectable.ValueOrDefault(false) {
 			sizes[language] += f.Size()
 		} else if len(sizes) == 0 && (firstExcludedLanguage == "" || firstExcludedLanguage == language) {
 			firstExcludedLanguage = language
 			firstExcludedLanguageSize += f.Size()
 		}
-		continue
 	}
 
 	// If there are no included languages add the first excluded language
diff --git a/tests/integration/linguist_test.go b/tests/integration/linguist_test.go
new file mode 100644
index 0000000000000..e569de93a8566
--- /dev/null
+++ b/tests/integration/linguist_test.go
@@ -0,0 +1,259 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package integration
+
+import (
+	"context"
+	"net/url"
+	"strings"
+	"testing"
+	"time"
+
+	"code.gitea.io/gitea/models/db"
+	repo_model "code.gitea.io/gitea/models/repo"
+	"code.gitea.io/gitea/models/unittest"
+	user_model "code.gitea.io/gitea/models/user"
+	"code.gitea.io/gitea/modules/git"
+	"code.gitea.io/gitea/modules/indexer/stats"
+	"code.gitea.io/gitea/modules/queue"
+	repo_service "code.gitea.io/gitea/services/repository"
+	files_service "code.gitea.io/gitea/services/repository/files"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestLinguist(t *testing.T) { // checks that .gitattributes linguist overrides are reflected in the repository language stats
+	onGiteaRun(t, func(t *testing.T, _ *url.URL) {
+		user := unittest.AssertExistsAndLoadBean(t, &user_model.User{ID: 2})
+
+		cppContent := "#include <iostream>\nint main() {\nstd::cout << \"Hello Gitea!\";\nreturn 0;\n}"
+		pyContent := "print(\"Hello Gitea!\")"
+		phpContent := "<?php\necho 'Hallo Welt';\n?>"
+		lockContent := "# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand."
+		mdContent := "markdown"
+
+		cases := []struct {
+			GitAttributesContent  string
+			FilesToAdd            []*files_service.ChangeRepoFile
+			ExpectedLanguageOrder []string
+		}{
+			// case 0: no files besides the (empty) .gitattributes, so no languages are expected
+			{
+				ExpectedLanguageOrder: []string{},
+			},
+			// case 1: plain source files without any override are all reported
+			{
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "cplusplus.cpp",
+						ContentReader: strings.NewReader(cppContent),
+					},
+					{
+						TreePath:      "python.py",
+						ContentReader: strings.NewReader(pyContent),
+					},
+					{
+						TreePath:      "php.php",
+						ContentReader: strings.NewReader(phpContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"C++", "PHP", "Python"},
+			},
+			// case 2: without overrides, the dot file and the vendor/ file are expected to be left out
+			{
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      ".cplusplus.cpp",
+						ContentReader: strings.NewReader(cppContent),
+					},
+					{
+						TreePath:      "python.py",
+						ContentReader: strings.NewReader(pyContent),
+					},
+					{
+						TreePath:      "vendor/php.php",
+						ContentReader: strings.NewReader(phpContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"Python"},
+			},
+			// case 3: linguist-language replaces the language of matching files
+			{
+				GitAttributesContent: "*.cpp linguist-language=Go",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "cplusplus.cpp",
+						ContentReader: strings.NewReader(cppContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"Go"},
+			},
+			// case 4: gitlab-language is honored and everything from the '?' on is dropped
+			{
+				GitAttributesContent: "*.cpp gitlab-language=Go?parent=json",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "cplusplus.cpp",
+						ContentReader: strings.NewReader(cppContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"Go"},
+			},
+			// case 5: linguist-language wins when both linguist-language and gitlab-language are given
+			{
+				GitAttributesContent: "*.cpp linguist-language=HTML gitlab-language=Go?parent=json",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "cplusplus.cpp",
+						ContentReader: strings.NewReader(cppContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"HTML"},
+			},
+			// case 6: linguist-vendored=false brings a vendor/ file back into the stats
+			{
+				GitAttributesContent: "vendor/** linguist-vendored=false",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "vendor/php.php",
+						ContentReader: strings.NewReader(phpContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"PHP"},
+			},
+			// case 7: "=true" and the bare attribute both exclude files, while "-linguist-vendored" re-includes vendor/
+			{
+				GitAttributesContent: "*.cpp linguist-vendored=true\n*.py linguist-vendored\nvendor/** -linguist-vendored",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "cplusplus.cpp",
+						ContentReader: strings.NewReader(cppContent),
+					},
+					{
+						TreePath:      "python.py",
+						ContentReader: strings.NewReader(pyContent),
+					},
+					{
+						TreePath:      "vendor/php.php",
+						ContentReader: strings.NewReader(phpContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"PHP"},
+			},
+			// case 8: linguist-language is applied to poetry.lock even though its content carries an @generated marker
+			{
+				GitAttributesContent: "poetry.lock linguist-language=Go",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "poetry.lock",
+						ContentReader: strings.NewReader(lockContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"Go"},
+			},
+			// case 9: linguist-generated=false lets poetry.lock be counted (expected as TOML)
+			{
+				GitAttributesContent: "poetry.lock linguist-generated=false",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "poetry.lock",
+						ContentReader: strings.NewReader(lockContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{},
+			},
+			// case 10: "-linguist-detectable" removes matching files from the stats
+			{
+				GitAttributesContent: "*.cpp -linguist-detectable",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "cplusplus.cpp",
+						ContentReader: strings.NewReader(cppContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{},
+			},
+			// case 11: linguist-detectable makes a Markdown file count towards the stats
+			{
+				GitAttributesContent: "*.md linguist-detectable",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "test.md",
+						ContentReader: strings.NewReader(mdContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"Markdown"},
+			},
+			// case 12: only test2.md is marked detectable; Markdown is expected to be listed after C++
+			{
+				GitAttributesContent: "test2.md linguist-detectable",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "cplusplus.cpp",
+						ContentReader: strings.NewReader(cppContent),
+					},
+					{
+						TreePath:      "test.md",
+						ContentReader: strings.NewReader(mdContent),
+					},
+					{
+						TreePath:      "test2.md",
+						ContentReader: strings.NewReader(mdContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"C++", "Markdown"},
+			},
+			// case 13: linguist-documentation=false keeps README.md from being dropped as documentation
+			{
+				GitAttributesContent: "README.md linguist-documentation=false",
+				FilesToAdd: []*files_service.ChangeRepoFile{
+					{
+						TreePath:      "README.md",
+						ContentReader: strings.NewReader(mdContent),
+					},
+				},
+				ExpectedLanguageOrder: []string{"Markdown"},
+			},
+		}
+
+		for i, c := range cases {
+			repo, err := repo_service.CreateRepository(db.DefaultContext, user, user, repo_service.CreateRepoOptions{
+				Name: "linguist-test",
+			})
+			assert.NoError(t, err)
+
+			files := []*files_service.ChangeRepoFile{ // .gitattributes is always created, with empty content when the case sets none
+				{
+					TreePath:      ".gitattributes",
+					ContentReader: strings.NewReader(c.GitAttributesContent),
+				},
+			}
+			files = append(files, c.FilesToAdd...)
+			for _, f := range files {
+				f.Operation = "create"
+			}
+
+			_, err = files_service.ChangeRepoFiles(git.DefaultContext, repo, user, &files_service.ChangeRepoFilesOptions{
+				Files:     files,
+				OldBranch: repo.DefaultBranch,
+				NewBranch: repo.DefaultBranch,
+			})
+			assert.NoError(t, err)
+
+			assert.NoError(t, stats.UpdateRepoIndexer(repo))
+			assert.NoError(t, queue.GetManager().FlushAll(context.Background(), 10*time.Second)) // presumably waits for the queued stats update to finish - confirm
+
+			stats, err := repo_model.GetTopLanguageStats(db.DefaultContext, repo, len(c.FilesToAdd)) // NOTE(review): shadows the imported stats package from here on
+			assert.NoError(t, err)
+
+			languages := make([]string, 0, len(stats))
+			for _, s := range stats {
+				languages = append(languages, s.Language)
+			}
+			assert.Equal(t, c.ExpectedLanguageOrder, languages, "case %d: unexpected language stats", i)
+
+			assert.NoError(t, repo_service.DeleteRepository(db.DefaultContext, user, repo, false)) // every case creates a repo with the same name, so remove it before the next one
+		}
+	})
+}