Skip to content
3 changes: 3 additions & 0 deletions modules/base/tool.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ import (
"github.com/gogits/chardet"
)

// UTF8BOM holds the three bytes (EF BB BF) that form the UTF-8 encoded
// byte-order mark.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}

// EncodeMD5 encodes string to md5 hex value.
func EncodeMD5(str string) string {
m := md5.New()
Expand Down
62 changes: 62 additions & 0 deletions modules/repofiles/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,19 @@
package repofiles

import (
"bytes"
"fmt"
"path"
"strings"

"golang.org/x/net/html/charset"
"golang.org/x/text/transform"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/lfs"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/sdk/gitea"
)
Expand All @@ -37,6 +43,41 @@ type UpdateRepoFileOptions struct {
Committer *IdentityOptions
}

// detectEncodingAndBOM samples up to the first 1024 bytes of the blob behind
// entry and returns the detected character encoding together with a flag
// telling whether the content starts with a byte-order mark. For non-UTF-8
// encodings the sample is decoded first and the decoded text is checked for
// a leading UTF-8 BOM.
//
// Detection is best-effort: if the blob cannot be opened or read, the
// encoding cannot be detected, or the detected encoding is unknown to
// charset.Lookup, it falls back to ("UTF-8", false).
func detectEncodingAndBOM(entry *git.TreeEntry) (string, bool) {
	reader, err := entry.Blob().DataAsync()
	if err != nil {
		// just default to utf-8 and no bom
		return "UTF-8", false
	}
	// Only a small sample is needed; release the blob stream afterwards.
	defer reader.Close()

	buf := make([]byte, 1024)
	n, err := reader.Read(buf)
	if err != nil {
		// just default to utf-8 and no bom
		return "UTF-8", false
	}
	buf = buf[:n]

	encoding, err := base.DetectEncoding(buf)
	if err != nil {
		// just default to utf-8 and no bom
		return "UTF-8", false
	}
	if encoding == "UTF-8" {
		// HasPrefix is length-safe, unlike reslicing buf[0:3] past len(buf).
		return encoding, bytes.HasPrefix(buf, base.UTF8BOM)
	}

	charsetEncoding, _ := charset.Lookup(encoding)
	if charsetEncoding == nil {
		return "UTF-8", false
	}

	// The decode error is deliberately ignored: the sample may end in the
	// middle of a multi-byte sequence, and only the start of the decoded
	// text matters for the BOM check.
	result, _, _ := transform.String(charsetEncoding.NewDecoder(), string(buf))

	// Check the decoded output itself rather than the number of source bytes
	// consumed, so short outputs are never sliced out of range.
	return encoding, bytes.HasPrefix([]byte(result), base.UTF8BOM)
}

// CreateOrUpdateRepoFile adds or updates a file in the given repository
func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepoFileOptions) (*gitea.FileResponse, error) {
// If no branch name is set, assume master
Expand Down Expand Up @@ -118,6 +159,9 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
opts.LastCommitID = commit.ID.String()
}

encoding := "UTF-8"
bom := false

if !opts.IsNewFile {
fromEntry, err := commit.GetTreeEntryByPath(fromTreePath)
if err != nil {
Expand Down Expand Up @@ -151,6 +195,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
// haven't been made. We throw an error if one wasn't provided.
return nil, models.ErrSHAOrCommitIDNotProvided{}
}
encoding, bom = detectEncodingAndBOM(fromEntry)
}

// For the path where this file will be created/updated, we need to make
Expand Down Expand Up @@ -235,6 +280,23 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
}

content := opts.Content
if bom {
content = string(base.UTF8BOM) + content
}
if encoding != "UTF-8" {
charsetEncoding, _ := charset.Lookup(encoding)
if charsetEncoding != nil {
result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content))
if err != nil {
// If we can't encode the content back into the original encoding, we should just stick with UTF-8
log.Error("Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.TreePath, opts.FromTreePath, encoding, err)
result = content
}
content = result
} else {
log.Error("Unknown encoding: %s", encoding)
}
}
var lfsMetaObject *models.LFSMetaObject

if filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" {
Expand Down
18 changes: 18 additions & 0 deletions modules/templates/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,10 @@ func ToUTF8WithErr(content []byte) (string, error) {
if err != nil {
return "", err
} else if charsetLabel == "UTF-8" {
if len(content) > 2 && bytes.Equal(content[0:3], base.UTF8BOM) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about creating a function named RemoveUTF8BOM(content string) string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

log.Debug("Removing BOM from UTF-8 string")
return string(content[3:]), nil
}
return string(content), nil
}

Expand All @@ -282,13 +286,22 @@ func ToUTF8WithErr(content []byte) (string, error) {
result = result + string(content[n:])
}

if len(result) > 2 && bytes.Equal([]byte(result[0:3]), base.UTF8BOM) {
log.Debug("Removing BOM from decoded string")
result = result[3:]
}

return result, err
}

// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
func ToUTF8WithFallback(content []byte) []byte {
charsetLabel, err := base.DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
if len(content) > 2 && bytes.Equal(content[0:3], base.UTF8BOM) {
log.Debug("Removing BOM from UTF-8 string")
return content[3:]
}
return content
}

Expand All @@ -304,6 +317,11 @@ func ToUTF8WithFallback(content []byte) []byte {
return append(result, content[n:]...)
}

if len(result) > 2 && bytes.Equal(result[0:3], base.UTF8BOM) {
log.Debug("Removing BOM from decoded string")
result = result[3:]
}

return result
}

Expand Down