Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't treat BOM escape sequence as hidden character. #18909

Merged
merged 11 commits into from
Feb 26, 2022
5 changes: 5 additions & 0 deletions modules/charset/escape.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
buf := make([]byte, 4096)
readStart := 0
runeCount := 0
var n int
var writePos int

Expand All @@ -79,6 +80,8 @@ readingloop:

for i < len(bs) {
r, size := utf8.DecodeRune(bs[i:])
runeCount++

// Now handle the codepoints
switch {
case r == utf8.RuneError:
Expand Down Expand Up @@ -113,6 +116,8 @@ readingloop:
lineHasRTLScript = false
lineHasLTRScript = false

case runeCount == 1 && r == 0xFEFF: // UTF BOM
// a BOM that is the very first rune of the input is expected there, so it needs no handling
case r == '\r' || r == '\t' || r == ' ':
// These are acceptable control characters and space characters
case unicode.IsSpace(r):
Expand Down
24 changes: 20 additions & 4 deletions modules/charset/escape_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`,
"\n" + `if access_level != "user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>" {` + "\n",
status: EscapeStatus{Escaped: true, HasBIDI: true, BadBIDI: true, HasLTRScript: true, HasRTLScript: true},
},
{
// UTF-8, UTF-16 and UTF-32 all use the same code point (U+FEFF) for the BOM.
// Gitea can read UTF-16/32 content and convert it to UTF-8 internally before rendering it, so only the UTF-8 form needs to be handled here.
name: "UTF BOM",
text: "\xef\xbb\xbftest",
result: "\xef\xbb\xbftest",
status: EscapeStatus{HasLTRScript: true},
},
}

func TestEscapeControlString(t *testing.T) {
Expand Down Expand Up @@ -163,19 +171,27 @@ func TestEscapeControlReader(t *testing.T) {
// let's add some control characters to the tests
tests := make([]escapeControlTest, 0, len(escapeControlTests)*3)
copy(tests, escapeControlTests)

// if the string starts with a UTF-8 BOM, keep the BOM first and insert the prefix right after it
addPrefix := func(prefix, s string) string {
if strings.HasPrefix(s, "\xef\xbb\xbf") {
return s[:3] + prefix + s[3:]
}
return prefix + s
}
for _, test := range escapeControlTests {
test.name += " (+Control)"
test.text = "\u001E" + test.text
test.result = `<span class="escaped-code-point" data-escaped="[U+001E]"><span class="char">` + "\u001e" + `</span></span>` + test.result
test.text = addPrefix("\u001E", test.text)
test.result = addPrefix(`<span class="escaped-code-point" data-escaped="[U+001E]"><span class="char">`+"\u001e"+`</span></span>`, test.result)
test.status.Escaped = true
test.status.HasControls = true
tests = append(tests, test)
}

for _, test := range escapeControlTests {
test.name += " (+Mark)"
test.text = "\u0300" + test.text
test.result = `<span class="escaped-code-point" data-escaped="[U+0300]"><span class="char">` + "\u0300" + `</span></span>` + test.result
test.text = addPrefix("\u0300", test.text)
test.result = addPrefix(`<span class="escaped-code-point" data-escaped="[U+0300]"><span class="char">`+"\u0300"+`</span></span>`, test.result)
test.status.Escaped = true
test.status.HasMarks = true
tests = append(tests, test)
Expand Down