Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add warning for BIDI characters in page renders and in diffs #17562

Merged
merged 47 commits into from
Jan 7, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
dca05ee
Add warning for BIDI characters in page renders and in diffs
zeripath Nov 5, 2021
449cb26
as per review
zeripath Nov 5, 2021
40b8628
Adjust to only put the warning on BIDI lines without RTL chars
zeripath Nov 6, 2021
3a63d9d
Another attempt.
zeripath Nov 7, 2021
cd0bb29
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 9, 2021
5f481cf
placate lint
zeripath Nov 9, 2021
c89c678
another placation
zeripath Nov 9, 2021
7e9871c
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 10, 2021
f563ee9
as per review
zeripath Nov 14, 2021
70d446b
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 14, 2021
65dcc39
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 16, 2021
62345ba
fix broken merge
zeripath Nov 16, 2021
831f189
as per silverwind
zeripath Nov 16, 2021
5a9759c
as per silverwind
zeripath Nov 16, 2021
006a5cd
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 16, 2021
63a5e0f
fix class
silverwind Nov 16, 2021
8a01b22
make message header colors work on both themes
silverwind Nov 16, 2021
6449cad
minor styling tweaks
silverwind Nov 16, 2021
ab03673
fix border-radius on unescape button
silverwind Nov 16, 2021
06b4146
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 18, 2021
b93d0bf
drop buttons as per silverwind
zeripath Nov 18, 2021
cf04f2e
as per fnetx
zeripath Nov 18, 2021
aa4fc5a
hide the unescape button in the wiki
zeripath Nov 18, 2021
62f557d
add warning triangles to view and blame
zeripath Nov 18, 2021
b6ba958
Add warning triangles to diff
zeripath Nov 18, 2021
ea7a04a
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 20, 2021
36dd4bf
Merge branch 'main' into fix-17514-add-warning-bidi-characters
zeripath Nov 21, 2021
19aed47
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Nov 29, 2021
6a2e274
ensure buttons work on loaded diffs
zeripath Nov 29, 2021
0d6e8f6
move escape functions into their own files
zeripath Nov 29, 2021
cb7d19d
extract out functions
zeripath Nov 29, 2021
c55394d
Apply suggestions from code review
zeripath Nov 29, 2021
ae19a60
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Dec 1, 2021
c11bd34
Update options/locale/locale_en-US.ini
zeripath Dec 8, 2021
58a4fcc
move warning triangle to another column
zeripath Dec 8, 2021
3f6057e
Merge remote-tracking branch 'origin/main' into fix-17514-add-warning…
zeripath Dec 8, 2021
67d00b5
Merge branch 'main' into fix-17514-add-warning-bidi-characters
6543 Jan 4, 2022
51a1bf1
Merge branch 'main' into fix-17514-add-warning-bidi-characters
6543 Jan 5, 2022
d8ab670
Merge branch 'master' into fix-17514-add-warning-bidi-characters
6543 Jan 6, 2022
0fc5af7
linter ignore bool "suspicious assignment to a by-value method receiv…
6543 Jan 6, 2022
1dc8a21
fix lint
wxiaoguang Jan 6, 2022
6f99bfd
refactoring
wxiaoguang Jan 6, 2022
ab6db78
refactor
wxiaoguang Jan 6, 2022
4e1b449
Apply suggestions from code review
zeripath Jan 6, 2022
aac0e1d
Merge pull request #10 from wxiaoguang/fix-17514-add-warning-bidi-cha…
zeripath Jan 6, 2022
f66923f
Merge branch 'main' into fix-17514-add-warning-bidi-characters
6543 Jan 6, 2022
a28bbbc
Merge branch 'main' into fix-17514-add-warning-bidi-characters
wxiaoguang Jan 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 205 additions & 0 deletions modules/charset/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"fmt"
"io"
"strings"
"unicode"
"unicode/utf8"

"code.gitea.io/gitea/modules/log"
Expand All @@ -18,11 +19,215 @@ import (
"github.com/gogs/chardet"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/bidi"
)

// UTF8BOM holds the three bytes (EF BB BF) of the UTF-8 byte-order mark.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}

// EscapeStatus records what the unicode escaper found while scanning a text.
type EscapeStatus struct {
	// Escaped is set when at least one rune or byte sequence was rewritten.
	Escaped bool
	// HasError is set when reading the input or writing the output failed.
	HasError bool
	// HasBadRunes is set when the input held bytes that do not decode as UTF-8.
	HasBadRunes bool
	// HasControls is set when a rune of the Unicode "C" (other) category was escaped.
	HasControls bool
	// HasSpaces is set when a space rune other than ' ', '\t', '\r' or '\n' was escaped.
	HasSpaces bool
	// HasMarks is set when a rune of the Unicode "M" (mark) category was escaped.
	HasMarks bool
	// HasBIDI is set when a bidirectional control rune was escaped.
	HasBIDI bool
	// BadBIDI is set when some line has a bidirectional control rune together
	// with left-to-right script but no right-to-left script.
	BadBIDI bool
	// HasRTLScript is set when a rune of bidi class R or AL was seen.
	HasRTLScript bool
	// HasLTRScript is set when a rune of bidi class L was seen.
	HasLTRScript bool
}

// Or returns the field-wise logical OR of status and other: a flag is set in
// the result when it is set in either operand. Neither operand is modified.
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
	return EscapeStatus{
		Escaped:      status.Escaped || other.Escaped,
		HasError:     status.HasError || other.HasError,
		HasBadRunes:  status.HasBadRunes || other.HasBadRunes,
		HasControls:  status.HasControls || other.HasControls,
		HasSpaces:    status.HasSpaces || other.HasSpaces,
		HasMarks:     status.HasMarks || other.HasMarks,
		HasBIDI:      status.HasBIDI || other.HasBIDI,
		BadBIDI:      status.BadBIDI || other.BadBIDI,
		HasRTLScript: status.HasRTLScript || other.HasRTLScript,
		HasLTRScript: status.HasLTRScript || other.HasLTRScript,
	}
}

// EscapeControlString runs EscapeControlReader over text and returns the
// findings as an EscapeStatus together with the escaped string.
func EscapeControlString(text string) (EscapeStatus, string) {
	var out strings.Builder
	// The error is dropped on purpose: EscapeControlReader also records any
	// failure in the HasError field of the returned status.
	status, _ := EscapeControlReader(strings.NewReader(text), &out)
	return status, out.String()
}

// EscapeControlBytes runs EscapeControlReader over text and returns the
// findings as an EscapeStatus together with the escaped bytes.
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
	var out bytes.Buffer
	// The error is dropped on purpose: EscapeControlReader also records any
	// failure in the HasError field of the returned status.
	status, _ := EscapeControlReader(bytes.NewReader(text), &out)
	return status, out.Bytes()
}

// EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
buf := make([]byte, 4096)
readStart := 0
var n int
var writePos int

lineHasBIDI := false
lineHasRTLScript := false
lineHasLTRScript := false

readingloop:
for err == nil {
n, err = text.Read(buf[readStart:])
wxiaoguang marked this conversation as resolved.
Show resolved Hide resolved
bs := buf[:n+readStart]
i := 0

for i < len(bs) {
r, size := utf8.DecodeRune(bs[i:])
// Now handle the codepoints
switch {
case r == utf8.RuneError:
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
writePos = i
}
// runes can be at most 4 bytes - so...
if len(bs)-i <= 3 {
// if not request more data
copy(buf, bs[i:])
readStart = n - i
writePos = 0
continue readingloop
}
// this is a real broken rune
escaped.HasBadRunes = true
escaped.Escaped = true
if _, err = fmt.Fprintf(output, `<span class="broken-code-point">&lt;%X&gt;</span>`, bs[i:i+size]); err != nil {
escaped.HasError = true
return
}
writePos += size
case r == '\n':
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
escaped.BadBIDI = true
}
lineHasBIDI = false
lineHasRTLScript = false
lineHasLTRScript = false

case r == '\r' || r == '\t' || r == ' ':
// These are acceptable control characters and space characters
case unicode.IsSpace(r):
escaped.HasSpaces = true
escaped.Escaped = true
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
if _, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r); err != nil {
escaped.HasError = true
return
}
writePos = i + size
case unicode.Is(unicode.Bidi_Control, r):
escaped.Escaped = true
escaped.HasBIDI = true
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
lineHasBIDI = true
if _, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r); err != nil {
escaped.HasError = true
return
}
writePos = i + size
case unicode.Is(unicode.C, r):
escaped.Escaped = true
escaped.HasControls = true
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
if _, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r); err != nil {
escaped.HasError = true
return
}
writePos = i + size
case unicode.Is(unicode.M, r):
escaped.Escaped = true
escaped.HasMarks = true
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
if _, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r); err != nil {
escaped.HasError = true
return
}
writePos = i + size
default:
p, _ := bidi.Lookup(bs[i : i+size])
c := p.Class()
if c == bidi.R || c == bidi.AL {
lineHasRTLScript = true
escaped.HasRTLScript = true
} else if c == bidi.L {
lineHasLTRScript = true
escaped.HasLTRScript = true
}
}
i += size
}
if n > 0 {
// we read something...
// write everything unwritten
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}

// reset the starting positions for the next read
readStart = 0
writePos = 0
}
}
if readStart > 0 {
// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round
escaped.Escaped = true
escaped.HasBadRunes = true
if _, err = fmt.Fprintf(output, `<span class="broken-code-point">&lt;%X&gt;</span>`, buf[:readStart]); err != nil {
escaped.HasError = true
return
}
}
if err == io.EOF {
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
escaped.BadBIDI = true
}
err = nil
return
}
escaped.HasError = true
return
}

// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible
func ToUTF8WithFallbackReader(rd io.Reader) io.Reader {
var buf = make([]byte, 2048)
Expand Down
Loading