Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 21ed4fd

Browse files
zeripathgwymorsilverwindwxiaoguang
authoredJan 7, 2022
Add warning for BIDI characters in page renders and in diffs (#17562)
Fix #17514. Given the comments, I've adjusted this somewhat. The number of characters detected has been increased and now includes things like the use of U+0300 to make à instead of à, as well as non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
1 parent ee60f27 commit 21ed4fd

26 files changed

+809
-87
lines changed
 

‎modules/charset/escape.go

+230
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
// Copyright 2021 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package charset
6+
7+
import (
8+
"bytes"
9+
"fmt"
10+
"io"
11+
"strings"
12+
"unicode"
13+
"unicode/utf8"
14+
15+
"golang.org/x/text/unicode/bidi"
16+
)
17+
18+
// EscapeStatus reports what the unicode escaper found while processing a
// piece of text. Every field is an independent flag; the zero value means
// that nothing noteworthy was seen.
type EscapeStatus struct {
	// Escaped is set when at least one code point or byte sequence was
	// wrapped in an escape span.
	Escaped bool
	// HasError is set when reading the input or writing the output failed.
	HasError bool
	// HasBadRunes is set when the input contained invalid UTF-8.
	HasBadRunes bool
	// HasControls is set when a code point of the Unicode category C was seen.
	HasControls bool
	// HasSpaces is set when a whitespace code point other than '\n', '\r',
	// '\t' or ' ' was seen.
	HasSpaces bool
	// HasMarks is set when a code point of the Unicode category M was seen.
	HasMarks bool
	// HasBIDI is set when a bidirectional control code point was seen.
	HasBIDI bool
	// BadBIDI is set when some line contained a bidirectional control
	// together with left-to-right script but without any right-to-left script.
	BadBIDI bool
	// HasRTLScript is set when a strongly right-to-left code point was seen.
	HasRTLScript bool
	// HasLTRScript is set when a strongly left-to-right code point was seen.
	HasLTRScript bool
}

// Or merges two EscapeStatus values: each flag of the result is the logical
// OR of the corresponding flags of status and other. Neither operand is
// modified.
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
	return EscapeStatus{
		Escaped:      status.Escaped || other.Escaped,
		HasError:     status.HasError || other.HasError,
		HasBadRunes:  status.HasBadRunes || other.HasBadRunes,
		HasControls:  status.HasControls || other.HasControls,
		HasSpaces:    status.HasSpaces || other.HasSpaces,
		HasMarks:     status.HasMarks || other.HasMarks,
		HasBIDI:      status.HasBIDI || other.HasBIDI,
		BadBIDI:      status.BadBIDI || other.BadBIDI,
		HasRTLScript: status.HasRTLScript || other.HasRTLScript,
		HasLTRScript: status.HasLTRScript || other.HasLTRScript,
	}
}
47+
48+
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
49+
func EscapeControlString(text string) (EscapeStatus, string) {
50+
sb := &strings.Builder{}
51+
escaped, _ := EscapeControlReader(strings.NewReader(text), sb)
52+
return escaped, sb.String()
53+
}
54+
55+
// EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte
56+
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
57+
buf := &bytes.Buffer{}
58+
escaped, _ := EscapeControlReader(bytes.NewReader(text), buf)
59+
return escaped, buf.Bytes()
60+
}
61+
62+
// EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error
63+
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
64+
buf := make([]byte, 4096)
65+
readStart := 0
66+
var n int
67+
var writePos int
68+
69+
lineHasBIDI := false
70+
lineHasRTLScript := false
71+
lineHasLTRScript := false
72+
73+
readingloop:
74+
for err == nil {
75+
n, err = text.Read(buf[readStart:])
76+
bs := buf[:n+readStart]
77+
i := 0
78+
79+
for i < len(bs) {
80+
r, size := utf8.DecodeRune(bs[i:])
81+
// Now handle the codepoints
82+
switch {
83+
case r == utf8.RuneError:
84+
if writePos < i {
85+
if _, err = output.Write(bs[writePos:i]); err != nil {
86+
escaped.HasError = true
87+
return
88+
}
89+
writePos = i
90+
}
91+
// runes can be at most 4 bytes - so...
92+
if len(bs)-i <= 3 {
93+
// if not request more data
94+
copy(buf, bs[i:])
95+
readStart = n - i
96+
writePos = 0
97+
continue readingloop
98+
}
99+
// this is a real broken rune
100+
escaped.HasBadRunes = true
101+
escaped.Escaped = true
102+
if err = writeBroken(output, bs[i:i+size]); err != nil {
103+
escaped.HasError = true
104+
return
105+
}
106+
writePos += size
107+
case r == '\n':
108+
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
109+
escaped.BadBIDI = true
110+
}
111+
lineHasBIDI = false
112+
lineHasRTLScript = false
113+
lineHasLTRScript = false
114+
115+
case r == '\r' || r == '\t' || r == ' ':
116+
// These are acceptable control characters and space characters
117+
case unicode.IsSpace(r):
118+
escaped.HasSpaces = true
119+
escaped.Escaped = true
120+
if writePos < i {
121+
if _, err = output.Write(bs[writePos:i]); err != nil {
122+
escaped.HasError = true
123+
return
124+
}
125+
}
126+
if err = writeEscaped(output, r); err != nil {
127+
escaped.HasError = true
128+
return
129+
}
130+
writePos = i + size
131+
case unicode.Is(unicode.Bidi_Control, r):
132+
escaped.Escaped = true
133+
escaped.HasBIDI = true
134+
if writePos < i {
135+
if _, err = output.Write(bs[writePos:i]); err != nil {
136+
escaped.HasError = true
137+
return
138+
}
139+
}
140+
lineHasBIDI = true
141+
if err = writeEscaped(output, r); err != nil {
142+
escaped.HasError = true
143+
return
144+
}
145+
writePos = i + size
146+
case unicode.Is(unicode.C, r):
147+
escaped.Escaped = true
148+
escaped.HasControls = true
149+
if writePos < i {
150+
if _, err = output.Write(bs[writePos:i]); err != nil {
151+
escaped.HasError = true
152+
return
153+
}
154+
}
155+
if err = writeEscaped(output, r); err != nil {
156+
escaped.HasError = true
157+
return
158+
}
159+
writePos = i + size
160+
case unicode.Is(unicode.M, r):
161+
escaped.Escaped = true
162+
escaped.HasMarks = true
163+
if writePos < i {
164+
if _, err = output.Write(bs[writePos:i]); err != nil {
165+
escaped.HasError = true
166+
return
167+
}
168+
}
169+
if err = writeEscaped(output, r); err != nil {
170+
escaped.HasError = true
171+
return
172+
}
173+
writePos = i + size
174+
default:
175+
p, _ := bidi.Lookup(bs[i : i+size])
176+
c := p.Class()
177+
if c == bidi.R || c == bidi.AL {
178+
lineHasRTLScript = true
179+
escaped.HasRTLScript = true
180+
} else if c == bidi.L {
181+
lineHasLTRScript = true
182+
escaped.HasLTRScript = true
183+
}
184+
}
185+
i += size
186+
}
187+
if n > 0 {
188+
// we read something...
189+
// write everything unwritten
190+
if writePos < i {
191+
if _, err = output.Write(bs[writePos:i]); err != nil {
192+
escaped.HasError = true
193+
return
194+
}
195+
}
196+
197+
// reset the starting positions for the next read
198+
readStart = 0
199+
writePos = 0
200+
}
201+
}
202+
if readStart > 0 {
203+
// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round
204+
escaped.Escaped = true
205+
escaped.HasBadRunes = true
206+
if err = writeBroken(output, buf[:readStart]); err != nil {
207+
escaped.HasError = true
208+
return
209+
}
210+
}
211+
if err == io.EOF {
212+
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
213+
escaped.BadBIDI = true
214+
}
215+
err = nil
216+
return
217+
}
218+
escaped.HasError = true
219+
return
220+
}
221+
222+
// writeBroken writes bs to output as upper-case hexadecimal between
// HTML-escaped angle brackets, wrapped in a span with the class
// "broken-code-point". It returns any error reported by the writer.
func writeBroken(output io.Writer, bs []byte) (err error) {
	const brokenFormat = `<span class="broken-code-point">&lt;%X&gt;</span>`
	_, err = fmt.Fprintf(output, brokenFormat, bs)
	return err
}
226+
227+
// writeEscaped writes r to output wrapped in a span with the class
// "escaped-code-point". The span's data-escaped attribute holds the code
// point as "[U+XXXX]" (at least four upper-case hex digits) and an inner
// "char" span holds the rune itself. It returns any error reported by the
// writer.
func writeEscaped(output io.Writer, r rune) (err error) {
	const escapedFormat = `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`
	_, err = fmt.Fprintf(output, escapedFormat, r, r)
	return err
}

0 commit comments

Comments
 (0)
Please sign in to comment.