-
Notifications
You must be signed in to change notification settings - Fork 55
/
utf8.go
57 lines (52 loc) · 2.16 KB
/
utf8.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package transformer
import (
"unicode/utf8"
"github.com/pkg/errors"
)
// isHTMLValidInternal reports whether r may appear in an HTML document without
// triggering a parse error during input-stream preprocessing, per
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
//
// U+0000 NULL is rejected as well: it is a parse error everywhere except
// CDATA, and for defense in depth we don't assume that every parser handles
// it properly.
func isHTMLValidInternal(r rune) bool {
	// Each case names a range of *valid* code points, most popular first, so
	// that typical text is accepted after very few comparisons. Anything that
	// falls through every case is invalid.
	switch {
	case 0x20 <= r && r <= 0x7E:
		// Printable ASCII. Everything below it (U+0000 NULL and the C0
		// controls, https://infra.spec.whatwg.org/#control) is invalid,
		// apart from the whitespace handled in the next case.
		return true
	case r == '\t', r == '\n', r == '\f', r == '\r':
		// The only permitted C0 controls: 0x9, 0xA, 0xC and 0xD.
		return true
	case 0xA0 <= r && r <= 0xD7FF:
		// Skips U+007F through U+009F, per
		// https://infra.spec.whatwg.org/#control.
		return true
	case 0xE000 <= r && r <= 0xFDCF:
		// Skips the surrogates U+D800 through U+DFFF, per
		// https://infra.spec.whatwg.org/#surrogate.
		return true
	case 0xFDF0 <= r && r <= 0xFFFD:
		// Skips the noncharacters U+FDD0 through U+FDEF, per
		// https://infra.spec.whatwg.org/#noncharacter.
		return true
	case 0x10000 <= r && r <= 0x10FFFD:
		// Supplementary planes: the last two code points of every plane
		// (U+??FFFE and U+??FFFF) are noncharacters, per
		// https://infra.spec.whatwg.org/#noncharacter.
		return r&0xFFFE != 0xFFFE
	}
	// Invalid, including anything outside the Unicode range: there are no
	// code points greater than U+10FFFF.
	return false
}
// isHTMLValid is the code point predicate that validateUTF8ForHTML calls. It
// is held in a variable, defaulting to isHTMLValidInternal, so that tests can
// override it.
var isHTMLValid = isHTMLValidInternal
// validateUTF8ForHTML scans html one code point at a time and returns an error
// for the first byte sequence that is not well-formed UTF-8, or for the first
// code point that isHTMLValid rejects as causing parse errors in HTML. It
// returns nil if the whole string passes.
//
// The AMPHTML validator imposes this requirement, so it doesn't make sense to
// create a SXG for a document that fails it.
func validateUTF8ForHTML(html string) error {
	for offset := 0; offset < len(html); {
		r, size := utf8.DecodeRuneInString(html[offset:])
		switch {
		case r == utf8.RuneError && size <= 1:
			// utf8.RuneError == '\uFFFD', which is also a legitimate code
			// point. A decode failure is told apart from a literal U+FFFD
			// by its width: failures report at most one byte.
			return errors.Errorf("invalid UTF-8 at byte position %d", offset)
		case !isHTMLValid(r):
			return errors.Errorf("character U+%04x at position %d is not allowed in AMPHTML", r, offset)
		}
		offset += size
	}
	return nil
}