-
Notifications
You must be signed in to change notification settings - Fork 141
/
encoding.go
113 lines (95 loc) · 2.29 KB
/
encoding.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
package the_platinum_searcher
const (
UNKNOWN = iota
ERROR
BINARY
ASCII
UTF8
EUCJP
SHIFTJIS
)
func detectEncoding(bs []byte) int {
var (
suspiciousBytes = 0
likelyUtf8 = 0
likelyEucjp = 0
likelyShiftjis = 0
)
length := len(bs)
if length == 0 {
return ASCII
}
if length >= 3 && bs[0] == 0xEF && bs[1] == 0xBB && bs[2] == 0xBF {
// UTF-8 BOM. This isn't binary.
return UTF8
}
if length >= 5 && bs[0] == 0x25 && bs[1] == 0x50 && bs[2] == 0x44 && bs[3] == 0x46 && bs[4] == 0x2D {
/* %PDF-. This is binary. */
return BINARY
}
for i := 0; i < length; i++ {
if bs[i] == 0x00 {
/* NULL char. It's binary */
return BINARY
} else if (bs[i] < 7 || bs[i] > 14) && (bs[i] < 32 || bs[i] > 127) {
/* UTF-8 detection */
if bs[i] > 193 && bs[i] < 224 && i+1 < length {
i++
if bs[i] > 127 && bs[i] < 192 {
likelyUtf8++
continue
}
} else if bs[i] > 223 && bs[i] < 240 && i+2 < length {
i++
if bs[i] > 127 && bs[i] < 192 && bs[i+1] > 127 && bs[i+1] < 192 {
i++
likelyUtf8++
continue
}
}
/* EUC-JP detection */
if bs[i] == 142 && i+1 < length {
i++
if bs[i] > 160 && bs[i] < 224 {
likelyEucjp++
continue
}
} else if bs[i] > 160 && bs[i] < 255 && i+1 < length {
i++
if bs[i] > 160 && bs[i] < 255 {
likelyEucjp++
continue
}
}
/* Shift-JIS detection */
if bs[i] > 160 && bs[i] < 224 {
likelyShiftjis++
continue
} else if ((bs[i] > 128 && bs[i] < 160) || (bs[i] > 223 && bs[i] < 240)) && i+1 < length {
i++
if (bs[i] > 63 && bs[i] < 127) || (bs[i] > 127 && bs[i] < 253) {
likelyShiftjis++
continue
}
}
suspiciousBytes++
if i >= 32 && (suspiciousBytes*100)/length > 10 {
return BINARY
}
}
}
if (suspiciousBytes*100)/length > 10 {
return BINARY
}
// fmt.Printf("Detected points[utf8/eucjp/shiftjis] is %d/%d/%d.\n", likelyUtf8, likelyEucjp, likelyShiftjis)
if likelyUtf8 == 0 && likelyEucjp == 0 && likelyShiftjis == 0 {
return ASCII
} else if likelyUtf8 >= likelyEucjp && likelyUtf8 >= likelyShiftjis {
return UTF8
} else if likelyEucjp >= likelyUtf8 && likelyEucjp >= likelyShiftjis {
return EUCJP
} else if likelyShiftjis >= likelyUtf8 && likelyShiftjis >= likelyEucjp {
return SHIFTJIS
}
return ASCII
}