Skip to content

Commit 507d206

Browse files
committed
make overlong Chars !isvalid; throw error on UInt32 of overlong Char
part of #25452
1 parent b72d9eb commit 507d206

File tree

4 files changed

+44
-11
lines changed

4 files changed

+44
-11
lines changed

Diff for: base/char.jl

+25-8
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

3-
struct MalformedCharError <: Exception
3+
struct InvalidCharError <: Exception
44
char::Char
55
end
66
struct CodePointError <: Exception
77
code::Integer
88
end
9-
@noinline malformed_char(c::Char) = throw(MalformedCharError(c))
9+
@noinline invalid_char(c::Char) = throw(InvalidCharError(c))
1010
@noinline code_point_err(u::UInt32) = throw(CodePointError(u))
1111

1212
function ismalformed(c::Char)
@@ -17,20 +17,32 @@ function ismalformed(c::Char)
1717
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
1818
end
1919

20+
@inline is_overlong_enc(u::UInt32) = (u >> 24 == 0xc0) | (u >> 24 == 0xc1) | (u >> 21 == 0x0704) | (u >> 20 == 0x0f08)
21+
2022
function isoverlong(c::Char)
2123
u = reinterpret(UInt32, c)
22-
(u >> 24 == 0xc0) | (u >> 21 == 0x0704) | (u >> 20 == 0x0f08)
24+
is_overlong_enc(u)
2325
end
2426

2527
function UInt32(c::Char)
2628
# TODO: use optimized inline LLVM
2729
u = reinterpret(UInt32, c)
28-
u < 0x80000000 && return reinterpret(UInt32, u >> 24)
30+
u < 0x80000000 && return u >> 24
2931
l1 = leading_ones(u)
3032
t0 = trailing_zeros(u) & 56
3133
(l1 == 1) | (8l1 + t0 > 32) |
32-
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
33-
malformed_char(c)::Union{}
34+
((((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) | is_overlong_enc(u)) &&
35+
invalid_char(c)::Union{}
36+
u &= 0xffffffff >> l1
37+
u >>= t0
38+
(u & 0x0000007f >> 0) | (u & 0x00007f00 >> 2) |
39+
(u & 0x007f0000 >> 4) | (u & 0x7f000000 >> 6)
40+
end
41+
42+
function decode_overlong(c::Char)
43+
u = reinterpret(UInt32, c)
44+
l1 = leading_ones(u)
45+
t0 = trailing_zeros(u) & 56
3446
u &= 0xffffffff >> l1
3547
u >>= t0
3648
(u & 0x0000007f >> 0) | (u & 0x00007f00 >> 2) |
@@ -145,8 +157,13 @@ function show(io::IO, ::MIME"text/plain", c::Char)
145157
show(io, c)
146158
if !ismalformed(c)
147159
print(io, ": ")
148-
isoverlong(c) && print(io, "[overlong] ")
149-
u = UInt32(c)
160+
if isoverlong(c)
161+
print(io, "[overlong] ")
162+
u = decode_overlong(c)
163+
c = Char(u)
164+
else
165+
u = UInt32(c)
166+
end
150167
h = hex(u, u ≤ 0xffff ? 4 : 6)
151168
print(io, (isascii(c) ? "ASCII/" : ""), "Unicode U+", h)
152169
else

Diff for: base/strings/unicode.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
module Unicode
55

66
import Base: show, ==, hash, string, Symbol, isless, length, eltype, start,
7-
next, done, convert, isvalid, MalformedCharError, ismalformed
7+
next, done, convert, isvalid, ismalformed, isoverlong
88

99
# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
1010

@@ -43,7 +43,7 @@ true
4343
"""
4444
isvalid(T,value)
4545

46-
isvalid(c::Char) = !ismalformed(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
46+
isvalid(c::Char) = !ismalformed(c) & !isoverlong(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
4747
isvalid(::Type{Char}, c::Unsigned) = ((c ≤ 0xd7ff ) | ( 0xe000 ≤ c) & (c ≤ 0x10ffff ))
4848
isvalid(::Type{Char}, c::Integer) = isvalid(Char, Unsigned(c))
4949
isvalid(::Type{Char}, c::Char) = isvalid(c)

Diff for: test/char.jl

+5-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,11 @@ end
221221
end
222222

223223
function test_overlong(c::Char, n::Integer, rep::String)
224-
@test Int(c) == n
224+
if isvalid(c)
225+
@test Int(c) == n
226+
else
227+
@test_throws Base.InvalidCharError UInt32(c)
228+
end
225229
@test sprint(show, c) == rep
226230
end
227231

Diff for: test/strings/basic.jl

+12
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ end
362362
("\udc00\ud800", false),
363363
)
364364
@test isvalid(String, val) == pass == isvalid(String(val))
365+
@test isvalid(val[1]) == pass
365366
end
366367

367368
# Issue #11203
@@ -435,6 +436,17 @@ end
435436
end
436437
# Check seven-byte sequences, should be invalid
437438
@test isvalid(String, UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
439+
440+
# invalid Chars
441+
@test isvalid('a')
442+
@test isvalid('柒')
443+
@test !isvalid("\xff"[1])
444+
@test !isvalid("\xc0\x80"[1])
445+
@test !isvalid("\xf0\x80\x80\x80"[1])
446+
@test !isvalid('\ud800')
447+
@test isvalid('\ud7ff')
448+
@test !isvalid('\udfff')
449+
@test isvalid('\ue000')
438450
end
439451

440452
@testset "NULL pointers are handled consistently by String" begin

0 commit comments

Comments
 (0)