Skip to content

Commit

Permalink
Fix JuliaLang#10959 UTF-32 conversion errors
Browse files Browse the repository at this point in the history
Added new convert methods that use the check_string function to validate input
Added tests for many sorts of valid/invalid data
Depends on PR JuliaLang#11551 and JuliaLang#11575
  • Loading branch information
ScottPJones committed Jun 7, 2015
1 parent ce5e7c8 commit 1985663
Show file tree
Hide file tree
Showing 2 changed files with 275 additions and 16 deletions.
258 changes: 250 additions & 8 deletions base/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,258 @@ utf32(x) = convert(UTF32String, x)
# A single Char becomes a two-element buffer: the character followed by a NUL terminator.
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
# Input is already a UTF32String: the argument itself is returned (no copy is made).
convert(::Type{UTF32String}, s::UTF32String) = s

function convert(::Type{UTF32String}, s::AbstractString)
a = Array(Char, length(s) + 1)
i = 0
for c in s
a[i += 1] = c
function convert(::Type{UTF32String}, str::AbstractString)
    "
    Converts an AbstractString to a UTF32String
    ### Input Arguments:
    * ::Type{UTF32String}
    * str::AbstractString
    ### Returns:
    * ::UTF32String
    ### Throws:
    * UnicodeError
    "
    # check_string validates the input; its first result is used as the character count
    len, flags = check_string(str)
    # one extra element is reserved for the trailing NUL
    buf = Vector{Char}(len+1)
    out = 0
    @inbounds for ch in str
        buf[out += 1] = ch
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    UTF32String(buf)
end

function convert(::Type{UTF8String}, dat::Vector{UInt32})
    "
    Builds a UTF8String from a vector of UInt32 holding UTF-32 encoded data
    ### Input Arguments:
    * ::Type{UTF8String}
    * dat::Vector{UInt32}
    ### Returns:
    * ::UTF8String
    ### Throws:
    * UnicodeError
    "
    nbytes = sizeof(dat)
    # an empty vector maps to the shared empty string
    if nbytes == 0
        return empty_utf8
    end
    # validate all nbytes>>>2 elements and collect the per-width character counts
    len, flags, num4byte, num3byte, num2byte = check_string(dat, nbytes>>>2)
    if flags == 0
        # no flags reported: copy each element straight into a byte vector
        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
    end
    # output size: one byte per character plus 1, 2 or 3 extra bytes for the wider ones
    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end

function convert(::Type{UTF8String}, str::UTF32String)
    "
    Builds a UTF8String from the contents of a UTF32String
    ### Input Arguments:
    * ::Type{UTF8String}
    * str::UTF32String
    ### Returns:
    * ::UTF8String
    ### Throws:
    * UnicodeError
    "
    dat = reinterpret(UInt32, str.data)
    nwords = sizeof(dat) >>> 2
    # one word or fewer means there is nothing besides the final element
    if nwords <= 1
        return empty_utf8
    end
    # validate everything except the final element and collect the per-width counts
    len, flags, num4byte, num3byte, num2byte = check_string(dat, nwords-1)
    if flags == 0
        # no flags reported: copy each element straight into a byte vector
        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
    end
    # output size: one byte per character plus 1, 2 or 3 extra bytes for the wider ones
    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end

# Decodes the bytes of a UTF8String into a NUL-terminated vector of Char.
# check_string validates the bytes up front, so the loop below reads continuation
# bytes without re-checking them.
function convert(::Type{UTF32String}, str::UTF8String)
"
Converts a UTF8String to a UTF32String
### Input Arguments:
* ::Type{UTF32String}
* str::UTF8String
### Returns:
* ::UTF32String
### Throws:
* UnicodeError
"
dat = str.data
# handle zero length string quickly
sizeof(dat) == 0 && return empty_utf32
# Validate UTF-8 encoding, and get number of words to create
len, flags = check_string(dat)
# Optimize case where no characters > 0x7f
# (totlen reserves one extra element for the NUL terminator)
totlen = len+1
flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat)
# has multi-byte UTF-8 sequences
buf = Vector{Char}(totlen)
@inbounds buf[totlen] = 0 # NULL termination
# ch and surr are declared UInt32, so bytes read from dat are widened on assignment
local ch::UInt32, surr::UInt32
# out counts characters written to buf; pos is the index of the last byte consumed from dat
out = 0
pos = 0
@inbounds while out < len
ch = dat[pos += 1]
# Handle ASCII characters
if ch <= 0x7f
buf[out += 1] = ch
# Handle range 0x80-0x7ff
elseif ch < 0xe0
# low 5 bits of the lead byte, followed by the low 6 bits of the continuation byte
buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
# Handle range 0x800-0xffff
elseif ch < 0xf0
# move pos to the final byte of this 3-byte sequence before calling the helper
# (get_utf8_3byte is defined elsewhere; presumably it reads the bytes ending at pos)
pos += 2
ch = get_utf8_3byte(dat, pos, ch)
# Handle surrogate pairs (should have been encoded in 4 bytes)
if is_surrogate_lead(ch)
# Build up 32-bit character from ch and trailing surrogate in next 3 bytes
pos += 3
# low 4 bits of the lead byte, then 6 bits from each of the two continuation bytes
surr = ((UInt32(dat[pos-2] & 0xf) << 12)
| (UInt32(dat[pos-1] & 0x3f) << 6)
| (dat[pos] & 0x3f))
ch = get_supplementary(ch, surr)
end
buf[out += 1] = ch
# Handle range 0x10000-0x10ffff
else
# move pos to the final byte of this 4-byte sequence before calling the helper
pos += 3
buf[out += 1] = get_utf8_4byte(dat, pos, ch)
end
end
UTF32String(buf)
end

function convert(::Type{UTF32String}, str::UTF16String)
    "
    Converts a UTF16String to UTF32String
    ### Input Arguments:
    * ::Type{UTF32String}
    * str::UTF16String
    ### Returns:
    * ::UTF32String
    ### Throws:
    * UnicodeError
    "
    dat = str.data
    len = sizeof(dat)
    # handle zero length string quickly (account for trailing \0)
    len <= 2 && return empty_utf32
    # get number of words to create; every 16-bit word of dat is passed in, so the
    # trailing \0 is part of the count and is carried over by the copy / loop below
    len, flags, num4byte = check_string(dat, len>>>1)
    # No surrogate pairs, do optimized copy
    (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
    # ch is declared UInt32 so the 16-bit words read from dat are widened on assignment
    local ch::UInt32
    buf = Vector{Char}(len)
    out = 0
    pos = 0
    @inbounds while out < len
        ch = dat[pos += 1]
        # check for surrogate pair: combine the lead with the word that follows it
        if is_surrogate_lead(ch)
            ch = get_supplementary(ch, dat[pos += 1])
        end
        buf[out += 1] = ch
    end
    # buf is the only output buffer in this method; return it directly
    UTF32String(buf)
end

function convert(::Type{UTF16String}, dat::Vector{UInt32})
    "
    Converts a UTF-32 encoded vector of UInt32 to a UTF16String
    ### Input Arguments:
    * ::Type{UTF16String}
    * dat::Vector{UInt32}
    ### Returns:
    * ::UTF16String
    ### Throws:
    * UnicodeError
    "
    len = sizeof(dat)
    # handle zero length input quickly; every element of dat is validated and a
    # terminator slot is added below, so a single element is real data and must
    # not be treated as an empty string
    len == 0 && return empty_utf16
    # get number of words to allocate
    len, flags, num4byte = check_string(dat, len>>>2)
    # one extra word per surrogate pair, plus one for the NUL terminator
    len += num4byte + 1
    # optimized path, no surrogates
    num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
    return encode_to_utf16(dat, len)
end

function convert(::Type{UTF16String}, str::UTF32String)
    "
    Builds a UTF16String from the contents of a UTF32String
    ### Input Arguments:
    * ::Type{UTF16String}
    * str::UTF32String
    ### Returns:
    * ::UTF16String
    ### Throws:
    * UnicodeError
    "
    dat = reinterpret(UInt32, str.data)
    nbytes = sizeof(dat)
    # four bytes or fewer is at most one element: return the shared empty string
    if nbytes <= 4
        return empty_utf16
    end
    # validate every element of dat and learn how many need a surrogate pair
    len, flags, num4byte = check_string(dat, nbytes>>>2)
    if num4byte == 0
        # no surrogate pairs required: element-wise copy into 16-bit words
        @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
    end
    # each surrogate pair takes one additional 16-bit word
    return encode_to_utf16(dat, len + num4byte)
end

function encode_to_utf16(dat, len)
    "
    Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String
    ### Input Arguments:
    * dat::Vector{UInt32} UTF-32 encoded data
    * len length of output in 16-bit words, including the final NULL word
    ### Returns:
    * ::UTF16String
    "
    buf = Vector{UInt16}(len)
    @inbounds buf[len] = 0 # NULL termination
    out = 0
    pos = 0
    # The last slot already holds the terminator, so only len-1 words are encoded.
    # Stopping there keeps the loop from reading one element past the end of dat
    # when the caller passes data that has no trailing NULL of its own.
    nwords = len - 1
    @inbounds while out < nwords
        ch = UInt32(dat[pos += 1])
        if ch > 0xffff
            # Output surrogate pair for 0x10000-0x10ffff
            # (0xd7c0 is 0xd800 - (0x10000 >>> 10), folding the offset into the lead)
            buf[out += 1] = 0xd7c0 + (ch >>> 10)
            ch = 0xdc00 + (ch & 0x3ff)
        end
        buf[out += 1] = ch
    end
    UTF16String(buf)
end

# Vector{Char} input: reinterpret the elements as UInt32 and convert the result to UTF8String
convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat))

# Vector{Char} input: reinterpret the elements as UInt32 and convert the result to UTF16String
convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat))
# A single Char becomes a two-element buffer: the character followed by a NUL terminator.
# NOTE(review): an identical method is defined near the top of this file; confirm whether
# this second definition is intentional.
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])

# ASCII input: pass the string's bytes to fast_utf_copy, with a length one larger
# than the number of bytes.
convert(::Type{UTF32String}, str::ASCIIString) =
    fast_utf_copy(UTF32String, Char, length(str.data)+1, str.data)

# Char vector input: delegate to fast_utf_copy with the vector's own length and a
# trailing `true` argument.
# NOTE(review): the function defined directly after this line has the same signature;
# confirm which of the two definitions is meant to remain.
convert(::Type{UTF32String}, dat::AbstractVector{Char}) = fast_utf_copy(UTF32String, Char, length(dat), dat, true)

function convert(::Type{UTF32String}, data::AbstractVector{Char})
len = length(data)
d = Array(Char, len + 1)
Expand Down Expand Up @@ -51,7 +293,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
convert(Ptr{T}, pointer(s))

function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF32String(Char[0])
isempty(bytes) && return empty_utf32
length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
data = reinterpret(Char, bytes)
# check for byte-order mark (BOM):
Expand Down
33 changes: 25 additions & 8 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1820,12 +1820,16 @@ byt = 0x0
@test_throws UnicodeError Base.check_string(UInt32[0x110000])

# issue #11551 (#11004,#10959)
# Check every conversion between the UTF-8, UTF-16 and UTF-32 forms of one string:
# each form, converted to either of the other two encodings, must equal that form.
function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
    @test utf16(strUTF8) == strUTF16
    @test utf32(strUTF8) == strUTF32
    @test utf8(strUTF16) == strUTF8
    @test utf32(strUTF16) == strUTF32
    @test utf8(strUTF32) == strUTF8
    @test utf16(strUTF32) == strUTF16
end

# Create some ASCII, UTF8, UTF16, and UTF32 strings
strAscii = "abcdefgh"
strA_UTF8 = ("abcdefgh\uff")[1:8]
strL_UTF8 = "abcdef\uff\uff"
Expand All @@ -1844,27 +1848,40 @@ str3_UTF16 = utf16(str3_UTF8)
str4_UTF16 = utf16(str4_UTF8)
strS_UTF16 = utf16(strS_UTF8)

# UTF-32 versions of the test strings, each produced from its UTF-8 counterpart
strA_UTF32 = utf32(strA_UTF8)
strL_UTF32 = utf32(strL_UTF8)
str2_UTF32 = utf32(str2_UTF8)
str3_UTF32 = utf32(str3_UTF8)
str4_UTF32 = utf32(str4_UTF8)
strS_UTF32 = utf32(strS_UTF8)

# The ASCII string must compare equal to itself after conversion to each encoding
@test utf8(strAscii) == strAscii
@test utf16(strAscii) == strAscii
@test utf32(strAscii) == strAscii

# Run the conversion checks on each test string, passing all three encodings
tstcvt(strA_UTF8,strA_UTF16,strA_UTF32)
tstcvt(strL_UTF8,strL_UTF16,strL_UTF32)
tstcvt(str2_UTF8,str2_UTF16,str2_UTF32)
tstcvt(str3_UTF8,str3_UTF16,str3_UTF32)
tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)

# Test converting surrogate pairs
# Each strS_* input, converted to any of the other encodings, must equal strC_UTF8
# (presumably strS_* hold surrogate-pair data and strC_UTF8 the expected result —
# their definitions are not shown here).
@test utf16(strS_UTF8) == strC_UTF8
@test utf32(strS_UTF8) == strC_UTF8
@test utf8(strS_UTF16) == strC_UTF8
@test utf32(strS_UTF16) == strC_UTF8
@test utf8(strS_UTF32) == strC_UTF8
@test utf16(strS_UTF32) == strC_UTF8

# Test converting overlong \0
# Converting strZ_UTF8 to UTF-16 or UTF-32 must equal strz_UTF8; the UTF-8 case is
# disabled below.
# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl)
@test utf16(strZ_UTF8) == strz_UTF8
@test utf32(strZ_UTF8) == strz_UTF8

# Test invalid sequences

byt = 0x0
for T in (UTF16String,) # UTF32String
for T in (UTF16String, UTF32String)
try
# Continuation byte not after lead
for byt in 0x80:0xbf
Expand Down

0 comments on commit 1985663

Please sign in to comment.