-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Commit
Fix #10959 bugs with UTF-16 conversions
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,42 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800 | ||
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00 | ||
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800 | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
tkelman
Author
Contributor
|
||
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail) | ||
# Quickly copy and set trailing \0 | ||
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}( | ||
::Type{S}, ::Type{T}, len, dat, flag::Bool=false) | ||
S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1)) | ||
end | ||
|
||
# Get rest of character ch from 3-byte UTF-8 sequence in dat | ||
@inline function get_utf8_3byte(dat, pos, ch) | ||
@inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) | ||
end | ||
# Get rest of character ch from 4-byte UTF-8 sequence in dat | ||
@inline function get_utf8_4byte(dat, pos, ch) | ||
@inbounds return (((ch & 0x7) << 18) | ||
| (UInt32(dat[pos-2] & 0x3f) << 12) | ||
| (UInt32(dat[pos-1] & 0x3f) << 6) | ||
| (dat[pos] & 0x3f)) | ||
end | ||
|
||
# Output a character as a 4-byte UTF-8 sequence | ||
@inline function output_utf8_4byte!(buf, out, ch) | ||
@inbounds begin | ||
buf[out + 1] = 0xf0 | (ch >>> 18) | ||
buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f) | ||
buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f) | ||
buf[out + 4] = 0x80 | (ch & 0x3f) | ||
end | ||
end | ||
|
||
const empty_utf16 = UTF16String(UInt16[0]) | ||
|
||
function length(s::UTF16String) | ||
d = s.data | ||
len = length(d) - 1 | ||
len == 0 && return 0 | ||
cnum = 0 | ||
for i = 1:len | ||
@inbounds cnum += !utf16_is_trail(d[i]) | ||
@inbounds cnum += !is_surrogate_trail(d[i]) | ||
end | ||
cnum | ||
end | ||
|
@@ -20,100 +45,220 @@ function endof(s::UTF16String) | |
d = s.data | ||
i = length(d) - 1 | ||
i == 0 && return i | ||
utf16_is_surrogate(d[i]) ? i-1 : i | ||
return is_surrogate_codeunit(d[i]) ? i-1 : i | ||
end | ||
|
||
get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) | ||
|
||
function next(s::UTF16String, i::Int) | ||
if !utf16_is_surrogate(s.data[i]) | ||
return Char(s.data[i]), i+1 | ||
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1]) | ||
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2 | ||
end | ||
throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0)) | ||
ch = s.data[i] | ||
!is_surrogate_codeunit(ch) && return (Char(ch), i+1) | ||
# check length, account for terminating \0 | ||
i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))) | ||
!is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch)) | ||
ct = s.data[i+1] | ||
!is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch)) | ||
Char(get_supplementary(ch, ct)), i+2 | ||
end | ||
|
||
function reverseind(s::UTF16String, i::Integer) | ||
j = length(s.data) - i | ||
return Base.utf16_is_trail(s.data[j]) ? j-1 : j | ||
return is_surrogate_trail(s.data[j]) ? j-1 : j | ||
end | ||
|
||
lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator | ||
|
||
function reverse(s::UTF16String) | ||
d =s.data | ||
d = s.data | ||
out = similar(d) | ||
out[end] = 0 # NULL termination | ||
n = length(d) | ||
for i = 1:n-1 | ||
out[i] = d[n-i] | ||
if Base.utf16_is_lead(out[i]) | ||
out[i],out[i-1] = out[i-1],out[i] | ||
@inbounds for i = 1:n-1 | ||
ch = d[n-i] | ||
if is_surrogate_lead(ch) | ||
out[i],out[i-1] = out[i-1],ch | ||
else | ||
out[i] = ch | ||
end | ||
end | ||
return UTF16String(out) | ||
UTF16String(out) | ||
end | ||
|
||
# TODO: optimize this | ||
function encode16(s::AbstractString) | ||
buf = UInt16[] | ||
for ch in s | ||
c = reinterpret(UInt32, ch) | ||
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) | ||
|
||
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) | ||
i = 1 | ||
n = length(data) # this may include NULL termination; that's okay | ||
@inbounds while i < n # check for unpaired surrogates | ||
if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1]) | ||
i += 2 | ||
elseif is_surrogate_codeunit(data[i]) | ||
return false | ||
else | ||
i += 1 | ||
end | ||
end | ||
return i > n || !is_surrogate_codeunit(data[i]) | ||
end | ||
|
||
" | ||
Converts an `AbstractString` to a `UTF16String` | ||
### Returns: | ||
* `UTF16String` | ||
### Throws: | ||
* `UnicodeError` | ||
" | ||
function convert(::Type{UTF16String}, str::AbstractString) | ||
len, flags, num4byte = unsafe_checkstring(str) | ||
buf = Vector{UInt16}(len+num4byte+1) | ||
out = 0 | ||
@inbounds for ch in str | ||
c = UInt32(ch) | ||
if c < 0x10000 | ||
push!(buf, UInt16(c)) | ||
elseif c <= 0x10ffff | ||
push!(buf, UInt16(0xd7c0 + (c>>10))) | ||
push!(buf, UInt16(0xdc00 + (c & 0x3ff))) | ||
buf[out += 1] = UInt16(c) | ||
else | ||
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch)) | ||
# output surrogate pair | ||
buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) | ||
buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) | ||
end | ||
end | ||
push!(buf, 0) # NULL termination | ||
@inbounds buf[out + 1] = 0 # NULL termination | ||
UTF16String(buf) | ||
end | ||
|
||
utf16(x) = convert(UTF16String, x) | ||
convert(::Type{UTF16String}, s::UTF16String) = s | ||
convert(::Type{UTF16String}, s::AbstractString) = encode16(s) | ||
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data | ||
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data | ||
" | ||
Converts a `UTF8String` to a `UTF16String` | ||
# TODO: optimize this | ||
convert(::Type{UTF8String}, s::UTF16String) = | ||
sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end) | ||
### Returns: | ||
* `UTF16String` | ||
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) | ||
unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) = | ||
convert(Ptr{T}, pointer(s)) | ||
### Throws: | ||
* `UnicodeError` | ||
" | ||
function convert(::Type{UTF16String}, str::UTF8String) | ||
dat = str.data | ||
# handle zero length string quickly | ||
sizeof(dat) == 0 && return empty_utf16 | ||
# Check that is correct UTF-8 encoding and get number of words needed | ||
len, flags, num4byte = unsafe_checkstring(dat) | ||
len += num4byte | ||
buf = Vector{UInt16}(len+1) | ||
@inbounds buf[len+1] = 0 | ||
# Optimize case where no characters > 0x7f | ||
flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) | ||
out = 0 | ||
pos = 0 | ||
@inbounds while out < len | ||
ch::UInt32 = dat[pos += 1] | ||
# Handle ASCII characters | ||
if ch <= 0x7f | ||
buf[out += 1] = ch | ||
# Handle range 0x80-0x7ff | ||
elseif ch < 0xe0 | ||
buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) | ||
# Handle range 0x800-0xffff | ||
elseif ch < 0xf0 | ||
pos += 2 | ||
buf[out += 1] = get_utf8_3byte(dat, pos, ch) | ||
# Handle range 0x10000-0x10ffff | ||
else | ||
pos += 3 | ||
ch = get_utf8_4byte(dat, pos, ch) | ||
# output surrogate pair | ||
buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) | ||
buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) | ||
end | ||
end | ||
UTF16String(buf) | ||
end | ||
|
||
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) | ||
i = 1 | ||
n = length(data) # this may include NULL termination; that's okay | ||
while i < n # check for unpaired surrogates | ||
if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1]) | ||
i += 2 | ||
elseif utf16_is_surrogate(data[i]) | ||
return false | ||
" | ||
Converts a `UTF16String` to a `UTF8String` | ||
### Returns: | ||
* `UTF8String` | ||
### Throws: | ||
* `UnicodeError` | ||
" | ||
function convert(::Type{UTF8String}, str::UTF16String) | ||
dat = str.data | ||
len = sizeof(dat) >>> 1 | ||
# handle zero length string quickly | ||
len <= 1 && return empty_utf8 | ||
# get number of bytes to allocate | ||
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1) | ||
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) | ||
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) | ||
end | ||
|
||
" | ||
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String` | ||
### Input Arguments: | ||
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted | ||
* `len` length of output in bytes | ||
### Returns: | ||
* `UTF8String` | ||
" | ||
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len) | ||
buf = Vector{UInt8}(len) | ||
out = 0 | ||
pos = 0 | ||
@inbounds while out < len | ||
ch::UInt32 = dat[pos += 1] | ||
# Handle ASCII characters | ||
if ch <= 0x7f | ||
buf[out += 1] = ch | ||
# Handle 0x80-0x7ff | ||
elseif ch < 0x800 | ||
buf[out += 1] = 0xc0 | (ch >>> 6) | ||
buf[out += 1] = 0x80 | (ch & 0x3f) | ||
# Handle 0x10000-0x10ffff (if input is UInt32) | ||
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16 | ||
output_utf8_4byte!(buf, out, ch) | ||
out += 4 | ||
# Handle surrogate pairs | ||
elseif is_surrogate_codeunit(ch) | ||
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1])) | ||
out += 4 | ||
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters | ||
else | ||
i += 1 | ||
buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) | ||
buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) | ||
buf[out += 1] = 0x80 | (ch & 0x3f) | ||
end | ||
end | ||
return i > n || !utf16_is_surrogate(data[i]) | ||
UTF8String(buf) | ||
end | ||
|
||
function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) | ||
!isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) | ||
len = length(data) | ||
d = Array(UInt16, len + 1) | ||
d[end] = 0 # NULL terminate | ||
UTF16String(copy!(d,1, data,1, len)) | ||
function convert(::Type{UTF16String}, str::ASCIIString) | ||
dat = str.data | ||
@inbounds return fast_utf_copy(UTF16String, UInt16, length(dat), dat, true) | ||
end | ||
|
||
convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data | ||
convert(::Type{Array{UInt16}}, str::UTF16String) = str.data | ||
|
||
convert(::Type{UTF16String}, str::UTF16String) = str | ||
|
||
unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) = | ||
convert(Ptr{T}, pointer(s)) | ||
|
||
convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) = | ||
convert(T, reshape(data, length(data))) | ||
|
||
convert(T::Type{UTF16String}, data::AbstractArray{Int16}) = | ||
convert(T, reinterpret(UInt16, data)) | ||
|
||
function convert(::Type{UTF16String}, dat::AbstractVector{UInt16}) | ||
len, flags, num4byte = unsafe_checkstring(dat) | ||
@inbounds return fast_utf_copy(UTF16String, UInt16, len+num4byte, dat, true) | ||
end | ||
|
||
function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) | ||
isempty(bytes) && return UTF16String(UInt16[0]) | ||
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0)) | ||
|
@@ -136,6 +281,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) | |
UTF16String(d) | ||
end | ||
|
||
convert(::Type{UTF16String}, str::UTF16String) = str | ||
|
||
utf16(x) = convert(UTF16String, x) | ||
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len)) | ||
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len) | ||
function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}}) | ||
|
@@ -154,7 +302,7 @@ function map(fun, str::UTF16String) | |
end | ||
uc = reinterpret(UInt32, c2) | ||
if uc < 0x10000 | ||
if utf16_is_surrogate(UInt16(uc)) | ||
if is_surrogate_codeunit(UInt16(uc)) | ||
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc)) | ||
end | ||
push!(buf, UInt16(uc)) | ||
|
Why wasn't
utf16_is_surrogate
deprecated? This broke JSON.jl (and the million packages that depend on it.....)