-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix #10959 bugs with UTF-16 conversions #11551
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,42 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800 | ||
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00 | ||
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800 | ||
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail) | ||
# Quickly copy and set trailing \0 | ||
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}( | ||
::Type{S}, ::Type{T}, len, dat, flag::Bool=false) | ||
S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1)) | ||
end | ||
|
||
# Get rest of character ch from 3-byte UTF-8 sequence in dat | ||
@inline function get_utf8_3byte(dat, pos, ch) | ||
@inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) | ||
end | ||
# Get rest of character ch from 4-byte UTF-8 sequence in dat | ||
@inline function get_utf8_4byte(dat, pos, ch) | ||
@inbounds return (((ch & 0x7) << 18) | ||
| (UInt32(dat[pos-2] & 0x3f) << 12) | ||
| (UInt32(dat[pos-1] & 0x3f) << 6) | ||
| (dat[pos] & 0x3f)) | ||
end | ||
|
||
# Output a character as a 4-byte UTF-8 sequence | ||
@inline function output_utf8_4byte!(buf, out, ch) | ||
@inbounds begin | ||
buf[out + 1] = 0xf0 | (ch >>> 18) | ||
buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f) | ||
buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f) | ||
buf[out + 4] = 0x80 | (ch & 0x3f) | ||
end | ||
end | ||
|
||
const empty_utf16 = UTF16String(UInt16[0]) | ||
|
||
function length(s::UTF16String) | ||
d = s.data | ||
len = length(d) - 1 | ||
len == 0 && return 0 | ||
cnum = 0 | ||
for i = 1:len | ||
@inbounds cnum += !utf16_is_trail(d[i]) | ||
@inbounds cnum += !is_surrogate_trail(d[i]) | ||
end | ||
cnum | ||
end | ||
|
@@ -20,100 +45,220 @@ function endof(s::UTF16String) | |
d = s.data | ||
i = length(d) - 1 | ||
i == 0 && return i | ||
utf16_is_surrogate(d[i]) ? i-1 : i | ||
return is_surrogate_codeunit(d[i]) ? i-1 : i | ||
end | ||
|
||
get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) | ||
|
||
function next(s::UTF16String, i::Int) | ||
if !utf16_is_surrogate(s.data[i]) | ||
return Char(s.data[i]), i+1 | ||
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1]) | ||
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2 | ||
end | ||
throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0)) | ||
ch = s.data[i] | ||
!is_surrogate_codeunit(ch) && return (Char(ch), i+1) | ||
# check length, account for terminating \0 | ||
i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))) | ||
!is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch)) | ||
ct = s.data[i+1] | ||
!is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch)) | ||
Char(get_supplementary(ch, ct)), i+2 | ||
end | ||
|
||
function reverseind(s::UTF16String, i::Integer) | ||
j = length(s.data) - i | ||
return Base.utf16_is_trail(s.data[j]) ? j-1 : j | ||
return is_surrogate_trail(s.data[j]) ? j-1 : j | ||
end | ||
|
||
lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator | ||
|
||
function reverse(s::UTF16String) | ||
d =s.data | ||
d = s.data | ||
out = similar(d) | ||
out[end] = 0 # NULL termination | ||
n = length(d) | ||
for i = 1:n-1 | ||
out[i] = d[n-i] | ||
if Base.utf16_is_lead(out[i]) | ||
out[i],out[i-1] = out[i-1],out[i] | ||
@inbounds for i = 1:n-1 | ||
ch = d[n-i] | ||
if is_surrogate_lead(ch) | ||
out[i],out[i-1] = out[i-1],ch | ||
else | ||
out[i] = ch | ||
end | ||
end | ||
return UTF16String(out) | ||
UTF16String(out) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. leave the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's not the convention in the surrounding code, and the code before any of my changes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It shows up in the diff as you deleting the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That was just making it more consistent with most all of the other locations in the utf*.jl files. |
||
end | ||
|
||
# TODO: optimize this | ||
function encode16(s::AbstractString) | ||
buf = UInt16[] | ||
for ch in s | ||
c = reinterpret(UInt32, ch) | ||
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) | ||
|
||
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) | ||
i = 1 | ||
n = length(data) # this may include NULL termination; that's okay | ||
@inbounds while i < n # check for unpaired surrogates | ||
if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1]) | ||
i += 2 | ||
elseif is_surrogate_codeunit(data[i]) | ||
return false | ||
else | ||
i += 1 | ||
end | ||
end | ||
return i > n || !is_surrogate_codeunit(data[i]) | ||
end | ||
|
||
" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i dislike this particular comment on the sole grounds that it provides no additional information in 14 lines that was not provided by the subsequent one line of code ( i agree that base could use more frequent comments (and generally appreciate the inline comments you've put in this PR), but let's not write text just for the sake of writing text. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's not true, it gives both the return type of the function, and lists what errors are thrown. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, but all information which is provided by the signature should not be repeated in the comment. The doc system should be smart enough to extract that automatically. (I think I said that in another comment already.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please go look at the latest version - there is information there that could not be extracted from the signature. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And yes, you are correct, and I totally agree, that if the information is provided by the signature, it doesn't need to be repeated. The signature also does not indicate whether a parameter is used for input, output, or both, something that my original doxygen format comments made nicely explicit, with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's better, but if you think in terms of how the docs will be presented when they will be supported in Base, the generic The rest of the doc looks fine to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. About the comment on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, but There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, the lines between |
||
Converts an `AbstractString` to a `UTF16String` | ||
|
||
### Returns: | ||
* `UTF16String` | ||
|
||
### Throws: | ||
* `UnicodeError` | ||
" | ||
function convert(::Type{UTF16String}, str::AbstractString) | ||
len, flags, num4byte = unsafe_checkstring(str) | ||
buf = Vector{UInt16}(len+num4byte+1) | ||
out = 0 | ||
@inbounds for ch in str | ||
c = UInt32(ch) | ||
if c < 0x10000 | ||
push!(buf, UInt16(c)) | ||
elseif c <= 0x10ffff | ||
push!(buf, UInt16(0xd7c0 + (c>>10))) | ||
push!(buf, UInt16(0xdc00 + (c & 0x3ff))) | ||
buf[out += 1] = UInt16(c) | ||
else | ||
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch)) | ||
# output surrogate pair | ||
buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) | ||
buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) | ||
end | ||
end | ||
push!(buf, 0) # NULL termination | ||
@inbounds buf[out + 1] = 0 # NULL termination | ||
UTF16String(buf) | ||
end | ||
|
||
utf16(x) = convert(UTF16String, x) | ||
convert(::Type{UTF16String}, s::UTF16String) = s | ||
convert(::Type{UTF16String}, s::AbstractString) = encode16(s) | ||
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data | ||
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data | ||
" | ||
Converts a `UTF8String` to a `UTF16String` | ||
|
||
# TODO: optimize this | ||
convert(::Type{UTF8String}, s::UTF16String) = | ||
sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end) | ||
### Returns: | ||
* `UTF16String` | ||
|
||
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) | ||
unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) = | ||
convert(Ptr{T}, pointer(s)) | ||
### Throws: | ||
* `UnicodeError` | ||
" | ||
function convert(::Type{UTF16String}, str::UTF8String) | ||
dat = str.data | ||
# handle zero length string quickly | ||
sizeof(dat) == 0 && return empty_utf16 | ||
# Check that is correct UTF-8 encoding and get number of words needed | ||
len, flags, num4byte = unsafe_checkstring(dat) | ||
len += num4byte | ||
buf = Vector{UInt16}(len+1) | ||
@inbounds buf[len+1] = 0 | ||
# Optimize case where no characters > 0x7f | ||
flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) | ||
out = 0 | ||
pos = 0 | ||
@inbounds while out < len | ||
ch::UInt32 = dat[pos += 1] | ||
# Handle ASCII characters | ||
if ch <= 0x7f | ||
buf[out += 1] = ch | ||
# Handle range 0x80-0x7ff | ||
elseif ch < 0xe0 | ||
buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) | ||
# Handle range 0x800-0xffff | ||
elseif ch < 0xf0 | ||
pos += 2 | ||
buf[out += 1] = get_utf8_3byte(dat, pos, ch) | ||
# Handle range 0x10000-0x10ffff | ||
else | ||
pos += 3 | ||
ch = get_utf8_4byte(dat, pos, ch) | ||
# output surrogate pair | ||
buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) | ||
buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) | ||
end | ||
end | ||
UTF16String(buf) | ||
end | ||
|
||
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) | ||
i = 1 | ||
n = length(data) # this may include NULL termination; that's okay | ||
while i < n # check for unpaired surrogates | ||
if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1]) | ||
i += 2 | ||
elseif utf16_is_surrogate(data[i]) | ||
return false | ||
" | ||
Converts a `UTF16String` to a `UTF8String` | ||
|
||
### Returns: | ||
* `UTF8String` | ||
|
||
### Throws: | ||
* `UnicodeError` | ||
" | ||
function convert(::Type{UTF8String}, str::UTF16String) | ||
dat = str.data | ||
len = sizeof(dat) >>> 1 | ||
# handle zero length string quickly | ||
len <= 1 && return empty_utf8 | ||
# get number of bytes to allocate | ||
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1) | ||
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) | ||
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) | ||
end | ||
|
||
" | ||
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String` | ||
|
||
### Input Arguments: | ||
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted | ||
* `len` length of output in bytes | ||
|
||
### Returns: | ||
* `UTF8String` | ||
" | ||
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That was more for the future, where I want the first argument to really be an Encoding, see @quinnj's nice Strings.jl that he's been working on. |
||
buf = Vector{UInt8}(len) | ||
out = 0 | ||
pos = 0 | ||
@inbounds while out < len | ||
ch::UInt32 = dat[pos += 1] | ||
# Handle ASCII characters | ||
if ch <= 0x7f | ||
buf[out += 1] = ch | ||
# Handle 0x80-0x7ff | ||
elseif ch < 0x800 | ||
buf[out += 1] = 0xc0 | (ch >>> 6) | ||
buf[out += 1] = 0x80 | (ch & 0x3f) | ||
# Handle 0x10000-0x10ffff (if input is UInt32) | ||
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16 | ||
output_utf8_4byte!(buf, out, ch) | ||
out += 4 | ||
# Handle surrogate pairs | ||
elseif is_surrogate_codeunit(ch) | ||
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1])) | ||
out += 4 | ||
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters | ||
else | ||
i += 1 | ||
buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) | ||
buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) | ||
buf[out += 1] = 0x80 | (ch & 0x3f) | ||
end | ||
end | ||
return i > n || !utf16_is_surrogate(data[i]) | ||
UTF8String(buf) | ||
end | ||
|
||
function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) | ||
!isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) | ||
len = length(data) | ||
d = Array(UInt16, len + 1) | ||
d[end] = 0 # NULL terminate | ||
UTF16String(copy!(d,1, data,1, len)) | ||
function convert(::Type{UTF16String}, str::ASCIIString) | ||
dat = str.data | ||
@inbounds return fast_utf_copy(UTF16String, UInt16, length(dat), dat, true) | ||
end | ||
|
||
convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data | ||
convert(::Type{Array{UInt16}}, str::UTF16String) = str.data | ||
|
||
convert(::Type{UTF16String}, str::UTF16String) = str | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe that was there before - I'm pretty sure all I did was change s -> str. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, just wasn't showing up very clearly in the diff. Seems like a vacuous definition though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wondered about those myself, but There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One thought just came to me... since strings are not (currently 😀) validated, maybe those There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@nalimilan The string conversion functions (except for some bugs) all have the contract that they validate the string. However, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the only behavior that makes sense for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not paranoia, it's just that since the type constructors do not guarantee a valid string (in fact, they basically let you get away with anything... because they take There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe the only way to solve this, is to move all validation/conversion into the inner constructors... which would also eliminate the hole of making a mutable string... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that there is already a generic convert definition for |
||
|
||
unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) = | ||
convert(Ptr{T}, pointer(s)) | ||
|
||
convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) = | ||
convert(T, reshape(data, length(data))) | ||
|
||
convert(T::Type{UTF16String}, data::AbstractArray{Int16}) = | ||
convert(T, reinterpret(UInt16, data)) | ||
|
||
function convert(::Type{UTF16String}, dat::AbstractVector{UInt16}) | ||
len, flags, num4byte = unsafe_checkstring(dat) | ||
@inbounds return fast_utf_copy(UTF16String, UInt16, len+num4byte, dat, true) | ||
end | ||
|
||
function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) | ||
isempty(bytes) && return UTF16String(UInt16[0]) | ||
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0)) | ||
|
@@ -136,6 +281,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) | |
UTF16String(d) | ||
end | ||
|
||
convert(::Type{UTF16String}, str::UTF16String) = str | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but this was about fixing bugs, not removing stuff, I was told many times not to do too many things in a single PR, so I was planning on addressing that after hopefully these bug fix PRs get in. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It turns out, that that line in julia> @which convert(UTF16String, utf16("foo"))
convert(::Type{UTF16String}, s::AbstractString) at utf16.jl:75 which is defined as: convert(::Type{UTF16String}, s::AbstractString) = encode16(s) So I don't think I can do anything about those, they are really there for a reason, to keep the AbstractString version from being used. |
||
|
||
utf16(x) = convert(UTF16String, x) | ||
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len)) | ||
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len) | ||
function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}}) | ||
|
@@ -154,7 +302,7 @@ function map(fun, str::UTF16String) | |
end | ||
uc = reinterpret(UInt32, c2) | ||
if uc < 0x10000 | ||
if utf16_is_surrogate(UInt16(uc)) | ||
if is_surrogate_codeunit(UInt16(uc)) | ||
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc)) | ||
end | ||
push!(buf, UInt16(uc)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the purpose of the
flag
argument, since you always seem to passtrue
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That was for the conversion that @tkelman absolutely did not want in the PR (which is probably the most useful one for me), conversion of
Vector{UInt16}
toUTF8String
, which I've removed.I'd rather keep the flag, so that this can be used elsewhere (and it won't affect the code generated anyway, nice thing about Julia!)