Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Unicode bugs with UTF-16/UTF-32 conversions (#10959) #11004

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ export
ProcessExitedException,
SystemError,
TypeError,
UnicodeError,
AssertionError,

# Global constants and variables
Expand Down
4 changes: 4 additions & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,15 @@ include("iterator.jl")
include("osutils.jl")

# strings & printing
include("utferror.jl")
include("utftype.jl")
include("utfcheck.jl")
include("char.jl")
include("ascii.jl")
include("utf8.jl")
include("utf16.jl")
include("utf32.jl")
include("utfconvert.jl")
include("iobuffer.jl")
include("string.jl")
include("utf8proc.jl")
Expand Down
131 changes: 24 additions & 107 deletions base/utf16.jl
Original file line number Diff line number Diff line change
@@ -1,27 +1,12 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

# String type backed by a vector of UInt16 code units.
# The inner constructor rejects a vector that is empty or whose last element is not 0,
# so every instance built through it carries a trailing NULL code unit in `data`.
immutable UTF16String <: AbstractString
data::Array{UInt16,1} # includes 16-bit NULL termination after string chars
function UTF16String(data::Vector{UInt16})
if length(data) < 1 || data[end] != 0
throw(ArgumentError("UTF16String data must be NULL-terminated"))
end
new(data)
end
end

# Bit-mask classification of a single UTF-16 code unit:
#   lead      -> top six bits equal those of 0xd800 (0xd800-0xdbff)
#   trail     -> top six bits equal those of 0xdc00 (0xdc00-0xdfff)
#   surrogate -> top five bits equal those of 0xd800 (0xd800-0xdfff, lead or trail)
utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
# Combine a lead/trail pair into a Char. Subtracting 0xd7f7 before the shift folds the
# 0xd800 lead bias, the 0xdc00 trail bias and the 0x10000 offset into one constant
# (0xd7f7 << 10 == (0xd800 << 10) + 0xdc00 - 0x10000).
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)

# Number of characters in `s`. `len` is the code-unit count without the trailing NULL;
# each code unit that is not a trail surrogate is counted as the start of one character.
# NOTE(review): the two `cnum +=` lines look like the removed and the added side of the
# same diff line (helper renamed); presumably only one belongs to any revision.
function length(s::UTF16String)
d = s.data
len = length(d) - 1
len == 0 && return 0
cnum = 0
for i = 1:len
@inbounds cnum += !utf16_is_trail(d[i])
@inbounds cnum += !is_surrogate_trail(d[i])
end
cnum
end
Expand All @@ -30,126 +15,58 @@ function endof(s::UTF16String)
d = s.data
i = length(d) - 1
i == 0 && return i
utf16_is_surrogate(d[i]) ? i-1 : i
return is_surrogate_codepoint(d[i]) ? i-1 : i
end

# Combine a surrogate lead/trail pair into a UInt32 value. The single constant 0xd7f7
# replaces three separate adjustments: 0xd7f7 << 10 == (0xd800 << 10) + 0xdc00 - 0x10000.
get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)

# Iteration step: returns a tuple of the character that starts at code-unit index `i`
# and the index of the code unit after it.
#   - a non-surrogate unit yields Char(unit) and i+1
#   - a lead unit followed by a trail unit yields the combined character and i+2
#   - anything else is reported as an error
# NOTE(review): this hunk shows two bodies back to back. The first (using the
# utf16_* helpers) ends in a single ArgumentError; the second (starting at
# `ch = s.data[i]`) calls utf_errfunc with a distinct UTF_ERR_* code per failure.
# utf_errfunc is not defined in this view; presumably it throws - confirm.
function next(s::UTF16String, i::Int)
if !utf16_is_surrogate(s.data[i])
return Char(s.data[i]), i+1
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
end
throw(ArgumentError("invalid UTF-16 character index"))
ch = s.data[i]
!is_surrogate_codepoint(ch) && return (Char(ch), i+1)
# check length, account for terminating \0
i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))
!is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch)
ct = s.data[i+1]
# the value passed on a bad trail is the lead unit `ch`, not `ct`
!is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i, ch)
Char(get_supplementary(ch, ct)), i+2
end

# Computes j = length(s.data) - i and, when the code unit at j is a trail surrogate,
# returns j-1 instead (presumably so the result lands on the pair's lead unit).
# NOTE(review): the two `return` lines look like the removed and added side of one
# diff line; they differ only in the helper name.
function reverseind(s::UTF16String, i::Integer)
j = length(s.data) - i
return Base.utf16_is_trail(s.data[j]) ? j-1 : j
return is_surrogate_trail(s.data[j]) ? j-1 : j
end

# Index of the last code unit that precedes the terminator.
lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator

# Returns a new UTF16String whose code units are those of `s` in reverse order, with
# surrogate pairs kept in lead-then-trail order.
# NOTE(review): the two `d = ...` lines look like the removed and added side of one
# diff line (whitespace fix only).
function reverse(s::UTF16String)
d =s.data
d = s.data
out = similar(d)
out[end] = 0 # NULL termination
n = length(d)
for i = 1:n-1
# d[n] is the terminator, so d[n-i] walks the real code units from last to first
out[i] = d[n-i]
if Base.utf16_is_lead(out[i])
# a plain reversal leaves the trail at i-1 and the lead at i; swap them back
out[i],out[i-1] = out[i-1],out[i]
end
end
return UTF16String(out)
end

# TODO: optimize this
function encode16(s::AbstractString)
buf = UInt16[]
for ch in s
c = reinterpret(UInt32, ch)
if c < 0x10000
push!(buf, UInt16(c))
elseif c <= 0x10ffff
push!(buf, UInt16(0xd7c0 + (c>>10)))
push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
@inbounds for i = 1:n-1
ch = d[n-i]
if is_surrogate_lead(ch)
out[i],out[i-1] = out[i-1],ch
else
throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)"))
out[i] = ch
end
end
push!(buf, 0) # NULL termination
UTF16String(buf)
UTF16String(out)
end

# `utf16(x)` is shorthand for converting `x` to UTF16String.
utf16(x) = convert(UTF16String, x)
# Converting a UTF16String to its own type returns the same object.
convert(::Type{UTF16String}, s::UTF16String) = s
# Any other string goes through encode16.
convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
# Both array conversions return the string's own `data` vector (no copy is made here),
# which still includes the trailing NULL code unit.
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data

# TODO: optimize this
# Builds the UTF8String by iterating the characters of `s` and writing each one to the
# IO object supplied by sprint; length(s.data)-1 (code units without the terminator)
# is passed as sprint's first argument.
convert(::Type{UTF8String}, s::UTF16String) =
sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)

# Size in bytes of the code units, not counting the one-unit NULL terminator.
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
# Pointer conversion for Int16/UInt16 element types, derived from pointer(s).
unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
convert(Ptr{T}, pointer(s))

# Returns false as soon as an unpaired surrogate is found in `data`, true otherwise.
# The loop stops one short of the end because it looks at data[i+1]:
#   - lead followed by trail -> skip both
#   - any other surrogate    -> invalid
#   - any other unit         -> skip one
# After the loop, either the index has run past the end or exactly one unit is left,
# and that unit must not be a surrogate.
# NOTE(review): the `while`, `if` and `elseif` lines each appear twice; they look like
# the removed and added side of the same diff lines (helper renames plus @inbounds).
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
i = 1
n = length(data) # this may include NULL termination; that's okay
while i < n # check for unpaired surrogates
if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
@inbounds while i < n # check for unpaired surrogates
if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
i += 2
elseif utf16_is_surrogate(data[i])
elseif is_surrogate_codepoint(data[i])
return false
else
i += 1
end
end
return i > n || !utf16_is_surrogate(data[i])
end

# Build a UTF16String from a vector of code units that has no terminator of its own.
# Throws ArgumentError if isvalid rejects the data. Otherwise the data is copied into
# a new array one element longer, whose last element is set to 0.
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
len = length(data)
d = Array(UInt16, len + 1)
d[end] = 0 # NULL terminate
UTF16String(copy!(d,1, data,1, len))
end

# Arrays of any shape are reshaped to a one-dimensional array of the same length and
# converted again.
convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
convert(T, reshape(data, length(data)))

# Int16 arrays are reinterpreted as UInt16 and converted again.
convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
convert(T, reinterpret(UInt16, data))

# Build a UTF16String from raw bytes.
#   - empty input gives a string holding only the terminator
#   - an odd byte count throws ArgumentError
#   - a leading 0xfeff is dropped and the remaining units are copied unchanged
#   - a leading 0xfffe is dropped and each remaining unit is byte-swapped
#   - otherwise every unit is copied unchanged
# In every branch `d` has exactly one slot beyond the copied units; it is set to 0.
# The result is checked with isvalid before the string is constructed.
function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF16String(UInt16[0])
isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
data = reinterpret(UInt16, bytes)
# check for byte-order mark (BOM):
if data[1] == 0xfeff # native byte order
# same length as `data`: the BOM's slot is reused for the terminator
d = Array(UInt16, length(data))
copy!(d,1, data,2, length(data)-1)
elseif data[1] == 0xfffe # byte-swapped
d = Array(UInt16, length(data))
for i = 2:length(data)
d[i-1] = bswap(data[i])
end
else
d = Array(UInt16, length(data) + 1)
copy!(d,1, data,1, length(data)) # assume native byte order
end
d[end] = 0 # NULL terminate
!isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
UTF16String(d)
end

# Pointer plus explicit length: wrap the memory with pointer_to_array and convert it.
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
# Int16 pointers are converted to UInt16 pointers and handled by the method above.
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
function utf16(p::Union(Ptr{UInt16}, Ptr{Int16}))
len = 0
while unsafe_load(p, len+1) != 0; len += 1; end
utf16(p, len)
return i > n || !is_surrogate_codepoint(data[i])
end
93 changes: 4 additions & 89 deletions base/utf32.jl
Original file line number Diff line number Diff line change
@@ -1,113 +1,28 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

## UTF-32 in the native byte order, i.e. plain old character arrays ##

# String type backed by a vector of Char.
# The inner constructor rejects a vector that is empty or whose last element is not
# Char(0), so every instance built through it carries a trailing NULL in `data`.
immutable UTF32String <: DirectIndexString
data::Vector{Char} # includes 32-bit NULL termination after string chars

function UTF32String(a::Vector{Char})
if length(a) < 1 || a[end] != Char(0)
throw(ArgumentError("UTF32String data must be NULL-terminated"))
end
new(a)
end
end
# UInt32 vectors are reinterpreted as Char and passed to the constructor above, so the
# same NULL-termination check applies.
UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))

# UTF-32 basic functions
# One element of `data` is one character, so iteration advances by one, and both the
# last index and the length are the element count minus the NULL terminator.
next(s::UTF32String, i::Int) = (s.data[i], i+1)
endof(s::UTF32String) = length(s.data) - 1
length(s::UTF32String) = length(s.data) - 1

# `utf32(x)` is shorthand for converting `x` to UTF32String.
utf32(x) = convert(UTF32String, x)
# A single Char becomes a one-character string: the char followed by the terminator.
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
# Converting a UTF32String to its own type returns the same object.
convert(::Type{UTF32String}, s::UTF32String) = s

# Convert any string by iterating its characters into a Char array sized
# length(s) + 1; the extra last slot holds the NULL terminator.
function convert(::Type{UTF32String}, s::AbstractString)
a = Array(Char, length(s) + 1)
i = 0
for c in s
a[i += 1] = c
end
a[end] = Char(0) # NULL terminate
UTF32String(a)
end

# Build a UTF32String from a vector of Char that has no terminator of its own: the data
# is copied into a new array one element longer, whose last element is set to Char(0).
# No validity check is made on the characters here.
function convert(::Type{UTF32String}, data::AbstractVector{Char})
len = length(data)
d = Array(Char, len + 1)
d[end] = Char(0) # NULL terminate
UTF32String(copy!(d,1, data,1, len))
end

# Int32/UInt32 vectors are reinterpreted as Char and converted again.
convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) =
convert(UTF32String, reinterpret(Char, data))

# Generic route from a Char vector to any string type: go through utf32 first.
convert{T<:AbstractString}(::Type{T}, v::AbstractVector{Char}) = convert(T, utf32(v))

# specialize for performance reasons:
# Prints each Char straight into an IOBuffer and converts the resulting string to T.
# The buffer is created over an array of length(data) bytes and then truncated to
# zero before printing (presumably the initial array serves as a size hint - confirm).
function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
s = IOBuffer(Array(UInt8,length(data)), true, true)
truncate(s,0)
for x in data
print(s, x)
end
convert(T, takebuf_string(s))
end

# Both array conversions return the string's own `data` vector (no copy is made here),
# which still includes the trailing NULL.
convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
convert(::Type{Array{Char}}, s::UTF32String) = s.data

# Reverse a copy of the data over the range 1:length(s), which leaves the terminator
# in the last slot untouched.
reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))

# Size in bytes of the characters, not counting the one-element NULL terminator.
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
# Pointer conversion for Int32/UInt32/Char element types, derived from pointer(s).
unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
convert(Ptr{T}, pointer(s))

# Build a UTF32String from raw bytes.
#   - empty input gives a string holding only the terminator
#   - a byte count that is not a multiple of 4 throws ArgumentError
#   - a leading Char(0x0000feff) is dropped and the remaining elements copied unchanged
#   - a leading Char(0xfffe0000) is dropped and each remaining element is byte-swapped
#   - otherwise every element is copied unchanged
# In every branch `d` has exactly one slot beyond the copied elements; it is set to
# Char(0). No validity check is made on the characters in this method.
function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF32String(Char[0])
length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
data = reinterpret(Char, bytes)
# check for byte-order mark (BOM):
if data[1] == Char(0x0000feff) # native byte order
# same length as `data`: the BOM's slot is reused for the terminator
d = Array(Char, length(data))
copy!(d,1, data, 2, length(data)-1)
elseif data[1] == Char(0xfffe0000) # byte-swapped
d = Array(Char, length(data))
for i = 2:length(data)
d[i-1] = bswap(data[i])
end
else
d = Array(Char, length(data) + 1)
copy!(d, 1, data, 1, length(data)) # assume native byte order
end
d[end] = Char(0) # NULL terminate
UTF32String(d)
end

# Returns true only if every element of `str` passes isvalid(Char, ...) when given as
# a UInt32; stops at the first element that fails.
# NOTE(review): the two `@inbounds if` lines look like the removed and added side of
# one diff line; they differ in using reinterpret(UInt32, x) versus UInt32(x).
function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
for i=1:length(str)
@inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
@inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
end
return true
end
# A bare Char vector is validated as UTF32String data.
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
# For the four concrete string types, validation is delegated to the check on the
# string's `data` field.
# NOTE(review): the next two lines look like the removed (one-argument) and added
# (type plus value) side of one diff line.
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)

# Pointer plus explicit length: wrap the memory with pointer_to_array and convert it.
utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
# UInt32/Int32 pointers are converted to Char pointers and handled by the method above.
utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
# Pointer without a length: count elements up to (not including) the first zero
# element, then use the explicit-length methods. The scan has no upper bound, so the
# memory must contain a zero element.
function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32}))
len = 0
while unsafe_load(p, len+1) != 0; len += 1; end
utf32(p, len)
end

function map(f, s::UTF32String)
d = s.data
out = similar(d)
out[end] = Char(0)
out[end] = 0

for i = 1:(length(d)-1)
@inbounds for i = 1:(length(d)-1)
c2 = f(d[i])
if !isa(c2, Char)
throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
Expand Down
Loading