Skip to content

Commit

Permalink
Rebase against JuliaLang#11575
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jun 6, 2015
1 parent 7115f04 commit 9f26d58
Show file tree
Hide file tree
Showing 7 changed files with 60 additions and 295 deletions.
1 change: 0 additions & 1 deletion base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ include("osutils.jl")
include("utferror.jl")
include("utftypes.jl")
include("utfcheck.jl")
include("utfconvert.jl")
include("char.jl")
include("ascii.jl")
include("utf8.jl")
Expand Down
37 changes: 24 additions & 13 deletions base/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
@inbounds return flag ? S(setindex!(copy!(Vector{T}(len+1),1,dat,1,len),0,len+1)) : S(setindex!(copy!(Vector{T}(len), dat), 0, len))
end

# Finish decoding a 3-byte UTF-8 sequence.
#   dat - indexable collection of bytes holding the sequence
#   pos - index of the final (third) byte; the second byte is read from pos-1
#   ch  - the lead byte, already read by the caller (only its low 4 bits are used)
# Returns the lead bits shifted to bits 12-15, or'ed with the low 6 bits of each
# of the two trailing bytes.
@inline function get_utf8_3byte(dat, pos, ch)
    @inbounds begin
        middle = dat[pos-1]
        tail = dat[pos]
    end
    lead_bits = (ch & 0xf) << 12
    middle_bits = UInt32(middle & 0x3f) << 6
    tail_bits = tail & 0x3f
    return lead_bits | middle_bits | tail_bits
end
# Get rest of character ch from 4-byte UTF-8 sequence in dat
@inline function get_utf8_4byte(dat, pos, ch)
@inbounds return (((ch & 0x7) << 18)
Expand All @@ -23,6 +27,8 @@ end
end
end

# Shared empty UTF-16 string; its data is a single zero UInt16 code unit
const empty_utf16 = UTF16String(UInt16[0])

function length(s::UTF16String)
d = s.data
len = length(d) - 1
Expand Down Expand Up @@ -71,7 +77,7 @@ function reverse(s::UTF16String)
if is_surrogate_lead(ch)
out[i],out[i-1] = out[i-1],ch
else
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
out[i] = ch
end
end
UTF16String(out)
Expand Down Expand Up @@ -143,7 +149,7 @@ function convert(::Type{UTF16String}, str::UTF8String)
# handle zero length string quickly
sizeof(dat) == 0 && return empty_utf16
# Check that is correct UTF-8 encoding and get number of words needed
len, flags, num4byte = check_string_utf8(dat)
len, flags, num4byte = check_string(dat)
len += num4byte
buf = Vector{UInt16}(len+1)
@inbounds buf[len+1] = 0
Expand Down Expand Up @@ -177,19 +183,23 @@ end

function convert(::Type{UTF8String}, dat::Vector{UInt16})
    "
    Converts a UTF-16 encoded vector of UInt16 to a UTF8String

    ### Input Arguments:
    * ::Type{UTF8String}
    * dat::Vector{UInt16}

    ### Returns:
    * UTF8String

    ### Throws:
    * UnicodeError
    "
    len = sizeof(dat)
    # handle zero length string quickly
    # (empty_utf8 is the same constant the UTF16String -> UTF8String method returns)
    len == 0 && return empty_utf8
    # get number of bytes to allocate
    # sizeof is in bytes, so len>>>1 is the number of UInt16 code units to check
    len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>1)
    # NOTE(review): flags == 0 presumably means every character fits in one
    # UTF-8 byte, so the code units can be copied straight across - confirm
    # against check_string
    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
    # Each 2-, 3- and 4-byte character needs 1, 2 and 3 bytes beyond the base count
    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end
Expand All @@ -211,9 +221,9 @@ Converts a UTF16String to a UTF8String
dat = str.data
len = sizeof(dat) >>> 1
# handle zero length string quickly
len <= 1 && return UTF8String("")
len <= 1 && return empty_utf8
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1)
len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end
Expand All @@ -226,7 +236,8 @@ function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
* dat Vector{T}
* len length of output in bytes
@return ::UTF8String
### Returns:
* UTF8String
"
buf = Vector{UInt8}(len)
out = 0
Expand Down
26 changes: 22 additions & 4 deletions base/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,26 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
# The last valid index and the character count are both one less than the
# length of the data vector.
function endof(s::UTF32String)
    return length(s.data) - 1
end

function length(s::UTF32String)
    return length(s.data) - 1
end

# Reverse the first length(s) elements of a copy of the data; the final element
# of the copy is left where it is.
function reverse(s::UTF32String)
    buf = copy(s.data)
    reverse!(buf, 1, length(s))
    return UTF32String(buf)
end

# Generic entry point: defer to the matching convert method
function utf32(x)
    return convert(UTF32String, x)
end

# A single character becomes a two-element vector: the character plus Char(0)
function convert(::Type{UTF32String}, c::Char)
    return UTF32String(Char[c, Char(0)])
end

# Already a UTF32String: return the same object
function convert(::Type{UTF32String}, s::UTF32String)
    return s
end

# Size in bytes of the data vector, less one Char
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end
# Build a UTF32String from any AbstractString by copying its characters into a
# Char array that has one extra slot for the terminator.
function convert(::Type{UTF32String}, s::AbstractString)
    nchars = length(s)
    buf = Array(Char, nchars + 1)
    for (idx, c) in enumerate(s)
        buf[idx] = c
    end
    buf[nchars + 1] = Char(0) # NULL terminate
    return UTF32String(buf)
end

# Build a UTF32String from a vector of Char: copy the characters into a new
# array that is one element longer and store Char(0) in the extra slot.
function convert(::Type{UTF32String}, data::AbstractVector{Char})
    n = length(data)
    buf = Array(Char, n + 1)
    copy!(buf, 1, data, 1, n)
    buf[n + 1] = Char(0) # NULL terminate
    return UTF32String(buf)
end

# Vectors of Int32 or UInt32 are reinterpreted as Char and handed to the
# AbstractVector{Char} method.
function convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T})
    chars = reinterpret(Char, data)
    return convert(UTF32String, chars)
end
Expand All @@ -27,6 +44,9 @@ end
# Both conversions return the string's own data vector (no copy is made)
function convert(::Type{Vector{Char}}, str::UTF32String)
    return str.data
end

function convert(::Type{Array{Char}}, str::UTF32String)
    return str.data
end

# Reverse the first length(s) elements of a copy of the data; the final element
# of the copy is left where it is.
function reverse(s::UTF32String)
    buf = copy(s.data)
    reverse!(buf, 1, length(s))
    return UTF32String(buf)
end

# Size in bytes of the data vector, less one Char
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end

# Convert the pointer returned by pointer(s) to the requested pointer type
function unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String)
    p = pointer(s)
    return convert(Ptr{T}, p)
end

Expand Down Expand Up @@ -59,8 +79,6 @@ function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
end
# Validity of a bare Char vector is checked with the UTF32String method
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end

# Generic entry point: defer to the matching convert method
function utf32(x)
    return convert(UTF32String, x)
end

# Pointer plus explicit length: wrap the memory as an array, then convert it
function utf32(p::Ptr{Char}, len::Integer)
    chars = pointer_to_array(p, len)
    return utf32(chars)
end

# UInt32/Int32 pointers are converted to Ptr{Char} and forwarded to the method above
function utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer)
    cp = convert(Ptr{Char}, p)
    return utf32(cp, len)
end
function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32}))
Expand Down
6 changes: 3 additions & 3 deletions base/utfcheck.jl
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,12 @@ end
function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)}(
dat::T,
len = endof(dat),
pos = start(dat)
pos = (T <: AbstractString) ? start(dat) : 1
; options::Integer = 0)
" Validates and calculates number of characters in a UTF-16 or UTF-32 encoded vector/string
### Input Arguments:
* str Vector of UInt16, UInt32, or an AbstractString
* dat Vector of UInt16, UInt32, or an AbstractString
### Optional Input Arguments:
* len length
Expand Down Expand Up @@ -169,7 +169,7 @@ function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)
elseif ch < 0x800
num2byte += 1
flags |= UTF_UNICODE2
elseif T != Vector{UInt16} && ch > 0x0ffff
elseif ch > 0x0ffff
(ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
num4byte += 1
elseif !is_surrogate_codeunit(ch)
Expand Down
Loading

0 comments on commit 9f26d58

Please sign in to comment.