Skip to content

Commit

Permalink
add UTF16String (closes #4930)
Browse files Browse the repository at this point in the history
  • Loading branch information
nolta committed Nov 26, 2013
1 parent 2764c01 commit db2c6ef
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 0 deletions.
2 changes: 2 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ export
TmStruct,
Triangular,
Tridiagonal,
UTF16String,
VecOrMat,
Vector,
VersionNumber,
Expand Down Expand Up @@ -817,6 +818,7 @@ export
unescape_string,
uppercase,
utf8,
utf16,
warn,
xdump,

Expand Down
1 change: 1 addition & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ const DL_LOAD_PATH = ByteString[]
include("char.jl")
include("ascii.jl")
include("utf8.jl")
include("utf16.jl")
include("iobuffer.jl")
include("string.jl")
include("regex.jl")
Expand Down
46 changes: 46 additions & 0 deletions base/utf16.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@

# A string represented by its sequence of 16-bit UTF-16 code units.
# `data` holds the raw code units; `next` reads a character outside the
# 16-bit range from two consecutive units (a lead/trail surrogate pair).
immutable UTF16String <: String
    data::Vector{Uint16}
end

# Classification helpers for single UTF-16 code units.
# The surrogate range 0xd800-0xdfff is split into lead surrogates
# (0xd800-0xdbff) and trail surrogates (0xdc00-0xdfff).

# True when `c` is a lead (first-of-pair) surrogate.
function utf16_is_lead(c::Uint16)
    return c >= 0xd800 && c <= 0xdbff
end

# True when `c` is a trail (second-of-pair) surrogate.
function utf16_is_trail(c::Uint16)
    return c >= 0xdc00 && c <= 0xdfff
end

# True when `c` is any surrogate, lead or trail.
function utf16_is_surrogate(c::Uint16)
    return c >= 0xd800 && c <= 0xdfff
end

# Combine a lead/trail surrogate pair into the character it encodes.
# The decoded value is 0x10000 + ((lead - 0xd800) << 10) + (trail - 0xdc00).
# Because 0xd7f7 << 10 == (0xd800 << 10) - 0x10000 + 0xdc00, subtracting
# 0xd7f7 from `lead` before shifting folds all three constants into one.
# NOTE(review): the shifted value needs more than 16 bits; this assumes
# arithmetic on Uint16 operands is carried out in a wider type - confirm.
function utf16_get_supplementary(lead::Uint16, trail::Uint16)
    shifted = (lead - 0xd7f7) << 10
    return char(shifted + trail)
end

# Return the index of the code unit at which the last character of `s`
# starts, or 0 when `s` has no code units. If the final code unit is a
# surrogate, the last character is taken to start one unit earlier.
function endof(s::UTF16String)
    units = s.data
    n = length(units)
    if n == 0
        return n
    end
    if utf16_is_surrogate(units[n])
        return n - 1
    else
        return n
    end
end

# Iteration step: decode the character that starts at code-unit index `i`
# and return it together with the index of the following character.
# A non-surrogate unit is a character by itself; a lead surrogate followed
# by a trail surrogate is decoded as one character spanning two units.
# Raises an error when `i` points at a trail surrogate or at a lead
# surrogate that is not followed by a trail surrogate.
function next(s::UTF16String, i::Int)
    units = s.data
    unit = units[i]
    if utf16_is_surrogate(unit)
        # Only read units[i+1] when it exists and `unit` can start a pair.
        if i < length(units) && utf16_is_lead(unit)
            following = units[i+1]
            if utf16_is_trail(following)
                return utf16_get_supplementary(unit, following), i+2
            end
        end
        error("invalid UTF-16 character index")
    end
    return char(unit), i+1
end

# TODO: optimize this
# Encode every character of `s` as UTF-16 code units and wrap the result
# in a UTF16String.
#   * Values below 0x10000 are stored as a single code unit, unchanged.
#   * Values 0x10000-0x10ffff are stored as a lead/trail surrogate pair.
#   * Values above 0x10ffff cannot be represented in UTF-16 and raise an error.
function encode16(s::String)
    buf = Uint16[]
    for c in s
        if c < 0x10000
            push!(buf, uint16(c))
        elseif c <= 0x10ffff
            # lead = 0xd800 + ((c - 0x10000) >> 10) == 0xd7c0 + (c >> 10).
            # (c >> 10) ranges over 0x40..0x43f here, so it must NOT be
            # masked with 0x3ff: that would drop the top bit for
            # c >= 0x100000 and yield a non-surrogate lead unit.
            push!(buf, uint16(0xd7c0 + (c>>10)))
            # trail = 0xdc00 + low ten bits of c.
            push!(buf, uint16(0xdc00 + (c & 0x3ff)))
        else
            error("invalid Unicode character (> 0x10ffff)")
        end
    end
    UTF16String(buf)
end

# Convert `x` to a UTF16String via `convert`.
function utf16(x)
    return convert(UTF16String, x)
end

# A UTF16String is returned as-is (no copy).
function convert(::Type{UTF16String}, s::UTF16String)
    return s
end

# Any other string is re-encoded character by character.
function convert(::Type{UTF16String}, s::String)
    return encode16(s)
end

# Build a UTF8String by writing each character of `s` to a buffer.
# The number of UTF-16 code units is passed to `sprint` as its first argument.
function convert(::Type{UTF8String}, s::UTF16String)
    return sprint(length(s.data), io -> begin
        for c in s
            write(io, c::Char)
        end
    end)
end

8 changes: 8 additions & 0 deletions test/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,11 @@ for encoding in ["UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"]
rm(joinpath(unicodedir,encoding*".unicode"))
end
rmdir(unicodedir)

# UTF16
# Round-trip a string with three characters above U+FFFF and one ASCII letter.
u8 = "𝕥𝟶f𠂊"
u16 = utf16(u8)
# Four characters in total ...
@test length(u16) == 4
# ... stored as 3 surrogate pairs (6 code units) plus 1 unit for 'f'.
@test length(u16.data) == 7
# Iterating either encoding yields the same characters.
@test collect(u8) == collect(u16)
# Converting back reproduces the original string.
@test utf8(u16) == u8

0 comments on commit db2c6ef

Please sign in to comment.