JuliaLang · JeffBezanson · Jun 4, 2014 · May 28, 2014 · May 29, 2014 · May 29, 2014
diff --git a/NEWS.md b/NEWS.md
@@ -162,9 +162,15 @@ Library improvements
 
     * Triple-quoted regex strings, `r"""..."""` ([#4934]).
 
-    * New string type, `UTF16String` ([#4930]).
-
-    * `CharString` is renamed to `UTF32String` ([#4943]).
+    * New string type, `UTF16String` ([#4930]), constructed by `utf16(s)`
+      from another string, a `Uint16` array, or a byte array (possibly
+      prefixed by a byte-order marker to indicate endian-ness).  Its
+      data is internally `NULL`-terminated for passing to C ([#7016]).
+
+    * `CharString` is renamed to `UTF32String` ([#4943]), and its data
+      is now internally `NULL`-terminated for passing to C ([#7016]).
+      `CharString(c::Char...)` is deprecated in favor of `utf32(c...)`,
+      and `utf32(s)` otherwise has functionality similar to `utf16(s)`.
 
     * `normalize_string` function to perform Unicode normalization,
       case-folding, and other transformations ([#5576]).

diff --git a/base/char.jl b/base/char.jl
@@ -50,4 +50,4 @@ sizeof(::Type{Char}) = 4
 ## printing & showing characters ##
 
 print(io::IO, c::Char) = (write(io,c); nothing)
-show(io::IO, c::Char) = (print(io,'\''); print_escaped(io,UTF32String(c),"'"); print(io,'\''))
+show(io::IO, c::Char) = (print(io,'\''); print_escaped(io,utf32(c),"'"); print(io,'\''))
diff --git a/base/deprecated.jl b/base/deprecated.jl
@@ -398,6 +398,7 @@ const Stat = StatStruct
 
 export CharString
 const CharString = UTF32String
+@deprecate UTF32String(c::Integer...) utf32(c...)
 
 export Ranges
 const Ranges = Range

diff --git a/base/string.jl b/base/string.jl
@@ -561,33 +561,6 @@ end
 endof(s::GenericString) = endof(s.string)
 next(s::GenericString, i::Int) = next(s.string, i)
 
-## plain old character arrays ##
-
-immutable UTF32String <: DirectIndexString
-    data::Array{Char,1}
-
-    UTF32String(a::Array{Char,1}) = new(a)
-    UTF32String(c::Char...) = new([ c[i] for i=1:length(c) ])
-end
-UTF32String(x...) = UTF32String(map(char,x)...)
-
-next(s::UTF32String, i::Int) = (s.data[i], i+1)
-endof(s::UTF32String) = length(s.data)
-length(s::UTF32String) = length(s.data)
-
-utf32(x) = convert(UTF32String, x)
-convert(::Type{UTF32String}, s::UTF32String) = s
-convert(::Type{UTF32String}, s::String) = UTF32String(Char[c for c in s])
-convert{T<:String}(::Type{T}, v::Vector{Char}) = convert(T, UTF32String(v))
-convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
-convert(::Type{Array{Char}}, s::UTF32String) = s.data
-
-reverse(s::UTF32String) = UTF32String(reverse(s.data))
-
-sizeof(s::UTF32String) = sizeof(s.data)
-convert{T<:Union(Int32,Uint32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
-    convert(Ptr{T}, s.data)
-
 ## substrings reference original strings ##
 
 immutable SubString{T<:String} <: String

diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -77,6 +77,7 @@ include("char.jl")
 include("ascii.jl")
 include("utf8.jl")
 include("utf16.jl")
+include("utf32.jl")
 include("iobuffer.jl")
 include("string.jl")
 include("utf8proc.jl")

diff --git a/base/utf16.jl b/base/utf16.jl
@@ -1,5 +1,11 @@
 immutable UTF16String <: String
-    data::Array{Uint16,1}
+    data::Array{Uint16,1} # includes 16-bit NULL termination after string chars
+    function UTF16String(data::Vector{Uint16})
+        if length(data) < 1 || data[end] != 0
+            throw(ArgumentError("UTF16String data must be NULL-terminated"))
+        end
+        new(data)
+    end
 end
 
 utf16_is_lead(c::Uint16) = (c & 0xfc00) == 0xd800
@@ -9,15 +15,14 @@ utf16_get_supplementary(lead::Uint16, trail::Uint16) = char((lead-0xd7f7)<<10 +
 
 function endof(s::UTF16String)
     d = s.data
-    i = length(d)
+    i = length(d) - 1
     i == 0 && return i
     utf16_is_surrogate(d[i]) ? i-1 : i
 end
-
 function next(s::UTF16String, i::Int)
     if !utf16_is_surrogate(s.data[i])
         return char(s.data[i]), i+1
-    elseif length(s.data) > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
+    elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
         return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
     end
     error("invalid UTF-16 character index")
@@ -34,24 +39,27 @@ function encode16(s::String)
             push!(buf, uint16(0xdc00 + c & 0x3ff))
         end
     end
+    push!(buf, 0) # NULL termination
     UTF16String(buf)
 end
 
 utf16(x) = convert(UTF16String, x)
 convert(::Type{UTF16String}, s::UTF16String) = s
 convert(::Type{UTF16String}, s::String) = encode16(s)
-convert(::Type{UTF8String}, s::UTF16String) =
-    sprint(length(s.data), io->for c in s; write(io,c::Char); end)
 convert(::Type{Array{Uint16,1}}, s::UTF16String) = s.data
 convert(::Type{Array{Uint16}}, s::UTF16String) = s.data
 
-sizeof(s::UTF16String) = sizeof(s.data)
+# TODO: optimize this
+convert(::Type{UTF8String}, s::UTF16String) =
+    sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
+
+sizeof(s::UTF16String) = sizeof(s.data) - sizeof(Uint16)
 convert{T<:Union(Int16,Uint16)}(::Type{Ptr{T}}, s::UTF16String) =
     convert(Ptr{T}, s.data)
 
 function is_valid_utf16(data::Array{Uint16})
     i = 1
-    n = length(data)
+    n = length(data) # this may include NULL termination; that's okay
     while i < n # check for unpaired surrogates
         if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
             i += 2
@@ -66,21 +74,35 @@ end
 
 is_valid_utf16(s::UTF16String) = is_valid_utf16(s.data)
 
-function convert(::Type{UTF16String}, data::Array{Uint16})
+function convert(::Type{UTF16String}, data::Vector{Uint16})
     !is_valid_utf16(data) && throw(ArgumentError("invalid UTF16 data"))
-    UTF16String(data)
+    len = length(data)
+    d = Array(Uint16, len + 1)
+    d[end] = 0 # NULL terminate
+    UTF16String(copy!(d,1, data,1, len))
 end
 
+convert(T::Type{UTF16String}, data::Array{Uint16}) =
+    convert(T, reshape(data, length(data)))
+
 function convert(T::Type{UTF16String}, bytes::Array{Uint8})
-    isempty(bytes) && return UTF16String(Uint16[])
+    isempty(bytes) && return UTF16String(Uint16[0])
     isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
     data = reinterpret(Uint16, bytes)    
     # check for byte-order mark (BOM):
     if data[1] == 0xfeff        # native byte order
-        convert(T, data[2:end])
+        d = Array(Uint16, length(data))
+        copy!(d,1, data,2, length(data)-1)
     elseif data[1] == 0xfffe    # byte-swapped
-        convert(T, Uint16[bswap(data[i]) for i=2:length(data)])
+        d = Array(Uint16, length(data))
+        for i = 2:length(data)
+            d[i-1] = bswap(data[i])
+        end
     else
-        convert(T, copy(data)) # assume native byte order
+        d = Array(Uint16, length(data) + 1)
+        copy!(d,1, data,1, length(data)) # assume native byte order
     end
+    d[end] = 0 # NULL terminate
+    !is_valid_utf16(d) && throw(ArgumentError("invalid UTF16 data"))
+    UTF16String(d)
 end
diff --git a/base/utf32.jl b/base/utf32.jl
@@ -0,0 +1,90 @@
+## UTF-32 in the native byte order, i.e. plain old character arrays ##
+
+immutable UTF32String <: DirectIndexString
+    data::Array{Char,1} # includes 32-bit NULL termination after string chars
+
+    function UTF32String(a::Array{Char,1})
+        if length(a) < 1 || a[end] != 0
+            throw(ArgumentError("UTF32String data must be NULL-terminated"))
+        end
+        new(a)
+    end
+end
+
+next(s::UTF32String, i::Int) = (s.data[i], i+1)
+endof(s::UTF32String) = length(s.data) - 1
+length(s::UTF32String) = length(s.data) - 1
+
+function utf32(c::Integer...)
+    a = Array(Char, length(c) + 1)
+    for i = 1:length(c)
+        a[i] = c[i]
+    end
+    a[end] = 0
+    UTF32String(a)
+end
+
+utf32(x) = convert(UTF32String, x)
+convert(::Type{UTF32String}, s::UTF32String) = s
+
+function convert(::Type{UTF32String}, s::String)
+    a = Array(Char, length(s) + 1)
+    i = 0
+    for c in s
+        a[i += 1] = c
+    end
+    a[end] = 0 # NULL terminate
+    UTF32String(a)
+end
+
+function convert(::Type{UTF32String}, data::Vector{Char})
+    len = length(data)
+    d = Array(Char, len + 1)
+    d[end] = 0 # NULL terminate
+    UTF32String(copy!(d,1, data,1, len))
+end
+
+convert{T<:Union(Int32,Uint32)}(::Type{UTF32String}, data::Vector{T}) =
+    convert(UTF32String, reinterpret(Char, data))
+
+convert{T<:String}(::Type{T}, v::Vector{Char}) = convert(T, utf32(v)) # utf32 appends NULL
+
+# specialize for performance reasons:
+function convert{T<:ByteString}(::Type{T}, data::Vector{Char})
+    s = IOBuffer(Array(Uint8,length(data)), true, true)
+    truncate(s,0)
+    for x in data
+        print(s, x)
+    end
+    convert(T, takebuf_string(s))
+end
+
+convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
+convert(::Type{Array{Char}}, s::UTF32String) = s.data
+
+reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
+
+sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
+convert{T<:Union(Int32,Uint32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
+    convert(Ptr{T}, s.data)
+
+function convert(T::Type{UTF32String}, bytes::Array{Uint8})
+    isempty(bytes) && return UTF32String(Char[0])
+    length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
+    data = reinterpret(Char, bytes)    
+    # check for byte-order mark (BOM):
+    if data[1] == 0x0000feff        # native byte order
+        d = Array(Char, length(data))
+        copy!(d,1, data,2, length(data)-1)
+    elseif data[1] == 0xfffe0000    # byte-swapped
+        d = Array(Char, length(data))
+        for i = 2:length(data)
+            d[i-1] = bswap(data[i])
+        end
+    else
+        d = Array(Char, length(data) + 1)
+        copy!(d,1, data,1, length(data)) # assume native byte order
+    end
+    d[end] = 0 # NULL terminate
+    UTF32String(d)
+end
diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -1388,10 +1388,39 @@ Strings
    byte arrays check for a byte-order marker in the first two bytes,
    and do not include it in the resulting string.)
 
+   Note that the resulting ``UTF16String`` data is terminated by a NULL
+   codepoint (16-bit zero), which is not treated as a character in the
+   string (so that it is mostly invisible in Julia); this allows the
+   string to be passed directly to external functions requiring
+   NULL-terminated data.  This NULL is appended automatically by the
+   ``utf16(s)`` conversion function.  If you have a ``Uint16`` array
+   ``A`` that is already NULL-terminated valid UTF-16 data, then you
+   can instead use ``UTF16String(A)`` to construct the string without
+   making a copy of the data, treating the NULL as a terminator
+   rather than as part of the string.
+
 .. function:: is_valid_utf16(s) -> Bool
 
    Returns true if the string or ``Uint16`` array is valid UTF-16.
 
+.. function:: utf32(s)
+
+   Create a UTF-32 string from a byte array, array of ``Uint32``, or
+   any other string type.  (Conversions of byte arrays check for a
+   byte-order marker in the first four bytes, and do not include it in
+   the resulting string.)
+
+   Note that the resulting ``UTF32String`` data is terminated by a NULL
+   codepoint (32-bit zero), which is not treated as a character in the
+   string (so that it is mostly invisible in Julia); this allows the
+   string to be passed directly to external functions requiring
+   NULL-terminated data.  This NULL is appended automatically by the
+   ``utf32(s)`` conversion function.  If you have a ``Uint32`` array
+   ``A`` that is already NULL-terminated UTF-32 data, then you
+   can instead use ``UTF32String(A)`` to construct the string without
+   making a copy of the data, treating the NULL as a terminator
+   rather than as part of the string.
+
 I/O
 ---
 

diff --git a/test/strings.jl b/test/strings.jl
@@ -816,7 +816,7 @@ bin_val = hex2bytes("07bf")
 @test sizeof(RopeString("abc","def")) == 6
 
 # issue #3597
-@test string(UTF32String(['T', 'e', 's', 't'])[1:1], "X") == "TX"
+@test string(utf32(['T', 'e', 's', 't'])[1:1], "X") == "TX"
 
 # issue #3710
 @test prevind(SubString("{var}",2,4),4) == 3