From dd5b4793faa304802507eb983cf0c98c7ea20d83 Mon Sep 17 00:00:00 2001
From: Scott Paul Jones <scottjones@alum.mit.edu>
Date: Wed, 6 May 2015 23:58:38 -0400
Subject: [PATCH 1/3] Fix #10959 bugs with Unicode string conversions Make most
 common conversions 2-22x faster Add standard error reporting functionality
 (which can be extended later on for locale specific error messages) Improve
 error messages, reporting invalid character position and value Add in-line
 documentation Add validation functions for `UTF-8`, `UTF-16`, `UTF-32` (and
 `AbstractString`) Add testing of different `UTF` encoding issues:     Long
 encoding of `\0` (sometimes known as `Modified UTF-8`)     Overlong encodings
 of other values     Encoding of surrogate pairs as 2 3-byte sequences (i.e.
 `CESU-8`)     Invalid bytes (`0xF5..0xFF`)     Unexpected continuation bytes 
    Lead byte with missing continuation bytes     4-byte sequence starting
 with `0xF4`, that represents value > `0x10ffff` Allow conversion (but not
 production) of `Modified UTF-8` and `CESU-8`     (used by Java, Oracle,
 MySQL, and many others) See http://en.wikipedia.org/wiki/UTF-8 for discussion
 of these issues and more

---
 base/exports.jl |   1 +
 base/sysimg.jl  |   3 +-
 base/utf.jl     | 993 ++++++++++++++++++++++++++++++++++++++++++++++++
 base/utf16.jl   | 155 --------
 base/utf32.jl   | 118 ------
 test/strings.jl | 149 ++++++++
 6 files changed, 1144 insertions(+), 275 deletions(-)
 create mode 100644 base/utf.jl
 delete mode 100644 base/utf16.jl
 delete mode 100644 base/utf32.jl

diff --git a/base/exports.jl b/base/exports.jl
index 7547a09d791a0..660a3b5a5517f 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -164,6 +164,7 @@ export
     ProcessExitedException,
     SystemError,
     TypeError,
+    UnicodeError,
     AssertionError,
 
 # Global constants and variables
diff --git a/base/sysimg.jl b/base/sysimg.jl
index 73ead07c577f3..b44340b0216a9 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -87,8 +87,7 @@ include("osutils.jl")
 include("char.jl")
 include("ascii.jl")
 include("utf8.jl")
-include("utf16.jl")
-include("utf32.jl")
+include("utf.jl")
 include("iobuffer.jl")
 include("string.jl")
 include("utf8proc.jl")
diff --git a/base/utf.jl b/base/utf.jl
new file mode 100644
index 0000000000000..b170973406b43
--- /dev/null
+++ b/base/utf.jl
@@ -0,0 +1,993 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#=
+@doc """
+@brief      Error messages for Unicode / UTF support
+""" ->
+=#
+
+const UTF_ERR_SHORT = 1
+const UTF_ERR_CONT  = 2
+const UTF_ERR_LONG  = 3
+const UTF_ERR_NOT_LEAD = 4
+const UTF_ERR_NOT_TRAIL = 5
+const UTF_ERR_NOT_SURROGATE = 6
+const UTF_ERR_MISSING_SURROGATE = 7
+const UTF_ERR_INVALID = 8
+const UTF_ERR_SURROGATE = 9
+const UTF_ERR_NULL_16_TERMINATE = 10
+const UTF_ERR_NULL_32_TERMINATE = 11
+const UTF_ERR_MAX = 11
+# Message templates indexed by the UTF_ERR_* codes; utf_errfunc fills <<1>> (position) and <<2>> (hex value)
+const errMsgs = [
+    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)",
+    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
+    "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
+    "not a leading Unicode surrogate character at index <<1>> (0x<<2>>)",
+    "not a trailing Unicode surrogate character at index <<1>> (0x<<2>>)",
+    "not a valid Unicode surrogate character at index <<1>> (0x<<2>>)",
+    "missing trailing Unicode surrogate character after index <<1>> (0x<<2>>)",
+    "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
+    "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
+    "UTF16String data must be NULL-terminated",
+    "UTF32String data must be NULL-terminated"
+]
+#=
+@doc """
+@brief      Raises the ArgumentError describing a Unicode / UTF encoding problem
+
+@param[in]  errcode One of the UTF_ERR_* codes (index into errMsgs)
+@param[in]  charpos Position of the offending byte or character (replaces <<1>>)
+@param[in]  invchar Offending byte or character, shown in hex (replaces <<2>>)
+
+@throws     ArgumentError, unconditionally; this function never returns
+""" ->
+=#
+function utf_errfunc(errcode::Integer, charpos, invchar)
+    (1 <= errcode <= UTF_ERR_MAX) ||
+        throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar"))
+    template = replace(errMsgs[errcode], "<<1>>", string(charpos))
+    throw(ArgumentError(replace(template, "<<2>>", hex(invchar))))
+end
+
+#=
+@doc """
+@brief      UTF-16 string in native byte order; the data vector ends with a 16-bit NULL word
+""" ->
+=#
+immutable UTF16String <: AbstractString
+    data::Vector{UInt16} # string code units followed by the 16-bit NULL terminator
+    function UTF16String(data::Vector{UInt16})
+        # reject vectors that are empty or do not end with the terminator
+        (isempty(data) || data[end] != 0) &&
+            utf_errfunc(UTF_ERR_NULL_16_TERMINATE, 0, 0)
+        new(data)
+    end
+end
+
+#=
+@doc """
+@brief      UTF-32 string in native byte order; the data vector ends with a 32-bit NULL word
+""" ->
+=#
+immutable UTF32String <: DirectIndexString
+    data::Vector{Char} # string characters followed by the 32-bit NULL terminator
+
+    function UTF32String(data::Vector{Char})
+        # reject vectors that are empty or do not end with the terminator
+        (isempty(data) || data[end] != Char(0)) &&
+            utf_errfunc(UTF_ERR_NULL_32_TERMINATE, 0, 0)
+        new(data)
+    end
+end
+UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))
+
+const empty_utf16 = UTF16String(UInt16[0]) # shared empty string: terminator only
+const empty_utf32 = UTF32String(UInt32[0]) # shared empty string: terminator only
+
+is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800) # leading (high) surrogates 0xd800-0xdbff
+is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00) # trailing (low) surrogates 0xdc00-0xdfff
+is_surrogate_char(c::Unsigned) = ((c & ~0x007ff) == 0xd800) # either kind of surrogate, 0xd800-0xdfff
+is_valid_continuation(c) = ((c & 0xc0) == 0x80) # top two bits are 10, i.e. bytes 0x80-0xbf
+
+function length(s::UTF16String)
+    units = s.data
+    n = length(units) - 1 # number of code units, excluding the NULL terminator
+    n == 0 && return 0
+    nchars = 0
+    for k = 1:n # every unit except a trailing surrogate starts a character
+        @inbounds nchars += !is_surrogate_trail(units[k])
+    end
+    nchars
+end
+
+function endof(s::UTF16String)
+    units = s.data
+    idx = length(units) - 1 # last code unit before the NULL terminator
+    idx == 0 && return idx
+    return idx - Int(is_surrogate_char(units[idx])) # a surrogate in the last slot: step back one unit
+end
+
+
+get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) # combine a surrogate pair
+# Decode the character starting at code unit i; returns (character, index of the next character)
+function next(s::UTF16String, i::Int)
+    ch = s.data[i]
+    !is_surrogate_char(ch) && return (Char(ch), i+1)
+    # check length, account for terminating \0
+    i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))
+    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch)
+    ct = s.data[i+1]
+    !is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i+1, ct) # report the offending unit itself
+    Char(get_supplementary(ch, ct)), i+2
+end
+
+function reverseind(s::UTF16String, i::Integer)
+    idx = length(s.data) - i # mirrored position; s.data includes the NULL terminator
+    return is_surrogate_trail(s.data[idx]) ? idx-1 : idx # land on the lead unit of a pair
+end
+
+lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
+
+function reverse(s::UTF16String) # reverses code units while keeping each surrogate pair in lead, trail order
+    d = s.data
+    out = similar(d)
+    out[end] = 0 # NULL termination
+    n = length(d)
+    @inbounds for i = 1:n-1 # i indexes out; d is read back to front, skipping its terminator
+        ch = d[n-i]
+        if is_surrogate_lead(ch)
+            out[i],out[i-1] = out[i-1],ch # move the unit written last iteration after this lead
+        else
+            out[i] = ch
+        end
+    end
+    UTF16String(out)
+end
+
+next(s::UTF32String, i::Int) = (s.data[i], i+1) # each index holds one whole character
+endof(s::UTF32String) = length(s.data) - 1 # s.data includes the NULL terminator
+length(s::UTF32String) = length(s.data) - 1 # s.data includes the NULL terminator
+
+const UTF_NO_LONG_NULL = 1      # option bit: don't accept 0xc0 0x80 for '\0'
+const UTF_NO_SURROGATES = 2     # option bit: don't accept surrogate pairs in UTF-8/UTF-32
+const UTF_ACCEPT_LONG = 4       # option bit: accept long encodings (other than long null in UTF-8)
+# Flag bits returned by the check_string_* functions (second element of the result tuple)
+const UTF_LONG = 1              # Long encodings are present
+const UTF_LATIN1 = 2            # characters in range 0x80-0xFF present
+const UTF_UNICODE2 = 4          # characters in range 0x100-0x7ff present
+const UTF_UNICODE3 = 8          # characters in range 0x800-0xd7ff, 0xe000-0xffff
+const UTF_UNICODE4 = 16         # non-BMP characters present
+const UTF_SURROGATE = 32        # surrogate pairs present
+
+# Read the byte after pos as a UTF-8 continuation byte (error if it is not one); returns (ch with 6 more bits, new pos)
+@inline function get_continuation(ch::UInt32, str, pos)
+    cont::UInt8 = str[pos += 1]
+    is_valid_continuation(cont) || utf_errfunc(UTF_ERR_CONT, pos, cont)
+    return ((ch << 6) | (cont & 0x3f), pos)
+end
+
+#=
+@doc """
+@brief      Validates and calculates number of characters in a UTF-8 encoded vector
+
+@param[in]  dat     Vector of UInt8
+@param[in]  options flags to determine error handling (default 0), see UTF_NO_LONG_NULL etc.
+
+@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
+@throws     ArgumentError
+""" ->
+=#
+function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
+    local byt::UInt8, ch::UInt32, surr::UInt32
+    local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0
+    pos = 0
+    len = sizeof(dat)
+    @inbounds while pos < len
+        ch = dat[pos += 1]
+        totalchar += 1
+        if ch > 0x7f
+            # Check UTF-8 encoding
+            if ch < 0xe0
+                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff); 0x80-0xbf can never start a sequence
+                (ch < 0xc0) && throw(ArgumentError("invalid UTF-8 sequence starting at index $pos (0x$(hex(ch)) is an unexpected continuation byte)"))
+                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch, pos = get_continuation(ch & 0x3f, dat, pos)
+                if ch > 0x7f
+                    num2byte += 1
+                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    flags |= UTF_LONG
+                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
+                    flags |= UTF_LONG
+                else
+                    utf_errfunc(UTF_ERR_LONG, pos, ch)
+                end
+            elseif ch < 0xf0
+                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
+                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch, pos = get_continuation(ch & 0x0f, dat, pos)
+                ch, pos = get_continuation(ch, dat, pos)
+                # check for surrogate pairs, make sure correct
+                if is_surrogate_char(ch)
+                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
+                    # next character *must* be a trailing surrogate character
+                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
+                    byt = dat[pos += 1] ; (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
+                    surr, pos = get_continuation(0x0000d, dat, pos)
+                    surr, pos = get_continuation(surr, dat, pos)
+                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
+                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
+                    flags |= UTF_SURROGATE
+                    num4byte += 1
+                elseif ch > 0x07ff
+                    num3byte += 1
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    flags |= UTF_LONG
+                    num2byte += 1
+                else
+                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
+                end
+            elseif ch < 0xf5
+                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
+                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch, pos = get_continuation(ch & 0x07, dat, pos)
+                ch, pos = get_continuation(ch, dat, pos)
+                ch, pos = get_continuation(ch, dat, pos)
+                if ch > 0x10ffff
+                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
+                elseif ch > 0xffff
+                    num4byte += 1
+                elseif is_surrogate_char(ch)
+                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    # This is an overly long encode character
+                    flags |= UTF_LONG
+                    if ch > 0x7ff
+                        num3byte += 1
+                    elseif ch > 0x7f
+                        num2byte += 1
+                    end
+                else
+                    utf_errfunc(UTF_ERR_LONG, pos-3, ch) # pos-3 is the lead byte of the 4-byte sequence
+                end
+            else
+                utf_errfunc(UTF_ERR_INVALID, pos, ch)
+            end
+        end
+    end
+    totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte
+end
+
+#=
+@doc """
+@brief      Validates and calculates number of characters in a UTF-16 string
+
+@param[in]  dat     Vector{UInt16}
+@param[in]  len     number of 16-bit words at the start of dat to check
+
+@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
+@throws     ArgumentError
+""" ->
+=#
+function check_string_utf16(dat::Vector{UInt16}, len::Int)
+    local ch::UInt32
+    local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0
+    local pos = 0
+    @inbounds while pos < len
+        ch = dat[pos += 1]
+        totalchar += 1
+        if ch > 0x7f
+            if ch < 0x100
+                num2byte += 1
+                flags |= UTF_LATIN1
+            elseif ch < 0x800
+                num2byte += 1
+                flags |= UTF_UNICODE2
+            elseif !is_surrogate_char(ch)
+                num3byte += 1
+            elseif is_surrogate_lead(ch)
+                pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
+                # next character *must* be a trailing surrogate character
+                ch = dat[pos += 1]
+                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
+                num4byte += 1 # the pair was counted once in totalchar
+            else
+                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
+            end
+        end
+    end
+    totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte
+end
+
+#=
+@doc """
+@brief      Validates and calculates number of characters in a UTF-32 string
+
+@param[in]  dat     Vector{UInt32}
+@param[in]  len     number of 32-bit words at the start of dat to check
+@param[in]  options flags to determine error handling (default 0)
+@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
+@throws     ArgumentError
+""" ->
+=#
+function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
+    local ch::UInt32
+    local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0
+    local pos = 0
+    @inbounds while pos < len
+        ch = dat[pos += 1]
+        totalchar += 1
+        if ch > 0x7f
+            if ch < 0x100
+                num2byte += 1
+                flags |= UTF_LATIN1
+            elseif ch < 0x800
+                num2byte += 1
+                flags |= UTF_UNICODE2
+            elseif ch > 0xffff
+                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
+                num4byte += 1
+            elseif !is_surrogate_char(ch)
+                num3byte += 1
+            elseif is_surrogate_lead(ch)
+                pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
+                # next character *must* be a trailing surrogate character
+                ch = dat[pos += 1]
+                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
+                num4byte += 1 # the pair was counted once in totalchar
+                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch)
+                flags |= UTF_SURROGATE
+            else
+                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
+            end
+        end
+    end
+    totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte
+end
+
+# Validates an AbstractString (options as for check_string_utf32); returns (total characters, flags, 4-byte, 3-byte, 2-byte)
+function check_string_abs(str::AbstractString, options::Integer=0)
+    local ch::UInt32
+    local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0
+    local pos = start(str), len = endof(str)
+    @inbounds while pos <= len # pos is the index of the next character, so endof(str) itself must be visited
+        ch, pos = next(str, pos)
+        totalchar += 1
+        if ch > 0x7f
+            if ch < 0x100
+                num2byte += 1
+                flags |= UTF_LATIN1
+            elseif ch < 0x800
+                num2byte += 1
+                flags |= UTF_UNICODE2
+            elseif ch > 0xffff
+                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
+                num4byte += 1
+            elseif !is_surrogate_char(ch)
+                num3byte += 1
+            elseif is_surrogate_lead(ch)
+                pos > len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) # lead was the last character
+                # next character *must* be a trailing surrogate character
+                ch, pos = next(str, pos)
+                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
+                num4byte += 1
+                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch)
+                flags |= UTF_SURROGATE
+            else
+                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
+            end
+        end
+    end
+    totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte
+end
+
+# Return T1 built from a new Vector{T2} of length len: dat is copied to the front and element len is set to 0
+macro return_fast_utf_copy(T1, T2, len, dat)
+    quote
+        @inbounds return $(esc(T1))(setindex!(copy!(Vector{$(esc(T2))}($(esc(len))), $(esc(dat))), 0, $(esc(len))))
+    end
+end
+
+# Combine lead byte ch of a 2-byte UTF-8 sequence with the next byte of str; advances pos by 1 (no validation)
+macro get_utf8_2!(str, pos, ch)
+    quote
+        (($(esc(ch)) & 0x1f) << 6) | ($(esc(str))[$(esc(pos)) += 1] & 0x3f)
+    end
+end
+
+# Combine lead byte ch of a 3-byte UTF-8 sequence with the next two bytes of str; advances pos by 2 (no validation)
+macro get_utf8_3!(str, pos, ch)
+    quote
+        ($(esc(pos)) += 2 ;
+         (($(esc(ch)) & 0xf) << 12)
+          | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6)
+          | ($(esc(str))[$(esc(pos))] & 0x3f))
+    end
+end
+
+# Combine lead byte ch of a 4-byte UTF-8 sequence with the next three bytes of str; advances pos by 3 (no validation)
+macro get_utf8_4!(str, pos, ch)
+    quote
+        ($(esc(pos)) += 3 ;
+         (($(esc(ch)) & 0x7) << 18)
+          | (UInt32($(esc(str))[$(esc(pos))-2] & 0x3f) << 12)
+          | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6)
+          | ($(esc(str))[$(esc(pos))] & 0x3f))
+    end
+end
+
+# Decode the 3-byte sequence that follows pos (used for a trailing surrogate); advances pos by 3 (no validation)
+macro get_utf8_surr!(str, pos)
+    quote
+        ($(esc(pos)) += 3 ;
+         ((UInt32($(esc(str))[$(esc(pos))-2] & 0xf) << 12)
+          | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6)
+          | ($(esc(str))[$(esc(pos))] & 0x3f)))
+    end
+end
+
+# Store ch as a 2-byte UTF-8 sequence in buf after index out; out is advanced by 2
+macro output_utf8_2!(buf, out, ch)
+    quote
+        $(esc(buf))[$(esc(out)) += 1] = 0xc0 | ($(esc(ch)) >>> 6)
+        $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f)
+    end
+end
+# Store ch as a 3-byte UTF-8 sequence in buf after index out; out is advanced by 3
+macro output_utf8_3!(buf, out, ch)
+    quote
+        $(esc(buf))[$(esc(out)) += 1] = 0xe0 | (($(esc(ch)) >>> 12) & 0x3f)
+        $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 6) & 0x3f)
+        $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f)
+    end
+end
+# Store ch as a 4-byte UTF-8 sequence in buf after index out; out is advanced by 4
+macro output_utf8_4!(buf, out, ch)
+    quote
+        $(esc(buf))[$(esc(out)) += 1] = 0xf0 | ($(esc(ch)) >>> 18)
+        $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 12) & 0x3f)
+        $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 6) & 0x3f)
+        $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f)
+    end
+end
+
+# Store ch as a UTF-16 surrogate pair (lead, then trail) in buf after index out; out is advanced by 2
+macro output_utf16_surr!(buf, out, ch)
+    quote
+        $(esc(buf))[$(esc(out)) += 1] = UInt16(0xd7c0 + ($(esc(ch)) >>> 10))
+        $(esc(buf))[$(esc(out)) += 1] = UInt16(0xdc00 + ($(esc(ch)) & 0x3ff))
+    end
+end
+
+#=
+"""
+@brief      Builds a UTF16String from the characters of any AbstractString
+
+@param[in]  ::Type{UTF16String}
+@param[in]  str::AbstractString
+
+@return     ::UTF16String
+@throws     ArgumentError
+"""
+=#
+function convert(::Type{UTF16String}, str::AbstractString)
+    nchars, flags, nsupp = check_string_abs(str)
+    words = Vector{UInt16}(nchars+nsupp+1) # one extra word per 4-byte character, plus the terminator
+    widx = 0
+    @inbounds for chr in str
+        code = reinterpret(UInt32, chr)
+        if code >= 0x10000
+            @output_utf16_surr!(words, widx, code)
+        else
+            words[widx += 1] = UInt16(code)
+        end
+    end
+    @inbounds words[widx + 1] = 0 # NULL termination
+    UTF16String(words)
+end
+
+# Converts an AbstractString to a UTF32String (throws ArgumentError if str is invalid).
+# check_string_abs counts a surrogate pair as one character, so each pair is merged
+# into the single supplementary character it encodes instead of being copied as two.
+function convert(::Type{UTF32String}, str::AbstractString)
+    len, flags = check_string_abs(str)
+    buf = Vector{Char}(len+1)
+    out = 0
+    lead = UInt32(0)
+    @inbounds for ch in str
+        c = reinterpret(UInt32, ch)
+        if is_surrogate_lead(c)
+            lead = c # hold it until the trailing surrogate that the check requires to follow
+        else
+            buf[out += 1] = is_surrogate_trail(c) ? Char(get_supplementary(lead, c)) : ch
+        end
+    end
+    @inbounds buf[out + 1] = 0 # NULL termination
+    UTF32String(buf)
+end
+
+#=
+@doc """
+@brief      Converts a UTF-8 encoded string to UTF-16 encoding
+
+@param[in]  str::UTF8String
+
+@return     ::UTF16String
+@throws     ArgumentError
+""" ->
+=#
+function convert(::Type{UTF16String}, str::UTF8String)
+    dat = str.data
+    # handle zero length string quickly
+    sizeof(dat) == 0 && return empty_utf16
+    # Check that is correct UTF-8 encoding and get number of words needed
+    len, flags, num4byte = check_string_utf8(dat)
+    len += num4byte
+    buf = Vector{UInt16}(len+1)
+    buf[len+1] = 0
+    # Optimize case where no characters > 0x7f
+    flags == 0 && @inbounds return UTF16String(copy!(buf, dat))
+    out::UInt = 0
+    pos::UInt = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle range 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = @get_utf8_2!(dat, pos, ch)
+        # Handle range 0x800-0xffff
+        elseif ch < 0xf0
+            buf[out += 1] = @get_utf8_3!(dat, pos, ch)
+        # Handle range 0x10000-0x10ffff
+        else
+            ch = @get_utf8_4!(dat, pos, ch)
+            @output_utf16_surr!(buf, out, ch)
+        end
+    end
+    UTF16String(buf)
+end
+
+#=
+@doc """
+@brief      Reencodes a vector of UTF-16 code units using UTF-8 encoding
+
+@param[in]  dat::Vector{UInt16}
+
+@return     ::UTF8String
+@throws     ArgumentError
+""" ->
+=#
+function convert(::Type{UTF8String}, dat::Vector{UInt16})
+    len = sizeof(dat)
+    # handle zero length string quickly
+    len == 0 && return UTF8String("")
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat)) # no flags: nothing above 0x7f
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+#=
+@doc """
+@brief      Converts a UTF16String to a UTF8String
+
+@param[in]  str::UTF16String
+
+@return     ::UTF8String
+@throws     ArgumentError
+""" ->
+=#
+function convert(::Type{UTF8String}, str::UTF16String)
+    dat = str.data
+    len = sizeof(dat) >>> 1 # number of 16-bit words, NULL terminator included
+    # handle zero length string quickly
+    len <= 1 && return UTF8String("")
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) # no flags: nothing above 0x7f
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+#=
+@doc """
+@brief      Encodes a vector of UInt32 to a UTF8String
+
+@param[in]  dat::Vector{UInt32}
+
+@return     ::UTF8String
+@throws     ArgumentError
+""" ->
+=#
+function convert(::Type{UTF8String}, dat::Vector{UInt32})
+    len = sizeof(dat)
+    # handle zero length string quickly
+    len == 0 && return UTF8String("")
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) # no flags: nothing above 0x7f
+    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+#=
+@doc """
+@brief      Converts a UTF32String to a UTF8String
+
+@param[in]  str::UTF32String
+
+@return     ::UTF8String
+@throws     ArgumentError
+""" ->
+=#
+function convert(::Type{UTF8String},  str::UTF32String)
+    dat = reinterpret(UInt32, str.data)
+    len = sizeof(dat) >>> 2 # number of 32-bit words, NULL terminator included
+    # handle zero length string quickly
+    len <= 1 && return UTF8String("")
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) # no flags: nothing above 0x7f
+    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+#=
+@doc """
+@brief      Encodes an already validated vector of UInt16 or UInt32 as UTF-8
+
+@param[in]  T           type (UInt16 or UInt32)
+@param[in]  dat         Vector{T}
+@param[in]  len         length of output in bytes; encoding stops once len bytes are written
+
+@return     ::UTF8String
+""" ->
+=#
+function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
+    buf = Vector{UInt8}(len)
+    out::UInt = 0
+    pos::UInt = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle 0x80-0x7ff
+        elseif ch < 0x800
+            @output_utf8_2!(buf, out, ch)
+        # Handle 0x10000-0x10ffff (if input is UInt32)
+        elseif T == UInt32 && ch > 0xffff
+            @output_utf8_4!(buf, out, ch)
+        # Handle surrogate pairs
+        elseif is_surrogate_char(ch)
+            ch = get_supplementary(ch, dat[pos += 1]) # consumes the next element as the trailing surrogate
+            @output_utf8_4!(buf, out, ch)
+        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
+        else
+            @output_utf8_3!(buf, out, ch)
+        end
+    end
+    UTF8String(buf)
+end
+
+#=
+"""
+@brief      Converts a UTF-8 encoded string to UTF-32 encoding
+
+@param[in]  str::UTF8String
+
+@return     ::UTF32String
+@throws     ArgumentError
+"""
+=#
+function convert(::Type{UTF32String}, str::UTF8String)
+    dat = str.data
+    # handle zero length string quickly
+    sizeof(dat) == 0 && return empty_utf32
+    # Validate UTF-8 encoding, and get number of words to create
+    len, flags = check_string_utf8(dat)
+    # Optimize case where no characters > 0x7f
+    totlen = len+1
+    flags == 0 && @return_fast_utf_copy(UTF32String, Char, totlen, dat)
+    # has multi-byte UTF-8 sequences
+    buf = Vector{Char}(totlen)
+    @inbounds buf[totlen] = 0 # NULL termination
+    local ch::UInt32
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle range 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = @get_utf8_2!(dat, pos, ch)
+        # Handle range 0x800-0xffff
+        elseif ch < 0xf0
+            ch = @get_utf8_3!(dat, pos, ch)
+            # Handle surrogate pairs (should have been encoded in 4 bytes)
+            if is_surrogate_lead(ch)
+                # Build up 32-bit character from ch and trailing surrogate in next 3 bytes
+                ch = get_supplementary(ch, @get_utf8_surr!(dat, pos))
+            end
+            buf[out += 1] = ch
+        # Handle range 0x10000-0x10ffff
+        else
+            buf[out += 1] = @get_utf8_4!(dat, pos, ch)
+        end
+    end
+    UTF32String(buf)
+end
+
+#=
+"""
+@brief      Converts a UTF16String to UTF32String
+
+@param[in]  str::UTF16String
+
+@return     ::UTF32String
+@throws     ArgumentError
+"""
+=#
+function convert(::Type{UTF32String}, str::UTF16String)
+    dat = str.data
+    len = sizeof(dat)
+    # handle zero length string quickly (account for trailing \0)
+    len <= 2 && return empty_utf32
+    # get number of words to create
+    len, flags, num4byte = check_string_utf16(dat, len>>>1) # the terminator is checked and counted too
+    # No surrogate pairs, do optimized copy
+    (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
+    local ch::UInt32
+    buf = Vector{Char}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len # the final word copied is the terminator taken from dat
+        ch = dat[pos += 1]
+        # check for surrogate pair
+        if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end
+        buf[out += 1] = ch
+    end
+    UTF32String(buf)
+end
+
+#=
+"""
+@brief      Converts a Vector of UInt32 to a UTF16String
+
+@param[in]  dat::Vector{UInt32}  all elements are characters; the NULL terminator is appended here
+
+@return     ::UTF16String
+@throws     ArgumentError
+"""
+=#
+function convert(::Type{UTF16String}, dat::Vector{UInt32})
+    len = sizeof(dat)
+    # handle zero length string quickly (a one-element vector still holds a character)
+    len == 0 && return empty_utf16
+    # get number of words to allocate
+    len, flags, num4byte = check_string_utf32(dat, len>>>2)
+    len += num4byte + 1
+    # optimized path, no surrogates
+    num4byte == 0 && @return_fast_utf_copy(UTF16String, UInt16, len, dat)
+    return encode_to_utf16(dat, len)
+end
+
+#=
+"""
+@brief      Converts a UTF32String to UTF16String
+
+@param[in]  str::UTF32String
+
+@return     ::UTF16String
+@throws     ArgumentError
+"""
+=#
+function convert(::Type{UTF16String}, str::UTF32String)
+    dat = reinterpret(UInt32, str.data)
+    len = sizeof(dat)
+    # handle zero length string quickly
+    len <= 4 && return empty_utf16
+    # get number of words to allocate
+    len, flags, num4byte = check_string_utf32(dat, len>>>2) # the terminator is checked and counted too
+    # optimized path, no surrogates
+    num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
+    return encode_to_utf16(dat, len + num4byte)
+end
+
+#=
+@doc """
+@brief      Encodes an already validated Vector of UInt32 as UTF-16
+
+@param[in]  dat         Vector{UInt32}
+@param[in]  len         length of output in 16-bit words, including the NULL terminator
+
+@return     ::UTF16String
+""" ->
+=#
+function encode_to_utf16(dat, len)
+    buf = Vector{UInt16}(len)
+    @inbounds buf[len] = 0 # NULL termination
+    out = 0
+    pos = 0
+    @inbounds while out < len - 1 # slot len already holds the terminator; never read dat for it
+        ch = UInt32(dat[pos += 1])
+        if ch > 0xffff
+            # Output surrogate pair for 0x10000-0x10ffff
+            buf[out += 1] = 0xd7c0 + (ch >>> 10)
+            ch = 0xdc00 + (ch & 0x3ff)
+        end
+        buf[out += 1] = ch
+    end
+    UTF16String(buf)
+end
+
+convert(::Type{UTF8String},  dat::Vector{Char})   = convert(UTF8String, reinterpret(UInt32, dat))
+
+utf16(x) = convert(UTF16String, x) # shorthand for conversion to UTF16String
+
+function convert(::Type{UTF16String}, str::ASCIIString)
+    dat = str.data
+    len = length(dat)+1 # room for the NULL terminator
+    @return_fast_utf_copy(UTF16String, UInt16, len, dat)
+end
+
+function convert(::Type{UTF32String}, str::ASCIIString)
+    dat = str.data
+    len = length(dat)+1 # room for the NULL terminator
+    @return_fast_utf_copy(UTF32String, Char, len, dat)
+end
+
+convert(::Type{UTF16String}, str::UTF16String)    = str
+convert(::Type{UTF16String}, dat::Vector{Char})   = convert(UTF16String, reinterpret(UInt32, dat))
+
+convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data # the string's own vector (terminator included), not a copy
+convert(::Type{Array{UInt16}},  str::UTF16String) = str.data # the string's own vector (terminator included), not a copy
+
+utf32(x) = convert(UTF32String, x) # shorthand for conversion to UTF32String
+
+convert(::Type{UTF32String}, str::UTF32String)    = str
+
+convert(::Type{UTF32String}, c::Char)             = UTF32String(Char[c, Char(0)])
+
+sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) # size in bytes, excluding the NULL terminator
+unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
+    convert(Ptr{T}, pointer(s))
+
+function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
+    idx = 1
+    total = length(data) # this may include NULL termination; that's okay
+    @inbounds while idx < total # check for unpaired surrogates
+        if !is_surrogate_char(data[idx])
+            idx += 1
+        elseif is_surrogate_lead(data[idx]) && is_surrogate_trail(data[idx+1])
+            idx += 2
+        else
+            return false
+        end
+    end
+    return idx > total || !is_surrogate_char(data[idx])
+end
+
+function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
+    !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
+    len = length(data)
+    @inbounds return UTF16String(setindex!(copy!(Vector{UInt16}(len+1),1,data,1,len),0,len+1))
+end
+
+convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
+    convert(T, reshape(data, length(data)))
+
+convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
+    convert(T, reinterpret(UInt16, data))
+
+function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
+    isempty(bytes) && return UTF16String(UInt16[0])
+    isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
+    data = reinterpret(UInt16, bytes)
+    # check for byte-order mark (BOM):
+    if data[1] == 0xfeff        # native byte order
+        d = Array(UInt16, length(data))
+        copy!(d,1, data,2, length(data)-1)
+    elseif data[1] == 0xfffe    # byte-swapped
+        d = Array(UInt16, length(data))
+        for i = 2:length(data)
+            d[i-1] = bswap(data[i])
+        end
+    else
+        d = Array(UInt16, length(data) + 1)
+        copy!(d,1, data,1, length(data)) # assume native byte order
+    end
+    d[end] = 0 # NULL terminate
+    !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
+    UTF16String(d)
+end
+
+utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
+utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
+function utf16(p::Union(Ptr{UInt16}, Ptr{Int16}))
+    len = 0
+    while unsafe_load(p, len+1) != 0; len += 1; end
+    utf16(p, len)
+end
+
+function convert(::Type{UTF32String}, data::AbstractVector{Char})
+    len = length(data)
+    @inbounds return UTF32String(setindex!(copy!(Vector{Char}(len+1),1,data,1,len),0,len+1))
+end
+
+convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) =
+    convert(UTF32String, reinterpret(Char, data))
+
+convert{T<:AbstractString}(::Type{T}, v::AbstractVector{Char}) = convert(T, utf32(v))
+
+# specialize for performance reasons:
+function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
+    s = IOBuffer(Array(UInt8,length(data)), true, true)
+    truncate(s,0)
+    for x in data
+        print(s, x)
+    end
+    convert(T, takebuf_string(s))
+end
+
+convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
+convert(::Type{Array{Char}},   s::UTF32String) = s.data
+
+reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
+
+sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
+unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
+    convert(Ptr{T}, pointer(s))
+
+function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
+    isempty(bytes) && return UTF32String(Char[0])
+    length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
+    data = reinterpret(Char, bytes)
+    # check for byte-order mark (BOM):
+    if data[1] == Char(0x0000feff) # native byte order
+        d = Array(Char, length(data))
+        copy!(d,1, data, 2, length(data)-1)
+    elseif data[1] == Char(0xfffe0000) # byte-swapped
+        d = Array(Char, length(data))
+        @inbounds for i = 2:length(data) ; d[i-1] = bswap(data[i]) ; end
+    else
+        d = Array(Char, length(data) + 1)
+        copy!(d, 1, data, 1, length(data)) # assume native byte order
+    end
+    d[end] = 0 # NULL terminate
+    UTF32String(d)
+end
+
+function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
+    for i=1:length(str)
+        @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
+    end
+    return true
+end
+isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
+
+utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
+utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
+function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32}))
+    len = 0
+    while unsafe_load(p, len+1) != 0; len += 1; end
+    utf32(p, len)
+end
+
+function map(f, s::UTF32String)
+    d = s.data
+    out = similar(d)
+    out[end] = 0
+
+    @inbounds for i = 1:(length(d)-1)
+        c2 = f(d[i])
+        if !isa(c2, Char)
+            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
+        end
+        out[i] = (c2::Char)
+    end
+    UTF32String(out)
+end
diff --git a/base/utf16.jl b/base/utf16.jl
deleted file mode 100644
index 59c1e37cc799a..0000000000000
--- a/base/utf16.jl
+++ /dev/null
@@ -1,155 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-immutable UTF16String <: AbstractString
-    data::Array{UInt16,1} # includes 16-bit NULL termination after string chars
-    function UTF16String(data::Vector{UInt16})
-        if length(data) < 1 || data[end] != 0
-            throw(ArgumentError("UTF16String data must be NULL-terminated"))
-        end
-        new(data)
-    end
-end
-
-utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
-utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
-utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
-utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)
-
-function length(s::UTF16String)
-    d = s.data
-    len = length(d) - 1
-    len == 0 && return 0
-    cnum = 0
-    for i = 1:len
-        @inbounds cnum += !utf16_is_trail(d[i])
-    end
-    cnum
-end
-
-function endof(s::UTF16String)
-    d = s.data
-    i = length(d) - 1
-    i == 0 && return i
-    utf16_is_surrogate(d[i]) ? i-1 : i
-end
-
-function next(s::UTF16String, i::Int)
-    if !utf16_is_surrogate(s.data[i])
-        return Char(s.data[i]), i+1
-    elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
-        return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
-    end
-    throw(ArgumentError("invalid UTF-16 character index"))
-end
-
-function reverseind(s::UTF16String, i::Integer)
-    j = length(s.data) - i
-    return Base.utf16_is_trail(s.data[j]) ? j-1 : j
-end
-
-lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
-
-function reverse(s::UTF16String)
-    d =s.data
-    out = similar(d)
-    out[end] = 0 # NULL termination
-    n = length(d)
-    for i = 1:n-1
-        out[i] = d[n-i]
-        if Base.utf16_is_lead(out[i])
-            out[i],out[i-1] = out[i-1],out[i]
-        end
-    end
-    return UTF16String(out)
-end
-
-# TODO: optimize this
-function encode16(s::AbstractString)
-    buf = UInt16[]
-    for ch in s
-        c = reinterpret(UInt32, ch)
-        if c < 0x10000
-            push!(buf, UInt16(c))
-        elseif c <= 0x10ffff
-            push!(buf, UInt16(0xd7c0 + (c>>10)))
-            push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
-        else
-            throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)"))
-        end
-    end
-    push!(buf, 0) # NULL termination
-    UTF16String(buf)
-end
-
-utf16(x) = convert(UTF16String, x)
-convert(::Type{UTF16String}, s::UTF16String) = s
-convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
-convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
-convert(::Type{Array{UInt16}}, s::UTF16String) = s.data
-
-# TODO: optimize this
-convert(::Type{UTF8String}, s::UTF16String) =
-    sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
-
-sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
-unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
-    convert(Ptr{T}, pointer(s))
-
-function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
-    i = 1
-    n = length(data) # this may include NULL termination; that's okay
-    while i < n # check for unpaired surrogates
-        if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
-            i += 2
-        elseif utf16_is_surrogate(data[i])
-            return false
-        else
-            i += 1
-        end
-    end
-    return i > n || !utf16_is_surrogate(data[i])
-end
-
-function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
-    !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
-    len = length(data)
-    d = Array(UInt16, len + 1)
-    d[end] = 0 # NULL terminate
-    UTF16String(copy!(d,1, data,1, len))
-end
-
-convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
-    convert(T, reshape(data, length(data)))
-
-convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
-    convert(T, reinterpret(UInt16, data))
-
-function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
-    isempty(bytes) && return UTF16String(UInt16[0])
-    isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
-    data = reinterpret(UInt16, bytes)
-    # check for byte-order mark (BOM):
-    if data[1] == 0xfeff        # native byte order
-        d = Array(UInt16, length(data))
-        copy!(d,1, data,2, length(data)-1)
-    elseif data[1] == 0xfffe    # byte-swapped
-        d = Array(UInt16, length(data))
-        for i = 2:length(data)
-            d[i-1] = bswap(data[i])
-        end
-    else
-        d = Array(UInt16, length(data) + 1)
-        copy!(d,1, data,1, length(data)) # assume native byte order
-    end
-    d[end] = 0 # NULL terminate
-    !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
-    UTF16String(d)
-end
-
-utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
-utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
-function utf16(p::Union(Ptr{UInt16}, Ptr{Int16}))
-    len = 0
-    while unsafe_load(p, len+1) != 0; len += 1; end
-    utf16(p, len)
-end
diff --git a/base/utf32.jl b/base/utf32.jl
deleted file mode 100644
index 419e104e33dfb..0000000000000
--- a/base/utf32.jl
+++ /dev/null
@@ -1,118 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-## UTF-32 in the native byte order, i.e. plain old character arrays ##
-
-immutable UTF32String <: DirectIndexString
-    data::Vector{Char} # includes 32-bit NULL termination after string chars
-
-    function UTF32String(a::Vector{Char})
-        if length(a) < 1 || a[end] != Char(0)
-            throw(ArgumentError("UTF32String data must be NULL-terminated"))
-        end
-        new(a)
-    end
-end
-UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))
-
-next(s::UTF32String, i::Int) = (s.data[i], i+1)
-endof(s::UTF32String) = length(s.data) - 1
-length(s::UTF32String) = length(s.data) - 1
-
-utf32(x) = convert(UTF32String, x)
-convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
-convert(::Type{UTF32String}, s::UTF32String) = s
-
-function convert(::Type{UTF32String}, s::AbstractString)
-    a = Array(Char, length(s) + 1)
-    i = 0
-    for c in s
-        a[i += 1] = c
-    end
-    a[end] = Char(0) # NULL terminate
-    UTF32String(a)
-end
-
-function convert(::Type{UTF32String}, data::AbstractVector{Char})
-    len = length(data)
-    d = Array(Char, len + 1)
-    d[end] = Char(0) # NULL terminate
-    UTF32String(copy!(d,1, data,1, len))
-end
-
-convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) =
-    convert(UTF32String, reinterpret(Char, data))
-
-convert{T<:AbstractString}(::Type{T}, v::AbstractVector{Char}) = convert(T, utf32(v))
-
-# specialize for performance reasons:
-function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
-    s = IOBuffer(Array(UInt8,length(data)), true, true)
-    truncate(s,0)
-    for x in data
-        print(s, x)
-    end
-    convert(T, takebuf_string(s))
-end
-
-convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
-convert(::Type{Array{Char}}, s::UTF32String) = s.data
-
-reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
-
-sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
-unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
-    convert(Ptr{T}, pointer(s))
-
-function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
-    isempty(bytes) && return UTF32String(Char[0])
-    length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
-    data = reinterpret(Char, bytes)
-    # check for byte-order mark (BOM):
-    if data[1] == Char(0x0000feff) # native byte order
-        d = Array(Char, length(data))
-        copy!(d,1, data, 2, length(data)-1)
-    elseif data[1] == Char(0xfffe0000) # byte-swapped
-        d = Array(Char, length(data))
-        for i = 2:length(data)
-            d[i-1] = bswap(data[i])
-        end
-    else
-        d = Array(Char, length(data) + 1)
-        copy!(d, 1, data, 1, length(data)) # assume native byte order
-    end
-    d[end] = Char(0) # NULL terminate
-    UTF32String(d)
-end
-
-function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
-    for i=1:length(str)
-        @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
-    end
-    return true
-end
-isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
-isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
-isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
-
-utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
-utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
-function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32}))
-    len = 0
-    while unsafe_load(p, len+1) != 0; len += 1; end
-    utf32(p, len)
-end
-
-function map(f, s::UTF32String)
-    d = s.data
-    out = similar(d)
-    out[end] = Char(0)
-
-    for i = 1:(length(d)-1)
-        c2 = f(d[i])
-        if !isa(c2, Char)
-            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
-        end
-        out[i] = (c2::Char)
-    end
-    UTF32String(out)
-end
diff --git a/test/strings.jl b/test/strings.jl
index b8f1a42f76983..890bc74c84226 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -1638,3 +1638,152 @@ d = UTF32String(c)
 c[1] = 'A'
 @test d=="A"
 
+# issue #11004 (#10959)
+
+function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
+    @test utf16(strUTF8) == strUTF16
+    @test utf32(strUTF8) == strUTF32
+    @test utf8(strUTF16) == strUTF8
+    @test utf32(strUTF16) == strUTF32
+    @test utf8(strUTF32)  == strUTF8
+    @test utf16(strUTF32) == strUTF16
+end
+
+# Create some ASCII, UTF8, UTF16, and UTF32 strings
+strAscii = "abcdefgh"
+strA_UTF8 = ("abcdefgh\uff")[1:8]
+strL_UTF8 = "abcdef\uff\uff"
+str2_UTF8 = "abcd\uff\uff\u7ff\u7ff"
+str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
+str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
+strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
+strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
+strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80")
+strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0")
+
+strA_UTF16 = utf16(strA_UTF8)
+strL_UTF16 = utf16(strL_UTF8)
+str2_UTF16 = utf16(str2_UTF8)
+str3_UTF16 = utf16(str3_UTF8)
+str4_UTF16 = utf16(str4_UTF8)
+strS_UTF16 = utf16(strS_UTF8)
+strA_UTF32 = utf32(strA_UTF8)
+strL_UTF32 = utf32(strL_UTF8)
+str2_UTF32 = utf32(str2_UTF8)
+str3_UTF32 = utf32(str3_UTF8)
+str4_UTF32 = utf32(str4_UTF8)
+strS_UTF32 = utf32(strS_UTF8)
+@test utf8(strAscii) == strAscii
+@test utf16(strAscii) == strAscii
+@test utf32(strAscii) == strAscii
+tstcvt(strA_UTF8,strA_UTF16,strA_UTF32)
+tstcvt(strL_UTF8,strL_UTF16,strL_UTF32)
+tstcvt(str2_UTF8,str2_UTF16,str2_UTF32)
+tstcvt(str3_UTF8,str3_UTF16,str3_UTF32)
+tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
+# Test converting surrogate pairs
+@test utf16(strS_UTF8) == strC_UTF8
+@test utf32(strS_UTF8) == strC_UTF8
+@test utf8(strS_UTF16) == strC_UTF8
+@test utf32(strS_UTF16) == strC_UTF8
+@test utf8(strS_UTF32)  == strC_UTF8
+@test utf16(strS_UTF32) == strC_UTF8
+
+# Test converting overlong \0
+# @test utf8(strZ_UTF8)  == strz_UTF8   # currently broken! (in utf8.jl)
+@test utf16(strZ_UTF8) == strz_UTF8
+@test utf32(strZ_UTF8) == strz_UTF8
+
+# Test invalid sequences
+
+byt = 0x0
+for T in (UTF16String, UTF32String)
+    try
+    # Continuation byte not after lead
+    for byt in 0x80:0xbf
+        @test_throws ArgumentError convert(T,  UTF8String(UInt8[byt]))
+    end
+
+    # Test lead bytes
+    for byt in 0xc0:0xff
+        # Single lead byte at end of string
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt]))
+        # Lead followed by non-continuation character < 0x80
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0]))
+        # Lead followed by non-continuation character > 0xbf
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0xc0]))
+    end
+
+    # Test overlong 2-byte
+    for byt in 0x81:0xbf
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[0xc0,byt]))
+    end
+    for byt in 0x80:0xbf
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[0xc1,byt]))
+    end
+
+    # Test overlong 3-byte
+    for byt in 0x80:0x9f
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[0xe0,byt,0x80]))
+    end
+
+    # Test overlong 4-byte (lead 0xf0 with second byte 0x80-0x8f encodes a value < 0x10000)
+    for byt in 0x80:0x8f
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[0xf0,byt,0x80,0x80]))
+    end
+
+    # Test 4-byte > 0x10ffff
+    for byt in 0x90:0xbf
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80]))
+    end
+    for byt in 0xf5:0xf7
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80]))
+    end
+
+    # Test 5-byte
+    for byt in 0xf8:0xfb
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80]))
+    end
+
+    # Test 6-byte
+    for byt in 0xfc:0xfd
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
+    end
+
+    # Test 7-byte
+    @test_throws ArgumentError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
+
+    # Three and above byte sequences
+    for byt in 0xe0:0xef
+        # Lead followed by only 1 continuation byte
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80]))
+        # Lead ended by non-continuation character < 0x80
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0]))
+        # Lead ended by non-continuation character > 0xbf
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0xc0]))
+    end
+
+    # 3-byte encoded surrogate character(s)
+    # Single surrogate
+    @test_throws ArgumentError convert(T, UTF8String(UInt8[0xed,0xa0,0x80]))
+    # Not followed by surrogate
+    @test_throws ArgumentError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
+    # Trailing surrogate first
+    @test_throws ArgumentError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
+    # Followed by lead surrogate
+    @test_throws ArgumentError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
+
+    # Four byte sequences
+    for byt in 0xf0:0xf4
+        # Lead followed by only 2 continuation bytes
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80]))
+        # Lead followed by non-continuation character < 0x80
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0]))
+        # Lead followed by non-continuation character > 0xbf
+        @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0]))
+    end
+    catch exp ;
+        println("Error checking $T: $byt")
+        throw(exp)
+    end
+end

From 982cfbc1c626850e73eaa859a89f9b40848c979c Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Sat, 30 May 2015 14:28:58 +0200
Subject: [PATCH 2/3] Make changes based on various reviewers' good advice

---
 base/utf.jl | 291 +++++++++++++++++++++++++---------------------------
 1 file changed, 139 insertions(+), 152 deletions(-)

diff --git a/base/utf.jl b/base/utf.jl
index b170973406b43..1e70d8a58fc79 100644
--- a/base/utf.jl
+++ b/base/utf.jl
@@ -23,10 +23,10 @@ const errMsgs = [
     "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)",
     "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
     "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
-    "not a leading Unicode surrogate character at index <<1>> (0x<<2>>)",
-    "not a trailing Unicode surrogate character at index <<1>> (0x<<2>>)",
-    "not a valid Unicode surrogate character at index <<1>> (0x<<2>>",
-    "missing trailing Unicode surrogate character after index <<1>> (0x<<2>>)",
+    "not a leading Unicode surrogate codepoint at index <<1>> (0x<<2>>)",
+    "not a trailing Unicode surrogate codepoint at index <<1>> (0x<<2>>)",
+    "not a valid Unicode surrogate codepoint at index <<1>> (0x<<2>>)",
+    "missing trailing Unicode surrogate codepoint after index <<1>> (0x<<2>>)",
     "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
     "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
     "UTF16String data must be NULL-terminated",
@@ -87,7 +87,7 @@ const empty_utf32 = UTF32String(UInt32[0])
 
 is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800)
 is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00)
-is_surrogate_char(c::Unsigned) = ((c & ~0x007ff) == 0xd800)
+is_surrogate_codepoint(c::Unsigned) = ((c & ~0x007ff) == 0xd800)
 is_valid_continuation(c) = ((c & 0xc0) == 0x80)
 
 function length(s::UTF16String)
@@ -105,14 +105,14 @@ function endof(s::UTF16String)
     d = s.data
     i = length(d) - 1
     i == 0 && return i
-    return is_surrogate_char(d[i]) ? i-1 : i
+    return is_surrogate_codepoint(d[i]) ? i-1 : i
 end
 
 get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)
 
 function next(s::UTF16String, i::Int)
     ch = s.data[i]
-    !is_surrogate_char(ch) && return (Char(ch), i+1)
+    !is_surrogate_codepoint(ch) && return (Char(ch), i+1)
     # check length, account for terminating \0
     i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))
     !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch)
@@ -160,15 +160,14 @@ const UTF_UNICODE4 = 16         # non-BMP characters present
 const UTF_SURROGATE = 32        # surrogate pairs present
 
 # Get a UTF-8 continuation byte, give error if invalid, and update position and character value
-@inline function get_continuation(ch::UInt32, str, pos)
-    byt::UInt8 = str[pos += 1]
+@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
     !is_valid_continuation(byt) && utf_errfunc(UTF_ERR_CONT, pos, byt)
-    (ch << 6) | (byt & 0x3f), pos
+    (ch << 6) | (byt & 0x3f)
 end
 
 #=
 @doc """
-@brief      Validates and calculates number of characters in a string
+@brief      Validates and calculates number of characters in a UTF-8 encoded vector of UInt8
 
 @param[in]  str     Vector of UInt8
 @param[in]  options flags to determine error handling (default 0)
@@ -178,9 +177,9 @@ end
 """ ->
 =#
 function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
-    local byt::UInt8
-    local ch::UInt32, surr::UInt32
-    local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0
+    local byt::UInt8, ch::UInt32, surr::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
     pos = 0
     len = sizeof(dat)
     @inbounds while pos < len
@@ -191,7 +190,7 @@ function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
             if ch < 0xe0
                 # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                 (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
-                ch, pos = get_continuation(ch & 0x3f, dat, pos)
+                ch = get_continuation(ch & 0x3f, dat[pos += 1], pos)
                 if ch > 0x7f
                     num2byte += 1
                     flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
@@ -205,16 +204,17 @@ function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
              elseif ch < 0xf0
                 # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                 (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
-                ch, pos = get_continuation(ch & 0x0f, dat, pos)
-                ch, pos = get_continuation(ch, dat, pos)
+                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
                 # check for surrogate pairs, make sure correct
-                if is_surrogate_char(ch)
+                if is_surrogate_codepoint(ch)
                     !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
                     # next character *must* be a trailing surrogate character
                     (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
-                    byt = dat[pos += 1] ; (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
-                    surr, pos = get_continuation(0x0000d, dat, pos)
-                    surr, pos = get_continuation(surr, dat, pos)
+                    byt = dat[pos += 1]
+                    (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
+                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
+                    surr = get_continuation(surr, dat[pos += 1], pos)
                     !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
                     (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
                     flags |= UTF_SURROGATE
@@ -230,14 +230,14 @@ function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
             elseif ch < 0xf5
                 # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                 (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
-                ch, pos = get_continuation(ch & 0x07, dat, pos)
-                ch, pos = get_continuation(ch, dat, pos)
-                ch, pos = get_continuation(ch, dat, pos)
+                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
                 if ch > 0x10ffff
                     utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
                 elseif ch > 0xffff
                     num4byte += 1
-                elseif is_surrogate_char(ch)
+                elseif is_surrogate_codepoint(ch)
                     utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
                 elseif (options & UTF_ACCEPT_LONG) != 0
                     # This is an overly long encode character
@@ -255,12 +255,14 @@ function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
             end
         end
     end
-    totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
 end
 
 #=
 @doc """
-@brief      Validates and calculates number of characters in a UTF-16 string
+@brief      Validates and calculates number of characters in a UTF-16 encoded vector of UInt16
 
 @param[in]  dat     Vector{UInt16}
 @param[in]  options flags to determine error handling (default 0)
@@ -271,8 +273,9 @@ end
 =#
 function check_string_utf16(dat::Vector{UInt16}, len::Int)
     local ch::UInt32
-    local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0
-    local pos = 0
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos = 0
     @inbounds while pos < len
         ch = dat[pos += 1]
         totalchar += 1
@@ -283,7 +286,7 @@ function check_string_utf16(dat::Vector{UInt16}, len::Int)
             elseif ch < 0x800
                 num2byte += 1
                 flags |= UTF_UNICODE2
-            elseif !is_surrogate_char(ch)
+            elseif !is_surrogate_codepoint(ch)
                 num3byte += 1
             elseif is_surrogate_lead(ch)
                 pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
@@ -296,12 +299,14 @@ function check_string_utf16(dat::Vector{UInt16}, len::Int)
             end
         end
     end
-    totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
 end
 
 #=
 @doc """
-@brief      Validates and calculates number of characters in a UTF-32 string
+@brief      Validates and calculates number of characters in a UTF-32 encoded vector of UInt32
 
 @param[in]  dat     Vector{UInt32}
 @param[in]  options flags to determine error handling (default 0)
@@ -312,8 +317,9 @@ end
 =#
 function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
     local ch::UInt32
-    local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0
-    local pos = 0
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos = 0
     @inbounds while pos < len
         ch = dat[pos += 1]
         totalchar += 1
@@ -327,7 +333,7 @@ function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
             elseif ch > 0xffff
                 (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
                 num4byte += 1
-            elseif !is_surrogate_char(ch)
+            elseif !is_surrogate_codepoint(ch)
                 num3byte += 1
             elseif is_surrogate_lead(ch)
                 pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
@@ -342,14 +348,17 @@ function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
             end
         end
     end
-    totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
 end
 
 function check_string_abs(str::AbstractString, options::Integer=0)
     local ch::UInt32
-    local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0
-    local pos = start(str)
-    local len = endof(str)
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos = start(str)
+    len = endof(str)
     @inbounds while pos < len
         ch, pos = next(str, pos)
         totalchar += 1
@@ -363,7 +372,7 @@ function check_string_abs(str::AbstractString, options::Integer=0)
             elseif ch > 0xffff
                 (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
                 num4byte += 1
-            elseif !is_surrogate_char(ch)
+            elseif !is_surrogate_codepoint(ch)
                 num3byte += 1
             elseif is_surrogate_lead(ch)
                 pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
@@ -378,84 +387,39 @@ function check_string_abs(str::AbstractString, options::Integer=0)
             end
         end
     end
-    totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
 end
 
 # Quickly copy and set trailing \0
-macro return_fast_utf_copy(T1, T2, len, dat)
-    quote
-        @inbounds return $(esc(T1))(setindex!(copy!(Vector{$(esc(T2))}($(esc(len))), $(esc(dat))), 0, $(esc(len))))
-    end
+@inline function fast_utf_copy(T::Type{UInt16}, len, dat)
+    @inbounds return UTF16String(setindex!(copy!(Vector{T}(len), dat), 0, len))
 end
-
-# Get rest of character ch from 2-byte UTF-8 sequence in str, update pos and return character
-macro get_utf8_2!(str, pos, ch)
-    quote
-        (($(esc(ch)) & 0x1f) << 6) | ($(esc(str))[$(esc(pos)) += 1] & 0x3f)
-    end
+@inline function fast_utf_copy(T::Type{Char}, len, dat)
+    @inbounds return UTF32String(setindex!(copy!(Vector{T}(len), dat), 0, len))
 end
 
-# Get rest of character ch from 3-byte UTF-8 sequence in str, update pos and return character
-macro get_utf8_3!(str, pos, ch)
-    quote
-        ($(esc(pos)) += 2 ;
-         (($(esc(ch)) & 0xf) << 12)
-          | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6)
-          | ($(esc(str))[$(esc(pos))] & 0x3f))
-    end
+# Get rest of character ch from 3-byte UTF-8 sequence in str
+@inline function get_utf8_3(dat, pos, ch)
+    @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
 end
 
-# Get rest of character ch from 4-byte UTF-8 sequence in str, update pos and return character
-macro get_utf8_4!(str, pos, ch)
-    quote
-        ($(esc(pos)) += 3 ;
-         (($(esc(ch)) & 0x7) << 18)
-          | (UInt32($(esc(str))[$(esc(pos))-2] & 0x3f) << 12)
-          | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6)
-          | ($(esc(str))[$(esc(pos))] & 0x3f))
-    end
+# Get rest of character ch from 4-byte UTF-8 sequence in dat, update pos and return character
+@inline function get_utf8_4(dat, pos, ch)
+    @inbounds return (((ch & 0x7) << 18)
+                        | (UInt32(dat[pos-2] & 0x3f) << 12)
+                        | (UInt32(dat[pos-1] & 0x3f) << 6)
+                        | (dat[pos] & 0x3f))
 end
 
-# Get the trailing surrogate character in UTF-8 from an array, update the position
-macro get_utf8_surr!(str, pos)
-    quote
-        ($(esc(pos)) += 3 ;
-         ((UInt32($(esc(str))[$(esc(pos))-2] & 0xf) << 12)
-          | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6)
-          | ($(esc(str))[$(esc(pos))] & 0x3f)))
-    end
-end
-
-# Output a character as a 2-byte UTF-8 sequence, update the position
-macro output_utf8_2!(buf, out, ch)
-    quote
-        $(esc(buf))[$(esc(out)) += 1] = 0xc0 | ($(esc(ch)) >>> 6)
-        $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f)
-    end
-end
-# Output a character as a 3-byte UTF-8 sequence, update the position
-macro output_utf8_3!(buf, out, ch)
-    quote
-        $(esc(buf))[$(esc(out)) += 1] = 0xe0 | (($(esc(ch)) >>> 12) & 0x3f)
-        $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 6) & 0x3f)
-        $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f)
-    end
-end
 # Output a character as a 4-byte UTF-8 sequence, update the position
-macro output_utf8_4!(buf, out, ch)
-    quote
-        $(esc(buf))[$(esc(out)) += 1] = 0xf0 | ($(esc(ch)) >>> 18)
-        $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 12) & 0x3f)
-        $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 6) & 0x3f)
-        $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f)
-    end
-end
-
-# Output a UTF-16 surrogate pair, update the position
-macro output_utf16_surr!(buf, out, ch)
-    quote
-        $(esc(buf))[$(esc(out)) += 1] = UInt16(0xd7c0 + ($(esc(ch)) >>> 10))
-        $(esc(buf))[$(esc(out)) += 1] = UInt16(0xdc00 + ($(esc(ch)) & 0x3ff))
+@inline function output_utf8_4(buf, out, ch)
+    @inbounds begin
+        buf[out + 1] = 0xf0 | (ch >>> 18)
+        buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
+        buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f)
+        buf[out + 4] = 0x80 | (ch & 0x3f)
     end
 end
 
@@ -475,11 +439,13 @@ function convert(::Type{UTF16String}, str::AbstractString)
     buf = Vector{UInt16}(len+num4byte+1)
     out = 0
     @inbounds for ch in str
-        c = reinterpret(UInt32, ch)
+        c = UInt32(ch)
         if c < 0x10000
             buf[out += 1] = UInt16(c)
         else
-            @output_utf16_surr!(buf, out, c)
+            # output surrogate pair (use the UInt32 value c; ch is still a Char here)
+            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10))
+            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
         end
     end
     @inbounds buf[out + 1] = 0 # NULL termination
@@ -508,9 +474,10 @@ end
 
 #=
 @doc """
-@brief      Converts a UTF-8 encoded string to UTF-16 encoding
+@brief      Converts a UTF8String to a UTF16String
 
-@param[in]  str::Vector{UInt8}
+@param[in]  ::Type{UTF16String}
+@param[in]  str::UTF8String
 
 @return     ::UTF16String
 @throws     ArgumentError
@@ -524,11 +491,11 @@ function convert(::Type{UTF16String}, str::UTF8String)
     len, flags, num4byte = check_string_utf8(dat)
     len += num4byte
     buf = Vector{UInt16}(len+1)
-    buf[len+1] = 0
+    @inbounds buf[len+1] = 0
     # Optimize case where no characters > 0x7f
     flags == 0 && @inbounds return UTF16String(copy!(buf, dat))
-    out::UInt = 0
-    pos::UInt = 0
+    out = 0
+    pos = 0
     @inbounds while out < len
         ch::UInt32 = dat[pos += 1]
         # Handle ASCII characters
@@ -536,14 +503,18 @@ function convert(::Type{UTF16String}, str::UTF8String)
             buf[out += 1] = ch
         # Handle range 0x80-0x7ff
         elseif ch < 0xe0
-            buf[out += 1] = @get_utf8_2!(dat, pos, ch)
+            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
         # Handle range 0x800-0xffff
         elseif ch < 0xf0
-            buf[out += 1] = @get_utf8_3!(dat, pos, ch)
+            pos += 2
+            buf[out += 1] = get_utf8_3(dat, pos, ch)
         # Handle range 0x10000-0x10ffff
         else
-            ch = @get_utf8_4!(dat, pos, ch)
-            @output_utf16_surr!(buf, out, ch)
+            pos += 3
+            ch = get_utf8_4(dat, pos, ch)
+            # output surrogate pair
+            buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
+            buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
         end
     end
     UTF16String(buf)
@@ -551,9 +522,10 @@ end
 
 #=
 @doc """
-@brief      Reencodes a UTF-16 or UTF-32 encoded string using UTF-8 encoding
+@brief      Converts a UTF-16 encoded vector of UInt16 to a UTF8String
 
-@param[in]  str::Union(Vector{UInt16}, Vector{UInt32})
+@param[in]  ::Type{UTF8String}
+@param[in]  dat::Vector{UInt16}
 
 @return     ::UTF8String
 @throws     ArgumentError
@@ -573,6 +545,7 @@ end
 @doc """
 @brief      Converts a UTF16String to a UTF8String
 
+@param[in]  ::Type{UTF8String}
 @param[in]  str::UTF16String
 
 @return     ::UTF8String
@@ -592,8 +565,9 @@ end
 
 #=
 @doc """
-@brief      Encodes a vector of UInt32 to a UTF8String
+@brief      Encodes a UTF-32 encoded vector of UInt32 to a UTF8String
 
+@param[in]  ::Type{UTF8String}
 @param[in]  dat::Vector{UInt32}
 
 @return     ::UTF8String
@@ -614,6 +588,7 @@ end
 @doc """
 @brief      Converts a UTF32String to a UTF8String
 
+@param[in]  ::Type{UTF8String}
 @param[in]  str::UTF32String
 
 @return     ::UTF8String
@@ -633,7 +608,7 @@ end
 
 #=
 @doc """
-@brief      Encodes an already validated vector of UInt16 or UInt32 as UTF-8
+@brief      Converts an already validated vector of UInt16 or UInt32 to a UTF8String
 
 @param[in]  T           type (UInt16 or UInt32)
 @param[in]  dat         Vector{T}
@@ -644,8 +619,8 @@ end
 =#
 function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
     buf = Vector{UInt8}(len)
-    out::UInt = 0
-    pos::UInt = 0
+    out = 0
+    pos = 0
     @inbounds while out < len
         ch::UInt32 = dat[pos += 1]
         # Handle ASCII characters
@@ -653,17 +628,21 @@ function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
             buf[out += 1] = ch
         # Handle 0x80-0x7ff
         elseif ch < 0x800
-            @output_utf8_2!(buf, out, ch)
+            buf[out += 1] = 0xc0 | (ch >>> 6)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
         # Handle 0x10000-0x10ffff (if input is UInt32)
         elseif T == UInt32 && ch > 0xffff
-            @output_utf8_4!(buf, out, ch)
+            output_utf8_4(buf, out, ch)
+            out += 4
         # Handle surrogate pairs
-        elseif is_surrogate_char(ch)
-            ch = get_supplementary(ch, dat[pos += 1])
-            @output_utf8_4!(buf, out, ch)
+        elseif is_surrogate_codepoint(ch)
+            output_utf8_4(buf, out, get_supplementary(ch, dat[pos += 1]))
+            out += 4
         # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
         else
-            @output_utf8_3!(buf, out, ch)
+            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
+            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
         end
     end
     UTF8String(buf)
@@ -671,9 +650,10 @@ end
 
 #=
 """
-@brief      Converts a UTF-8 encoded string to UTF-32 encoding
+@brief      Converts a UTF8String to a UTF32String
 
-@param[in]  dat::Vector{UInt8}
+@param[in]  ::Type{UTF32String}
+@param[in]  str::UTF8String
 
 @return     ::UTF32String
 @throws     ArgumentError
@@ -687,11 +667,11 @@ function convert(::Type{UTF32String}, str::UTF8String)
     len, flags = check_string_utf8(dat)
     # Optimize case where no characters > 0x7f
     totlen = len+1
-    flags == 0 && @return_fast_utf_copy(UTF32String, Char, totlen, dat)
+    flags == 0 && return fast_utf_copy(Char, totlen, dat)
     # has multi-byte UTF-8 sequences
     buf = Vector{Char}(totlen)
     @inbounds buf[totlen] = 0 # NULL termination
-    local ch::UInt32
+    local ch::UInt32, surr::UInt32
     out = 0
     pos = 0
     @inbounds while out < len
@@ -701,19 +681,25 @@ function convert(::Type{UTF32String}, str::UTF8String)
             buf[out += 1] = ch
         # Handle range 0x80-0x7ff
         elseif ch < 0xe0
-            buf[out += 1] = @get_utf8_2!(dat, pos, ch)
+            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
         # Handle range 0x800-0xffff
         elseif ch < 0xf0
-            ch = @get_utf8_3!(dat, pos, ch)
+            pos += 2
+            ch = get_utf8_3(dat, pos, ch)
             # Handle surrogate pairs (should have been encoded in 4 bytes)
             if is_surrogate_lead(ch)
                 # Build up 32-bit character from ch and trailing surrogate in next 3 bytes
-                ch = get_supplementary(ch, @get_utf8_surr!(dat, pos))
+                pos += 3
+                surr = ((UInt32(dat[pos-2] & 0xf) << 12)
+                        | (UInt32(dat[pos-1] & 0x3f) << 6)
+                        | (dat[pos] & 0x3f))
+                ch = get_supplementary(ch, surr)
             end
             buf[out += 1] = ch
         # Handle range 0x10000-0x10ffff
         else
-            buf[out += 1] = @get_utf8_4!(dat, pos, ch)
+            pos += 3
+            buf[out += 1] = get_utf8_4(dat, pos, ch)
         end
     end
     UTF32String(buf)
@@ -723,6 +709,7 @@ end
 """
 @brief      Converts a UTF16String to UTF32String
 
+@param[in]  ::Type{UTF32String}
 @param[in]  str::UTF16String
 
 @return     ::UTF32String
@@ -753,8 +740,9 @@ end
 
 #=
 """
-@brief      Converts a Vector of UInt32 to a UTF16String
+@brief      Converts a UTF-32 encoded vector of UInt32 to a UTF16String
 
+@param[in]  ::Type{UTF16String}
 @param[in]  dat::Vector{UInt32}
 
 @return     ::UTF16String
@@ -769,7 +757,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32})
     len, flags, num4byte = check_string_utf32(dat, len>>>2)
     len += num4byte + 1
     # optimized path, no surrogates
-    num4byte == 0 && @return_fast_utf_copy(UTF16String, UInt16, len, dat)
+    num4byte == 0 && return fast_utf_copy(UInt16, len, dat)
     return encode_to_utf16(dat, len)
 end
 
@@ -777,6 +765,7 @@ end
 """
 @brief      Converts a UTF32String to UTF16String
 
+@param[in]  ::Type{UTF16String}
 @param[in]  str::UTF32String
 
 @return     ::UTF16String
@@ -797,10 +786,10 @@ end
 
 #=
 @doc """
-@brief      Encodes an already validated Vector of UInt32 as UTF-16
+@brief      Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String
 
-@param[in]  dat         Vector{UInt32}
-@param[in]  len         length of output in 16-bit words
+@param[in]  dat::Vector{UInt32} UTF-32 encoded data
+@param[in]  len                 length of output in 16-bit words
 
 @return     ::UTF16String
 """ ->
@@ -828,14 +817,12 @@ utf16(x) = convert(UTF16String, x)
 
 function convert(::Type{UTF16String}, str::ASCIIString)
     dat = str.data
-    len = length(dat)+1
-    @return_fast_utf_copy(UTF16String, UInt16, len, dat)
+    fast_utf_copy(UInt16, length(dat)+1, dat)
 end
 
 function convert(::Type{UTF32String}, str::ASCIIString)
     dat = str.data
-    len = length(dat)+1
-    @return_fast_utf_copy(UTF32String, Char, len, dat)
+    fast_utf_copy(Char, length(dat)+1, dat)
 end
 
 convert(::Type{UTF16String}, str::UTF16String)    = str
@@ -860,13 +847,13 @@ function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
     @inbounds while i < n # check for unpaired surrogates
         if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
             i += 2
-        elseif is_surrogate_char(data[i])
+        elseif is_surrogate_codepoint(data[i])
             return false
         else
             i += 1
         end
     end
-    return i > n || !is_surrogate_char(data[i])
+    return i > n || !is_surrogate_codepoint(data[i])
 end
 
 function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
@@ -931,8 +918,8 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
     convert(T, takebuf_string(s))
 end
 
-convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
-convert(::Type{Array{Char}},   s::UTF32String) = s.data
+convert(::Type{Vector{Char}}, str::UTF32String) = str.data
+convert(::Type{Array{Char}},  str::UTF32String) = str.data
 
 reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
 
@@ -961,7 +948,7 @@ end
 
 function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
     for i=1:length(str)
-        @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
+        @inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
     end
     return true
 end

From b159907373f507d1b77d3c2fb4de4cbb8f639a18 Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Tue, 2 Jun 2015 12:40:51 +0200
Subject: [PATCH 3/3] Reorganize UTF handling files

---
 base/sysimg.jl                 |   7 +-
 base/utf16.jl                  |  72 ++++++
 base/utf32.jl                  |  33 +++
 base/utfcheck.jl               | 255 +++++++++++++++++++
 base/{utf.jl => utfconvert.jl} | 436 +--------------------------------
 base/utferror.jl               |  51 ++++
 base/utftype.jl                |  39 +++
 7 files changed, 457 insertions(+), 436 deletions(-)
 create mode 100644 base/utf16.jl
 create mode 100644 base/utf32.jl
 create mode 100644 base/utfcheck.jl
 rename base/{utf.jl => utfconvert.jl} (51%)
 create mode 100644 base/utferror.jl
 create mode 100644 base/utftype.jl

diff --git a/base/sysimg.jl b/base/sysimg.jl
index b44340b0216a9..a8a6b1fb706bb 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -84,10 +84,15 @@ include("iterator.jl")
 include("osutils.jl")
 
 # strings & printing
+include("utferror.jl")
+include("utftype.jl")
+include("utfcheck.jl")
 include("char.jl")
 include("ascii.jl")
 include("utf8.jl")
-include("utf.jl")
+include("utf16.jl")
+include("utf32.jl")
+include("utfconvert.jl")
 include("iobuffer.jl")
 include("string.jl")
 include("utf8proc.jl")
diff --git a/base/utf16.jl b/base/utf16.jl
new file mode 100644
index 0000000000000..dd3358f36dd9c
--- /dev/null
+++ b/base/utf16.jl
@@ -0,0 +1,72 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+function length(s::UTF16String)
+    units = s.data
+    nunits = length(units) - 1 # data carries a trailing NULL terminator
+    nchars = 0
+    # every code unit starts a character, except the trail half of a surrogate pair
+    @inbounds for idx = 1:nunits
+        nchars += !is_surrogate_trail(units[idx])
+    end
+    nchars
+end
+
+function endof(s::UTF16String)
+    units = s.data
+    fin = length(units) - 1 # index of the last code unit, ignoring the terminator
+    fin == 0 && return 0
+    return is_surrogate_codepoint(units[fin]) ? fin-1 : fin # a final pair is indexed by its lead
+end
+
+get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) # surrogate pair -> code point (0xd800,0xdc00 gives 0x10000)
+
+function next(s::UTF16String, i::Int)
+    ch = s.data[i]
+    !is_surrogate_codepoint(ch) && return (Char(ch), i+1)
+    # a surrogate here must be a lead, followed by a trail before the terminating \0
+    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch)
+    i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))
+    ct = s.data[i+1]
+    !is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i+1, ct) # report the offending unit
+    Char(get_supplementary(ch, ct)), i+2
+end
+
+function reverseind(s::UTF16String, i::Integer)
+    units = s.data; idx = length(units) - i
+    return idx - is_surrogate_trail(units[idx]) # step back from a trail onto its lead
+end
+
+lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
+
+function reverse(s::UTF16String)
+    d = s.data
+    out = similar(d)
+    out[end] = 0 # NULL termination
+    n = length(d)
+    @inbounds for i = 1:n-1
+        ch = d[n-i] # read the source back to front, starting just before the terminator
+        if is_surrogate_lead(ch)
+            out[i],out[i-1] = out[i-1],ch # swap with the unit written just before (the trail, in well-formed data) so the pair stays lead-first
+        else
+            out[i] = ch
+        end
+    end
+    UTF16String(out)
+end
+
+sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
+
+function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
+    idx, total = 1, length(data) # total may count a NULL terminator; that's okay
+    @inbounds while idx < total
+        unit = data[idx]
+        if !is_surrogate_codepoint(unit)
+            idx += 1
+        elseif is_surrogate_lead(unit) && is_surrogate_trail(data[idx+1])
+            idx += 2 # well-formed pair, skip both halves
+        else
+            return false # unpaired surrogate
+        end
+    end
+    return idx > total || !is_surrogate_codepoint(data[idx]) # a leftover unit must not be a surrogate
+end
diff --git a/base/utf32.jl b/base/utf32.jl
new file mode 100644
index 0000000000000..444b4d1bab4fe
--- /dev/null
+++ b/base/utf32.jl
@@ -0,0 +1,33 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# UTF-32 basic functions
+next(s::UTF32String, i::Int) = (s.data[i], i+1) # one array element per character
+endof(s::UTF32String) = length(s.data) - 1 # the final element of data is the NULL terminator
+length(s::UTF32String) = length(s.data) - 1 # same count as endof: indices and characters coincide
+
+reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
+
+sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
+
+function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
+    # the data is valid only if every element passes the per-character check
+    @inbounds for unit in str
+        isvalid(Char, UInt32(unit)) || return false
+    end
+    return true
+end
+isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
+
+function map(f, s::UTF32String)
+    src = s.data
+    dest = similar(src)
+    dest[end] = 0 # NULL termination
+
+    @inbounds for idx = 1:(length(src)-1)
+        mapped = f(src[idx])
+        # f must map characters to characters, otherwise the result is not a string
+        isa(mapped, Char) ||
+            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
+        dest[idx] = (mapped::Char)
+    end
+    UTF32String(dest)
+end
diff --git a/base/utfcheck.jl b/base/utfcheck.jl
new file mode 100644
index 0000000000000..083ca2f09f299
--- /dev/null
+++ b/base/utfcheck.jl
@@ -0,0 +1,255 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
+# and also to return information necessary to convert to other encodings
+
+is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800) # true for 0xd800-0xdbff
+is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00) # true for 0xdc00-0xdfff
+is_surrogate_codepoint(c::Unsigned) = ((c & ~0x007ff) == 0xd800) # true for 0xd800-0xdfff (lead or trail)
+is_valid_continuation(c) = ((c & 0xc0) == 0x80) # true for 0x80-0xbf, i.e. bit pattern 10xxxxxx
+
+# Options for check_string_* functions
+
+const UTF_NO_LONG_NULL = 1      # don't accept 0xc0 0x80 for '\0'
+const UTF_NO_SURROGATES = 2     # don't accept surrogate pairs in UTF-8/UTF-32
+const UTF_ACCEPT_LONG = 4       # accept long encodings (other than long null in UTF-8)
+
+const UTF_LONG = 1              # Long encodings are present
+const UTF_LATIN1 = 2            # characters in range 0x80-0xFF present
+const UTF_UNICODE2 = 4          # characters in range 0x100-0x7ff present
+const UTF_UNICODE3 = 8          # characters in range 0x800-0xd7ff, 0xe000-0xffff
+const UTF_UNICODE4 = 16         # non-BMP characters present
+const UTF_SURROGATE = 32        # surrogate pairs present
+
+# Append the 6 payload bits of continuation byte byt to ch; error (at pos) if byt is not one
+@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
+    is_valid_continuation(byt) || utf_errfunc(UTF_ERR_CONT, pos, byt)
+    return (ch << 6) | (byt & 0x3f)
+end
+
+#=
+@doc """
+@brief      Validates and calculates number of characters in a UTF-8 encoded vector of UInt8
+
+@param[in]  dat     Vector of UInt8
+@param[in]  options flags to determine error handling (default 0)
+
+@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
+@throws     ArgumentError
+""" ->
+=#
+function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
+    local byt::UInt8, ch::UInt32, surr::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos, len = 0, sizeof(dat)
+    @inbounds while pos < len
+        ch = dat[pos += 1]
+        totalchar += 1
+        if ch > 0x7f
+            # Check UTF-8 encoding; 0x80-0xbf are continuation bytes, never a lead byte
+            (ch < 0xc0) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
+            if ch < 0xe0
+                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
+                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch = get_continuation(ch & 0x1f, dat[pos += 1], pos)
+                if ch > 0x7f
+                    num2byte += 1
+                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    flags |= UTF_LONG
+                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
+                    flags |= UTF_LONG
+                else
+                    utf_errfunc(UTF_ERR_LONG, pos-1, ch)
+                end
+            elseif ch < 0xf0
+                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
+                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                # check for surrogate pairs, make sure correct
+                if is_surrogate_codepoint(ch)
+                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
+                    # next character *must* be a trailing surrogate character
+                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
+                    byt = dat[pos += 1]
+                    (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
+                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
+                    surr = get_continuation(surr, dat[pos += 1], pos)
+                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
+                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
+                    flags |= UTF_SURROGATE
+                    num4byte += 1
+                elseif ch > 0x07ff
+                    num3byte += 1
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    flags |= UTF_LONG
+                    (ch > 0x7f) && (num2byte += 1) # an overlong ASCII value needs no extra bytes
+                else
+                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
+                end
+            elseif ch < 0xf5
+                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
+                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                if ch > 0x10ffff
+                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
+                elseif ch > 0xffff
+                    num4byte += 1
+                elseif is_surrogate_codepoint(ch)
+                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    # This is an overly long encode character
+                    flags |= UTF_LONG
+                    if ch > 0x7ff
+                        num3byte += 1
+                    elseif ch > 0x7f
+                        num2byte += 1
+                    end
+                else
+                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
+                end
+            else
+                utf_errfunc(UTF_ERR_INVALID, pos, ch)
+            end
+        end
+    end
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
+
+#=
+@doc """
+@brief      Validates and calculates number of characters in a UTF-16 encoded vector of UInt16
+
+@param[in]  dat     Vector{UInt16}
+@param[in]  len     number of UInt16 code units of dat to check
+
+@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
+@throws     ArgumentError
+""" ->
+=#
+function check_string_utf16(dat::Vector{UInt16}, len::Int)
+    local ch::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos = 0
+    @inbounds while pos < len
+        ch = dat[pos += 1]
+        totalchar += 1
+        ch > 0x7f || continue # ASCII needs no bookkeeping
+        # 0x80-0x7ff would take two bytes in UTF-8
+        if ch < 0x800
+            num2byte += 1
+            flags |= (ch < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
+        elseif !is_surrogate_codepoint(ch)
+            num3byte += 1
+        elseif is_surrogate_lead(ch)
+            pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
+            # a lead must be followed immediately by a trail surrogate
+            ch = dat[pos += 1]
+            !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
+            # the pair is one character and is tallied in the 4-byte counter
+            num4byte += 1
+        else
+            # a trail surrogate with no lead in front of it
+            utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
+        end
+    end
+    # fold the width counters into the summary flags
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
+
+#=
+@doc """
+@brief      Validates and calculates number of characters in a UTF-32 encoded vector of UInt32
+
+@param[in]  dat     Vector{UInt32} (only the first len elements are checked)
+@param[in]  options flags to determine error handling (default 0)
+
+@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
+@throws     ArgumentError
+""" ->
+=#
+function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
+    local ch::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos = 0
+    @inbounds while pos < len
+        ch = dat[pos += 1]
+        totalchar += 1
+        ch > 0x7f || continue # ASCII needs no bookkeeping
+        # 0x80-0x7ff would take two bytes in UTF-8
+        if ch < 0x800
+            num2byte += 1
+            flags |= (ch < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
+        elseif ch > 0xffff
+            # anything above 0x10ffff is rejected
+            (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
+            num4byte += 1
+        elseif !is_surrogate_codepoint(ch)
+            num3byte += 1
+        elseif is_surrogate_lead(ch)
+            pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
+            # a lead must be followed immediately by a trail surrogate
+            ch = dat[pos += 1]
+            !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
+            num4byte += 1
+            (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch)
+            flags |= UTF_SURROGATE
+        else
+            # a trail surrogate with no lead in front of it
+            utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
+        end
+    end
+    # fold the width counters into the summary flags
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
+
+# Validate an AbstractString character by character; UTF_NO_SURROGATES in `options` rejects pairs.
+# Returns (total characters, flags, 4-byte, 3-byte, 2-byte); problems are raised via utf_errfunc.
+function check_string_abs(str::AbstractString, options::Integer=0)
+    local ch::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos, len = start(str), endof(str)
+    # endof() is the index of the last character, so the bound has to be inclusive
+    @inbounds while pos <= len
+        ch, pos = next(str, pos)
+        totalchar += 1
+        if ch > 0x7f
+            if ch < 0x800
+                num2byte += 1
+                flags |= (ch < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
+            elseif ch > 0xffff
+                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
+                num4byte += 1
+            elseif !is_surrogate_codepoint(ch)
+                num3byte += 1
+            elseif is_surrogate_lead(ch)
+                # pos already points past the lead; being beyond len means nothing follows
+                pos > len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
+                # next character *must* be a trailing surrogate character
+                ch, pos = next(str, pos)
+                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
+                num4byte += 1
+                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch)
+                flags |= UTF_SURROGATE
+            else
+                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
+            end
+        end
+    end
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
diff --git a/base/utf.jl b/base/utfconvert.jl
similarity index 51%
rename from base/utf.jl
rename to base/utfconvert.jl
index 1e70d8a58fc79..60b08a8da7c9e 100644
--- a/base/utf.jl
+++ b/base/utfconvert.jl
@@ -1,396 +1,6 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-#=
-@doc """
-@brief      Error messages for Unicode / UTF support
-""" ->
-=#
-
-const UTF_ERR_SHORT = 1
-const UTF_ERR_CONT  = 2
-const UTF_ERR_LONG  = 3
-const UTF_ERR_NOT_LEAD = 4
-const UTF_ERR_NOT_TRAIL = 5
-const UTF_ERR_NOT_SURROGATE = 6
-const UTF_ERR_MISSING_SURROGATE = 7
-const UTF_ERR_INVALID = 8
-const UTF_ERR_SURROGATE = 9
-const UTF_ERR_NULL_16_TERMINATE = 10
-const UTF_ERR_NULL_32_TERMINATE = 11
-const UTF_ERR_MAX = 11
-
-const errMsgs = [
-    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)",
-    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
-    "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
-    "not a leading Unicode surrogate codepoint at index <<1>> (0x<<2>>)",
-    "not a trailing Unicode surrogate codepoint at index <<1>> (0x<<2>>)",
-    "not a valid Unicode surrogate codepoint at index <<1>> (0x<<2>>",
-    "missing trailing Unicode surrogate codepoint after index <<1>> (0x<<2>>)",
-    "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
-    "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
-    "UTF16String data must be NULL-terminated",
-    "UTF32String data must be NULL-terminated"
-]
-#=
-@doc """
-@brief      Throws ArgumentError with information about the specific error, location, and character
-
-@param[in]  errcode Error code for Unicode error (one of UTF_ERR_*)
-@param[in]  charpos Index of invalid byte or character
-@param[in]  invchar Invalid byte or character
-
-@throws never returns, always throws ArgumentError
-""" ->
-=#
-function utf_errfunc(errcode::Integer, charpos, invchar)
-    if errcode < 1 || errcode > UTF_ERR_MAX
-        throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar"))
-    end
-    throw(ArgumentError(replace(replace(errMsgs[errcode],"<<1>>",string(charpos)),"<<2>>",hex(invchar))))
-end
-
-#=
-@doc """
-@brief      Base UTF16String type, has 16-bit NULL termination word after data, native byte order
-""" ->
-=#
-immutable UTF16String <: AbstractString
-    data::Vector{UInt16} # includes 16-bit NULL termination after string chars
-    function UTF16String(data::Vector{UInt16})
-        if length(data) < 1 || data[end] != 0
-            utf_errfunc(UTF_ERR_NULL_16_TERMINATE, 0, 0)
-        end
-        new(data)
-    end
-end
-
-#=
-@doc """
-@brief      Base UTF32String type, has 32-bit NULL termination word after data, native byte order
-""" ->
-=#
-immutable UTF32String <: DirectIndexString
-    data::Vector{Char} # includes 32-bit NULL termination after string chars
-
-    function UTF32String(data::Vector{Char})
-        if length(data) < 1 || data[end] != Char(0)
-            utf_errfunc(UTF_ERR_NULL_32_TERMINATE, 0, 0)
-        end
-        new(data)
-    end
-end
-UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))
-
-const empty_utf16 = UTF16String(UInt16[0])
-const empty_utf32 = UTF32String(UInt32[0])
-
-is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800)
-is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00)
-is_surrogate_codepoint(c::Unsigned) = ((c & ~0x007ff) == 0xd800)
-is_valid_continuation(c) = ((c & 0xc0) == 0x80)
-
-function length(s::UTF16String)
-    d = s.data
-    len = length(d) - 1
-    len == 0 && return 0
-    cnum = 0
-    for i = 1:len
-        @inbounds cnum += !is_surrogate_trail(d[i])
-    end
-    cnum
-end
-
-function endof(s::UTF16String)
-    d = s.data
-    i = length(d) - 1
-    i == 0 && return i
-    return is_surrogate_codepoint(d[i]) ? i-1 : i
-end
-
-get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)
-
-function next(s::UTF16String, i::Int)
-    ch = s.data[i]
-    !is_surrogate_codepoint(ch) && return (Char(ch), i+1)
-    # check length, account for terminating \0
-    i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))
-    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch)
-    ct = s.data[i+1]
-    !is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i, ch)
-    Char(get_supplementary(ch, ct)), i+2
-end
-
-function reverseind(s::UTF16String, i::Integer)
-    j = length(s.data) - i
-    return is_surrogate_trail(s.data[j]) ? j-1 : j
-end
-
-lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
-
-function reverse(s::UTF16String)
-    d = s.data
-    out = similar(d)
-    out[end] = 0 # NULL termination
-    n = length(d)
-    @inbounds for i = 1:n-1
-        ch = d[n-i]
-        if is_surrogate_lead(ch)
-            out[i],out[i-1] = out[i-1],ch
-        else
-            out[i] = ch
-        end
-    end
-    UTF16String(out)
-end
-
-next(s::UTF32String, i::Int) = (s.data[i], i+1)
-endof(s::UTF32String) = length(s.data) - 1
-length(s::UTF32String) = length(s.data) - 1
-
-const UTF_NO_LONG_NULL = 1      # don't accept 0xc0 0x80 for '\0'
-const UTF_NO_SURROGATES = 2     # don't accept surrogate pairs in UTF-8/UTF-32
-const UTF_ACCEPT_LONG = 4       # accept long encodings (other than long null in UTF-8)
-
-const UTF_LONG = 1              # Long encodings are present
-const UTF_LATIN1 = 2            # characters in range 0x80-0xFF present
-const UTF_UNICODE2 = 4          # characters in range 0x100-0x7ff present
-const UTF_UNICODE3 = 8          # characters in range 0x800-0xd7ff, 0xe000-0xffff
-const UTF_UNICODE4 = 16         # non-BMP characters present
-const UTF_SURROGATE = 32        # surrogate pairs present
-
-# Get a UTF-8 continuation byte, give error if invalid, and update position and character value
-@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
-    !is_valid_continuation(byt) && utf_errfunc(UTF_ERR_CONT, pos, byt)
-    (ch << 6) | (byt & 0x3f)
-end
-
-#=
-@doc """
-@brief      Validates and calculates number of characters in a UTF-8 encoded vector of UInt8
-
-@param[in]  str     Vector of UInt8
-@param[in]  options flags to determine error handling (default 0)
-
-@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
-@throws     ArgumentError
-""" ->
-=#
-function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
-    local byt::UInt8, ch::UInt32, surr::UInt32
-    flags::UInt = 0
-    totalchar = num2byte = num3byte = num4byte = 0
-    pos = 0
-    len = sizeof(dat)
-    @inbounds while pos < len
-        ch = dat[pos += 1]
-        totalchar += 1
-        if ch > 0x7f
-            # Check UTF-8 encoding
-            if ch < 0xe0
-                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
-                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
-                ch = get_continuation(ch & 0x3f, dat[pos += 1], pos)
-                if ch > 0x7f
-                    num2byte += 1
-                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
-                elseif (options & UTF_ACCEPT_LONG) != 0
-                    flags |= UTF_LONG
-                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
-                    flags |= UTF_LONG
-                else
-                    utf_errfunc(UTF_ERR_LONG, pos, ch)
-                end
-             elseif ch < 0xf0
-                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
-                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
-                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
-                ch = get_continuation(ch, dat[pos += 1], pos)
-                # check for surrogate pairs, make sure correct
-                if is_surrogate_codepoint(ch)
-                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
-                    # next character *must* be a trailing surrogate character
-                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
-                    byt = dat[pos += 1]
-                    (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
-                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
-                    surr = get_continuation(surr, dat[pos += 1], pos)
-                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
-                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
-                    flags |= UTF_SURROGATE
-                    num4byte += 1
-                elseif ch > 0x07ff
-                    num3byte += 1
-                elseif (options & UTF_ACCEPT_LONG) != 0
-                    flags |= UTF_LONG
-                    num2byte += 1
-                else
-                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
-                end
-            elseif ch < 0xf5
-                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
-                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
-                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
-                ch = get_continuation(ch, dat[pos += 1], pos)
-                ch = get_continuation(ch, dat[pos += 1], pos)
-                if ch > 0x10ffff
-                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
-                elseif ch > 0xffff
-                    num4byte += 1
-                elseif is_surrogate_codepoint(ch)
-                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
-                elseif (options & UTF_ACCEPT_LONG) != 0
-                    # This is an overly long encode character
-                    flags |= UTF_LONG
-                    if ch > 0x7ff
-                        num3byte += 1
-                    elseif ch > 0x7f
-                        num2byte += 1
-                    end
-                else
-                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
-                end
-            else
-                utf_errfunc(UTF_ERR_INVALID, pos, ch)
-            end
-        end
-    end
-    num3byte != 0 && (flags |= UTF_UNICODE3)
-    num4byte != 0 && (flags |= UTF_UNICODE4)
-    return totalchar, flags, num4byte, num3byte, num2byte
-end
-
-#=
-@doc """
-@brief      Validates and calculates number of characters in a UTF-16 encoded vector of UInt16
-
-@param[in]  dat     Vector{UInt16}
-@param[in]  options flags to determine error handling (default 0)
-
-@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
-@throws     ArgumentError
-""" ->
-=#
-function check_string_utf16(dat::Vector{UInt16}, len::Int)
-    local ch::UInt32
-    flags::UInt = 0
-    totalchar = num2byte = num3byte = num4byte = 0
-    pos = 0
-    @inbounds while pos < len
-        ch = dat[pos += 1]
-        totalchar += 1
-        if ch > 0x7f
-            if ch < 0x100
-                num2byte += 1
-                flags |= UTF_LATIN1
-            elseif ch < 0x800
-                num2byte += 1
-                flags |= UTF_UNICODE2
-            elseif !is_surrogate_codepoint(ch)
-                num3byte += 1
-            elseif is_surrogate_lead(ch)
-                pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
-                # next character *must* be a trailing surrogate character
-                ch = dat[pos += 1]
-                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
-                num4byte += 1
-            else
-                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
-            end
-        end
-    end
-    num3byte != 0 && (flags |= UTF_UNICODE3)
-    num4byte != 0 && (flags |= UTF_UNICODE4)
-    return totalchar, flags, num4byte, num3byte, num2byte
-end
-
-#=
-@doc """
-@brief      Validates and calculates number of characters in a UTF-32 encoded vector of UInt32
-
-@param[in]  dat     Vector{UInt32}
-@param[in]  options flags to determine error handling (default 0)
-
-@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
-@throws     ArgumentError
-""" ->
-=#
-function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
-    local ch::UInt32
-    flags::UInt = 0
-    totalchar = num2byte = num3byte = num4byte = 0
-    pos = 0
-    @inbounds while pos < len
-        ch = dat[pos += 1]
-        totalchar += 1
-        if ch > 0x7f
-            if ch < 0x100
-                num2byte += 1
-                flags |= UTF_LATIN1
-            elseif ch < 0x800
-                num2byte += 1
-                flags |= UTF_UNICODE2
-            elseif ch > 0xffff
-                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
-                num4byte += 1
-            elseif !is_surrogate_codepoint(ch)
-                num3byte += 1
-            elseif is_surrogate_lead(ch)
-                pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
-                # next character *must* be a trailing surrogate character
-                ch = dat[pos += 1]
-                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
-                num4byte += 1
-                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch)
-                flags |= UTF_SURROGATE
-            else
-                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
-            end
-        end
-    end
-    num3byte != 0 && (flags |= UTF_UNICODE3)
-    num4byte != 0 && (flags |= UTF_UNICODE4)
-    return totalchar, flags, num4byte, num3byte, num2byte
-end
-
-function check_string_abs(str::AbstractString, options::Integer=0)
-    local ch::UInt32
-    flags::UInt = 0
-    totalchar = num2byte = num3byte = num4byte = 0
-    pos = start(str)
-    len = endof(str)
-    @inbounds while pos < len
-        ch, pos = next(str, pos)
-        totalchar += 1
-        if ch > 0x7f
-            if ch < 0x100
-                num2byte += 1
-                flags |= UTF_LATIN1
-            elseif ch < 0x800
-                num2byte += 1
-                flags |= UTF_UNICODE2
-            elseif ch > 0xffff
-                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
-                num4byte += 1
-            elseif !is_surrogate_codepoint(ch)
-                num3byte += 1
-            elseif is_surrogate_lead(ch)
-                pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
-                # next character *must* be a trailing surrogate character
-                ch, pos = next(str, pos)
-                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
-                num4byte += 1
-                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch)
-                flags |= UTF_SURROGATE
-            else
-                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
-            end
-        end
-    end
-    num3byte != 0 && (flags |= UTF_UNICODE3)
-    num4byte != 0 && (flags |= UTF_UNICODE4)
-    return totalchar, flags, num4byte, num3byte, num2byte
-end
+# Functions to convert to different UTF encodings
 
 # Quickly copy and set trailing \0
 @inline function fast_utf_copy(T::Type{UInt16}, len, dat)
@@ -837,25 +447,9 @@ convert(::Type{UTF32String}, str::UTF32String)    = str
 
 convert(::Type{UTF32String}, c::Char)             = UTF32String(Char[c, Char(0)])
 
-sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
 unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
     convert(Ptr{T}, pointer(s))
 
-function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
-    i = 1
-    n = length(data) # this may include NULL termination; that's okay
-    @inbounds while i < n # check for unpaired surrogates
-        if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
-            i += 2
-        elseif is_surrogate_codepoint(data[i])
-            return false
-        else
-            i += 1
-        end
-    end
-    return i > n || !is_surrogate_codepoint(data[i])
-end
-
 function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
     !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
     len = length(data)
@@ -921,9 +515,6 @@ end
 convert(::Type{Vector{Char}}, str::UTF32String) = str.data
 convert(::Type{Array{Char}},  str::UTF32String) = str.data
 
-reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
-
-sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
 unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
     convert(Ptr{T}, pointer(s))
 
@@ -946,16 +537,6 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
     UTF32String(d)
 end
 
-function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
-    for i=1:length(str)
-        @inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
-    end
-    return true
-end
-isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
-isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
-isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
-
 utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
 utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
 function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32}))
@@ -963,18 +544,3 @@ function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32}))
     while unsafe_load(p, len+1) != 0; len += 1; end
     utf32(p, len)
 end
-
-function map(f, s::UTF32String)
-    d = s.data
-    out = similar(d)
-    out[end] = 0
-
-    @inbounds for i = 1:(length(d)-1)
-        c2 = f(d[i])
-        if !isa(c2, Char)
-            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
-        end
-        out[i] = (c2::Char)
-    end
-    UTF32String(out)
-end
diff --git a/base/utferror.jl b/base/utferror.jl
new file mode 100644
index 0000000000000..796a12187dd3c
--- /dev/null
+++ b/base/utferror.jl
@@ -0,0 +1,51 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#=
+@doc """
+@brief      Error messages for Unicode / UTF support
+""" ->
+=#
+
+const UTF_ERR_SHORT = 1 # UTF-8 sequence is missing one or more continuation bytes
+const UTF_ERR_CONT  = 2 # byte is not a UTF-8 continuation byte
+const UTF_ERR_LONG  = 3 # overlong UTF-8 encoding
+const UTF_ERR_NOT_LEAD = 4 # not a leading surrogate codepoint
+const UTF_ERR_NOT_TRAIL = 5 # not a trailing surrogate codepoint
+const UTF_ERR_NOT_SURROGATE = 6 # not a valid surrogate codepoint
+const UTF_ERR_MISSING_SURROGATE = 7 # trailing surrogate codepoint is missing
+const UTF_ERR_INVALID = 8 # invalid Unicode character (message cites > 0x10ffff)
+const UTF_ERR_SURROGATE = 9 # surrogate encoding not allowed in UTF-8 or UTF-32
+const UTF_ERR_NULL_16_TERMINATE = 10 # UTF16String data lacks its NULL terminator
+const UTF_ERR_NULL_32_TERMINATE = 11 # UTF32String data lacks its NULL terminator
+const UTF_ERR_MAX = 11 # largest valid code; utf_errfunc rejects anything above it
+
+const errMsgs = [ # indexed by UTF_ERR_* code; utf_errfunc fills in <<1>> (position) and <<2>> (hex value)
+    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)",
+    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
+    "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
+    "not a leading Unicode surrogate codepoint at index <<1>> (0x<<2>>)",
+    "not a trailing Unicode surrogate codepoint at index <<1>> (0x<<2>>)",
+    "not a valid Unicode surrogate codepoint at index <<1>> (0x<<2>>)",
+    "missing trailing Unicode surrogate codepoint after index <<1>> (0x<<2>>)",
+    "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
+    "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
+    "UTF16String data must be NULL-terminated",
+    "UTF32String data must be NULL-terminated"
+]
+#=
+@doc """
+@brief      Builds the message for a Unicode error code and raises it as an ArgumentError
+
+@param[in]  errcode One of the UTF_ERR_* codes, used as the index into errMsgs
+@param[in]  charpos Position to report, substituted for <<1>> in the message
+@param[in]  invchar Offending byte or character, substituted in hex for <<2>>
+
+@throws ArgumentError on every call (this function does not return)
+""" ->
+=#
+function utf_errfunc(errcode::Integer, charpos, invchar)
+    (1 <= errcode <= UTF_ERR_MAX) ||
+        throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar"))
+    template = replace(errMsgs[errcode], "<<1>>", string(charpos))
+    throw(ArgumentError(replace(template, "<<2>>", hex(invchar))))
+end
diff --git a/base/utftype.jl b/base/utftype.jl
new file mode 100644
index 0000000000000..019e496b1cb46
--- /dev/null
+++ b/base/utftype.jl
@@ -0,0 +1,39 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#=
+@doc """
+@brief      UTF-16 string type; `data` holds native-byte-order code units plus a final 16-bit NULL
+""" ->
+=#
+immutable UTF16String <: AbstractString
+    data::Vector{UInt16} # code units, followed by one 16-bit NULL terminator
+    function UTF16String(data::Vector{UInt16})
+        # Reject an empty vector or one whose final word is not the NULL terminator
+        terminated = !isempty(data) && data[end] == 0
+        terminated || utf_errfunc(UTF_ERR_NULL_16_TERMINATE, 0, 0)
+        new(data)
+    end
+end
+
+#=
+@doc """
+@brief      UTF-32 string type; `data` holds native-byte-order characters plus a final 32-bit NULL
+""" ->
+=#
+immutable UTF32String <: DirectIndexString
+    data::Vector{Char} # characters, followed by one 32-bit NULL terminator
+
+    function UTF32String(data::Vector{Char})
+        # Reject an empty vector or one whose final element is not the NULL terminator
+        terminated = !isempty(data) && data[end] == Char(0)
+        terminated || utf_errfunc(UTF_ERR_NULL_32_TERMINATE, 0, 0)
+        new(data)
+    end
+end
+UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data)) # accept UInt32 data by reinterpreting its elements as Char
+# Empty-string constants: each `data` vector holds only the NULL terminator
+const empty_utf16 = UTF16String(UInt16[0])
+const empty_utf32 = UTF32String(UInt32[0])
+# For these string types, validity is decided by calling isvalid(T, ...) on the stored `data` vector
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)