From dd5b4793faa304802507eb983cf0c98c7ea20d83 Mon Sep 17 00:00:00 2001 From: Scott Paul Jones Date: Wed, 6 May 2015 23:58:38 -0400 Subject: [PATCH 1/3] Fix #10959 bugs with Unicode string conversions Make most common conversions 2-22x faster Add standard error reporting functionality (which can be extended later on for locale-specific error messages) Improve error messages, reporting invalid character position and value Add in-line documentation Add validation functions for `UTF-8`, `UTF-16`, `UTF-32` (and `AbstractString`) Add testing of different `UTF` encoding issues: Long encoding of `\0` (sometimes known as `Modified UTF-8`) Overlong encodings of other values Encoding of surrogate pairs as two 3-byte sequences (i.e. `CESU-8`) Invalid bytes (`0xF5..0xFF`) Unexpected continuation bytes Lead byte with missing continuation bytes 4-byte sequence starting with `0xF4`, that represents value > `0x10ffff` Allow conversion (but not production) of `Modified UTF-8` and `CESU-8` (used by Java, Oracle, MySQL, and many others) See http://en.wikipedia.org/wiki/UTF-8 for discussion of these issues and more --- base/exports.jl | 1 + base/sysimg.jl | 3 +- base/utf.jl | 993 ++++++++++++++++++++++++++++++++++++++++++++++++ base/utf16.jl | 155 -------- base/utf32.jl | 118 ------ test/strings.jl | 149 ++++++++ 6 files changed, 1144 insertions(+), 275 deletions(-) create mode 100644 base/utf.jl delete mode 100644 base/utf16.jl delete mode 100644 base/utf32.jl diff --git a/base/exports.jl b/base/exports.jl index 7547a09d791a0..660a3b5a5517f 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -164,6 +164,7 @@ export ProcessExitedException, SystemError, TypeError, + UnicodeError, AssertionError, # Global constants and variables diff --git a/base/sysimg.jl b/base/sysimg.jl index 73ead07c577f3..b44340b0216a9 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -87,8 +87,7 @@ include("osutils.jl") include("char.jl") include("ascii.jl") include("utf8.jl") -include("utf16.jl") -include("utf32.jl") 
+include("utf.jl") include("iobuffer.jl") include("string.jl") include("utf8proc.jl") diff --git a/base/utf.jl b/base/utf.jl new file mode 100644 index 0000000000000..b170973406b43 --- /dev/null +++ b/base/utf.jl @@ -0,0 +1,993 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +#= +@doc """ +@brief Error messages for Unicode / UTF support +""" -> +=# + +const UTF_ERR_SHORT = 1 +const UTF_ERR_CONT = 2 +const UTF_ERR_LONG = 3 +const UTF_ERR_NOT_LEAD = 4 +const UTF_ERR_NOT_TRAIL = 5 +const UTF_ERR_NOT_SURROGATE = 6 +const UTF_ERR_MISSING_SURROGATE = 7 +const UTF_ERR_INVALID = 8 +const UTF_ERR_SURROGATE = 9 +const UTF_ERR_NULL_16_TERMINATE = 10 +const UTF_ERR_NULL_32_TERMINATE = 11 +const UTF_ERR_MAX = 11 + +const errMsgs = [ + "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)", + "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)", + "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)", + "not a leading Unicode surrogate character at index <<1>> (0x<<2>>)", + "not a trailing Unicode surrogate character at index <<1>> (0x<<2>>)", + "not a valid Unicode surrogate character at index <<1>> (0x<<2>>", + "missing trailing Unicode surrogate character after index <<1>> (0x<<2>>)", + "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)", + "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)", + "UTF16String data must be NULL-terminated", + "UTF32String data must be NULL-terminated" +] +#= +@doc """ +@brief Throws ArgumentError with information about the specific error, location, and character + +@param[in] errcode Error code for Unicode error (one of UTF_ERR_*) +@param[in] charpos Index of invalid byte or character +@param[in] invchar Invalid byte or character + +@throws never returns, always throws ArgumentError +""" -> +=# +function utf_errfunc(errcode::Integer, charpos, invchar) + if errcode < 
1 || errcode > UTF_ERR_MAX + throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar")) + end + throw(ArgumentError(replace(replace(errMsgs[errcode],"<<1>>",string(charpos)),"<<2>>",hex(invchar)))) +end + +#= +@doc """ +@brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order +""" -> +=# +immutable UTF16String <: AbstractString + data::Vector{UInt16} # includes 16-bit NULL termination after string chars + function UTF16String(data::Vector{UInt16}) + if length(data) < 1 || data[end] != 0 + utf_errfunc(UTF_ERR_NULL_16_TERMINATE, 0, 0) + end + new(data) + end +end + +#= +@doc """ +@brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order +""" -> +=# +immutable UTF32String <: DirectIndexString + data::Vector{Char} # includes 32-bit NULL termination after string chars + + function UTF32String(data::Vector{Char}) + if length(data) < 1 || data[end] != Char(0) + utf_errfunc(UTF_ERR_NULL_32_TERMINATE, 0, 0) + end + new(data) + end +end +UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data)) + +const empty_utf16 = UTF16String(UInt16[0]) +const empty_utf32 = UTF32String(UInt32[0]) + +is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800) +is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00) +is_surrogate_char(c::Unsigned) = ((c & ~0x007ff) == 0xd800) +is_valid_continuation(c) = ((c & 0xc0) == 0x80) + +function length(s::UTF16String) + d = s.data + len = length(d) - 1 + len == 0 && return 0 + cnum = 0 + for i = 1:len + @inbounds cnum += !is_surrogate_trail(d[i]) + end + cnum +end + +function endof(s::UTF16String) + d = s.data + i = length(d) - 1 + i == 0 && return i + return is_surrogate_char(d[i]) ? 
i-1 : i +end + +get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) + +function next(s::UTF16String, i::Int) + ch = s.data[i] + !is_surrogate_char(ch) && return (Char(ch), i+1) + # check length, account for terminating \0 + i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)) + !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch) + ct = s.data[i+1] + !is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i, ch) + Char(get_supplementary(ch, ct)), i+2 +end + +function reverseind(s::UTF16String, i::Integer) + j = length(s.data) - i + return is_surrogate_trail(s.data[j]) ? j-1 : j +end + +lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator + +function reverse(s::UTF16String) + d = s.data + out = similar(d) + out[end] = 0 # NULL termination + n = length(d) + @inbounds for i = 1:n-1 + ch = d[n-i] + if is_surrogate_lead(ch) + out[i],out[i-1] = out[i-1],ch + else + out[i] = ch + end + end + UTF16String(out) +end + +next(s::UTF32String, i::Int) = (s.data[i], i+1) +endof(s::UTF32String) = length(s.data) - 1 +length(s::UTF32String) = length(s.data) - 1 + +const UTF_NO_LONG_NULL = 1 # don't accept 0xc0 0x80 for '\0' +const UTF_NO_SURROGATES = 2 # don't accept surrogate pairs in UTF-8/UTF-32 +const UTF_ACCEPT_LONG = 4 # accept long encodings (other than long null in UTF-8) + +const UTF_LONG = 1 # Long encodings are present +const UTF_LATIN1 = 2 # characters in range 0x80-0xFF present +const UTF_UNICODE2 = 4 # characters in range 0x100-0x7ff present +const UTF_UNICODE3 = 8 # characters in range 0x800-0xd7ff, 0xe000-0xffff +const UTF_UNICODE4 = 16 # non-BMP characters present +const UTF_SURROGATE = 32 # surrogate pairs present + +# Get a UTF-8 continuation byte, give error if invalid, and update position and character value +@inline function get_continuation(ch::UInt32, str, pos) + byt::UInt8 = str[pos += 1] + !is_valid_continuation(byt) && utf_errfunc(UTF_ERR_CONT, pos, byt) + 
(ch << 6) | (byt & 0x3f), pos +end + +#= +@doc """ +@brief Validates and calculates number of characters in a string + +@param[in] str Vector of UInt8 +@param[in] options flags to determine error handling (default 0) + +@return (total characters, flags, 4-byte, 3-byte, 2-byte) +@throws ArgumentError +""" -> +=# +function check_string_utf8(dat::Vector{UInt8}, options::Integer=0) + local byt::UInt8 + local ch::UInt32, surr::UInt32 + local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0 + pos = 0 + len = sizeof(dat) + @inbounds while pos < len + ch = dat[pos += 1] + totalchar += 1 + if ch > 0x7f + # Check UTF-8 encoding + if ch < 0xe0 + # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff) + (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) + ch, pos = get_continuation(ch & 0x3f, dat, pos) + if ch > 0x7f + num2byte += 1 + flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1 + elseif (options & UTF_ACCEPT_LONG) != 0 + flags |= UTF_LONG + elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0) + flags |= UTF_LONG + else + utf_errfunc(UTF_ERR_LONG, pos, ch) + end + elseif ch < 0xf0 + # 3-byte UTF-8 sequence (i.e. 
characters 0x800-0xffff) + (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) + ch, pos = get_continuation(ch & 0x0f, dat, pos) + ch, pos = get_continuation(ch, dat, pos) + # check for surrogate pairs, make sure correct + if is_surrogate_char(ch) + !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch) + # next character *must* be a trailing surrogate character + (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch) + byt = dat[pos += 1] ; (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt) + surr, pos = get_continuation(0x0000d, dat, pos) + surr, pos = get_continuation(surr, dat, pos) + !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr) + (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr) + flags |= UTF_SURROGATE + num4byte += 1 + elseif ch > 0x07ff + num3byte += 1 + elseif (options & UTF_ACCEPT_LONG) != 0 + flags |= UTF_LONG + num2byte += 1 + else + utf_errfunc(UTF_ERR_LONG, pos-2, ch) + end + elseif ch < 0xf5 + # 4-byte UTF-8 sequence (i.e. characters > 0xffff) + (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) + ch, pos = get_continuation(ch & 0x07, dat, pos) + ch, pos = get_continuation(ch, dat, pos) + ch, pos = get_continuation(ch, dat, pos) + if ch > 0x10ffff + utf_errfunc(UTF_ERR_INVALID, pos-3, ch) + elseif ch > 0xffff + num4byte += 1 + elseif is_surrogate_char(ch) + utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch) + elseif (options & UTF_ACCEPT_LONG) != 0 + # This is an overly long encode character + flags |= UTF_LONG + if ch > 0x7ff + num3byte += 1 + elseif ch > 0x7f + num2byte += 1 + end + else + utf_errfunc(UTF_ERR_LONG, pos-2, ch) + end + else + utf_errfunc(UTF_ERR_INVALID, pos, ch) + end + end + end + totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 
0 : UTF_UNICODE4), num4byte, num3byte, num2byte +end + +#= +@doc """ +@brief Validates and calculates number of characters in a UTF-16 string + +@param[in] dat Vector{UInt16} +@param[in] options flags to determine error handling (default 0) + +@return (total characters, flags, 4-byte, 3-byte, 2-byte) +@throws ArgumentError +""" -> +=# +function check_string_utf16(dat::Vector{UInt16}, len::Int) + local ch::UInt32 + local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0 + local pos = 0 + @inbounds while pos < len + ch = dat[pos += 1] + totalchar += 1 + if ch > 0x7f + if ch < 0x100 + num2byte += 1 + flags |= UTF_LATIN1 + elseif ch < 0x800 + num2byte += 1 + flags |= UTF_UNICODE2 + elseif !is_surrogate_char(ch) + num3byte += 1 + elseif is_surrogate_lead(ch) + pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) + # next character *must* be a trailing surrogate character + ch = dat[pos += 1] + !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) + num4byte += 1 + else + utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) + end + end + end + totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 
0 : UTF_UNICODE4), num4byte, num3byte, num2byte +end + +#= +@doc """ +@brief Validates and calculates number of characters in a UTF-32 string + +@param[in] dat Vector{UInt32} +@param[in] options flags to determine error handling (default 0) + +@return (total characters, flags, 4-byte, 3-byte, 2-byte) +@throws ArgumentError +""" -> +=# +function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0) + local ch::UInt32 + local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0 + local pos = 0 + @inbounds while pos < len + ch = dat[pos += 1] + totalchar += 1 + if ch > 0x7f + if ch < 0x100 + num2byte += 1 + flags |= UTF_LATIN1 + elseif ch < 0x800 + num2byte += 1 + flags |= UTF_UNICODE2 + elseif ch > 0xffff + (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) + num4byte += 1 + elseif !is_surrogate_char(ch) + num3byte += 1 + elseif is_surrogate_lead(ch) + pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) + # next character *must* be a trailing surrogate character + ch = dat[pos += 1] + !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) + num4byte += 1 + (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch) + flags |= UTF_SURROGATE + else + utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) + end + end + end + totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 
0 : UTF_UNICODE4), num4byte, num3byte, num2byte +end + +function check_string_abs(str::AbstractString, options::Integer=0) + local ch::UInt32 + local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0 + local pos = start(str) + local len = endof(str) + @inbounds while pos < len + ch, pos = next(str, pos) + totalchar += 1 + if ch > 0x7f + if ch < 0x100 + num2byte += 1 + flags |= UTF_LATIN1 + elseif ch < 0x800 + num2byte += 1 + flags |= UTF_UNICODE2 + elseif ch > 0xffff + (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) + num4byte += 1 + elseif !is_surrogate_char(ch) + num3byte += 1 + elseif is_surrogate_lead(ch) + pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) + # next character *must* be a trailing surrogate character + ch, pos = next(str, pos) + !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) + num4byte += 1 + (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch) + flags |= UTF_SURROGATE + else + utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) + end + end + end + totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 
0 : UTF_UNICODE4), num4byte, num3byte, num2byte +end + +# Quickly copy and set trailing \0 +macro return_fast_utf_copy(T1, T2, len, dat) + quote + @inbounds return $(esc(T1))(setindex!(copy!(Vector{$(esc(T2))}($(esc(len))), $(esc(dat))), 0, $(esc(len)))) + end +end + +# Get rest of character ch from 2-byte UTF-8 sequence in str, update pos and return character +macro get_utf8_2!(str, pos, ch) + quote + (($(esc(ch)) & 0x1f) << 6) | ($(esc(str))[$(esc(pos)) += 1] & 0x3f) + end +end + +# Get rest of character ch from 3-byte UTF-8 sequence in str, update pos and return character +macro get_utf8_3!(str, pos, ch) + quote + ($(esc(pos)) += 2 ; + (($(esc(ch)) & 0xf) << 12) + | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6) + | ($(esc(str))[$(esc(pos))] & 0x3f)) + end +end + +# Get rest of character ch from 4-byte UTF-8 sequence in str, update pos and return character +macro get_utf8_4!(str, pos, ch) + quote + ($(esc(pos)) += 3 ; + (($(esc(ch)) & 0x7) << 18) + | (UInt32($(esc(str))[$(esc(pos))-2] & 0x3f) << 12) + | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6) + | ($(esc(str))[$(esc(pos))] & 0x3f)) + end +end + +# Get the trailing surrogate character in UTF-8 from an array, update the position +macro get_utf8_surr!(str, pos) + quote + ($(esc(pos)) += 3 ; + ((UInt32($(esc(str))[$(esc(pos))-2] & 0xf) << 12) + | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6) + | ($(esc(str))[$(esc(pos))] & 0x3f))) + end +end + +# Output a character as a 2-byte UTF-8 sequence, update the position +macro output_utf8_2!(buf, out, ch) + quote + $(esc(buf))[$(esc(out)) += 1] = 0xc0 | ($(esc(ch)) >>> 6) + $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f) + end +end +# Output a character as a 3-byte UTF-8 sequence, update the position +macro output_utf8_3!(buf, out, ch) + quote + $(esc(buf))[$(esc(out)) += 1] = 0xe0 | (($(esc(ch)) >>> 12) & 0x3f) + $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 6) & 0x3f) + $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f) + end +end 
+# Output a character as a 4-byte UTF-8 sequence, update the position +macro output_utf8_4!(buf, out, ch) + quote + $(esc(buf))[$(esc(out)) += 1] = 0xf0 | ($(esc(ch)) >>> 18) + $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 12) & 0x3f) + $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 6) & 0x3f) + $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f) + end +end + +# Output a UTF-16 surrogate pair, update the position +macro output_utf16_surr!(buf, out, ch) + quote + $(esc(buf))[$(esc(out)) += 1] = UInt16(0xd7c0 + ($(esc(ch)) >>> 10)) + $(esc(buf))[$(esc(out)) += 1] = UInt16(0xdc00 + ($(esc(ch)) & 0x3ff)) + end +end + +#= +""" +@brief Converts an AbstractString to a UTF16String + +@param[in] ::Type{UTF16String} +@param[in] str::AbstractString + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, str::AbstractString) + len, flags, num4byte = check_string_abs(str) + buf = Vector{UInt16}(len+num4byte+1) + out = 0 + @inbounds for ch in str + c = reinterpret(UInt32, ch) + if c < 0x10000 + buf[out += 1] = UInt16(c) + else + @output_utf16_surr!(buf, out, c) + end + end + @inbounds buf[out + 1] = 0 # NULL termination + UTF16String(buf) +end + +#= +""" +@brief Converts an AbstractString to a UTF32String + +@param[in] ::Type{UTF32String} +@param[in] str::AbstractString + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::AbstractString) + len, flags = check_string_abs(str) + buf = Vector{Char}(len+1) + out = 0 + @inbounds for ch in str ; buf[out += 1] = ch ; end + @inbounds buf[out + 1] = 0 # NULL termination + UTF32String(buf) +end + +#= +@doc """ +@brief Converts a UTF-8 encoded string to UTF-16 encoding + +@param[in] str::Vector{UInt8} + +@return ::UTF16String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF16String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf16 + # Check that is 
correct UTF-8 encoding and get number of words needed + len, flags, num4byte = check_string_utf8(dat) + len += num4byte + buf = Vector{UInt16}(len+1) + buf[len+1] = 0 + # Optimize case where no characters > 0x7f + flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) + out::UInt = 0 + pos::UInt = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = @get_utf8_2!(dat, pos, ch) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + buf[out += 1] = @get_utf8_3!(dat, pos, ch) + # Handle range 0x10000-0x10ffff + else + ch = @get_utf8_4!(dat, pos, ch) + @output_utf16_surr!(buf, out, ch) + end + end + UTF16String(buf) +end + +#= +@doc """ +@brief Reencodes a UTF-16 or UTF-32 encoded string using UTF-8 encoding + +@param[in] str::Union(Vector{UInt16}, Vector{UInt32}) + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, dat::Vector{UInt16}) + len = sizeof(dat) + # handle zero length string quickly + len == 0 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat)) + return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Converts a UTF16String to a UTF8String + +@param[in] str::UTF16String + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, str::UTF16String) + dat = str.data + len = sizeof(dat) >>> 1 + # handle zero length string quickly + len <= 1 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) 
+end + +#= +@doc """ +@brief Encodes a vector of UInt32 to a UTF8String + +@param[in] dat::Vector{UInt32} + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, dat::Vector{UInt32}) + len = sizeof(dat) + # handle zero length string quickly + len == 0 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Converts a UTF32String to a UTF8String + +@param[in] str::UTF32String + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) >>> 2 + # handle zero length string quickly + len <= 1 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Encodes an already validated vector of UInt16 or UInt32 as UTF-8 + +@param[in] T type (UInt16 or UInt32) +@param[in] dat Vector{T} +@param[in] len length of output in bytes + +@return ::UTF8String +""" -> +=# +function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len) + buf = Vector{UInt8}(len) + out::UInt = 0 + pos::UInt = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle 0x80-0x7ff + elseif ch < 0x800 + @output_utf8_2!(buf, out, ch) + # Handle 0x10000-0x10ffff (if input is UInt32) + elseif T == UInt32 && ch > 0xffff + @output_utf8_4!(buf, out, ch) + # Handle surrogate pairs + elseif is_surrogate_char(ch) + ch = get_supplementary(ch, 
dat[pos += 1]) + @output_utf8_4!(buf, out, ch) + # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters + else + @output_utf8_3!(buf, out, ch) + end + end + UTF8String(buf) +end + +#= +""" +@brief Converts a UTF-8 encoded string to UTF-32 encoding + +@param[in] dat::Vector{UInt8} + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf32 + # Validate UTF-8 encoding, and get number of words to create + len, flags = check_string_utf8(dat) + # Optimize case where no characters > 0x7f + totlen = len+1 + flags == 0 && @return_fast_utf_copy(UTF32String, Char, totlen, dat) + # has multi-byte UTF-8 sequences + buf = Vector{Char}(totlen) + @inbounds buf[totlen] = 0 # NULL termination + local ch::UInt32 + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = @get_utf8_2!(dat, pos, ch) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + ch = @get_utf8_3!(dat, pos, ch) + # Handle surrogate pairs (should have been encoded in 4 bytes) + if is_surrogate_lead(ch) + # Build up 32-bit character from ch and trailing surrogate in next 3 bytes + ch = get_supplementary(ch, @get_utf8_surr!(dat, pos)) + end + buf[out += 1] = ch + # Handle range 0x10000-0x10ffff + else + buf[out += 1] = @get_utf8_4!(dat, pos, ch) + end + end + UTF32String(buf) +end + +#= +""" +@brief Converts a UTF16String to UTF32String + +@param[in] str::UTF16String + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::UTF16String) + dat = str.data + len = sizeof(dat) + # handle zero length string quickly (account for trailing \0) + len <= 2 && return empty_utf32 + # get number of words to create + len, flags, num4byte = check_string_utf16(dat, len>>>1) + # No surrogate pairs, do optimized 
copy + (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat)) + local ch::UInt32 + buf = Vector{Char}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # check for surrogate pair + if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end + buf[out += 1] = ch + end + UTF32String(buf) +end + +#= +""" +@brief Converts a Vector of UInt32 to a UTF16String + +@param[in] dat::Vector{UInt32} + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, dat::Vector{UInt32}) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = check_string_utf32(dat, len>>>2) + len += num4byte + 1 + # optimized path, no surrogates + num4byte == 0 && @return_fast_utf_copy(UTF16String, UInt16, len, dat) + return encode_to_utf16(dat, len) +end + +#= +""" +@brief Converts a UTF32String to UTF16String + +@param[in] str::UTF32String + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = check_string_utf32(dat, len>>>2) + # optimized path, no surrogates + num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) + return encode_to_utf16(dat, len + num4byte) +end + +#= +@doc """ +@brief Encodes an already validated Vector of UInt32 as UTF-16 + +@param[in] dat Vector{UInt32} +@param[in] len length of output in 16-bit words + +@return ::UTF16String +""" -> +=# +function encode_to_utf16(dat, len) + buf = Vector{UInt16}(len) + @inbounds buf[len] = 0 # NULL termination + out = 0 + pos = 0 + @inbounds while out < len + ch = UInt32(dat[pos += 1]) + if ch > 0xffff + # Output surrogate pair for 0x10000-0x10ffff + buf[out += 1] = 
0xd7c0 + (ch >>> 10) + ch = 0xdc00 + (ch & 0x3ff) + end + buf[out += 1] = ch + end + UTF16String(buf) +end + +convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat)) + +utf16(x) = convert(UTF16String, x) + +function convert(::Type{UTF16String}, str::ASCIIString) + dat = str.data + len = length(dat)+1 + @return_fast_utf_copy(UTF16String, UInt16, len, dat) +end + +function convert(::Type{UTF32String}, str::ASCIIString) + dat = str.data + len = length(dat)+1 + @return_fast_utf_copy(UTF32String, Char, len, dat) +end + +convert(::Type{UTF16String}, str::UTF16String) = str +convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat)) + +convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data +convert(::Type{Array{UInt16}}, str::UTF16String) = str.data + +utf32(x) = convert(UTF32String, x) + +convert(::Type{UTF32String}, str::UTF32String) = str + +convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) + +sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) +unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) = + convert(Ptr{T}, pointer(s)) + +function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) + i = 1 + n = length(data) # this may include NULL termination; that's okay + @inbounds while i < n # check for unpaired surrogates + if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1]) + i += 2 + elseif is_surrogate_char(data[i]) + return false + else + i += 1 + end + end + return i > n || !is_surrogate_char(data[i]) +end + +function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) + !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data")) + len = length(data) + @inbounds return UTF16String(setindex!(copy!(Vector{UInt16}(len+1),1,data,1,len),0,len+1)) +end + +convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) = + convert(T, reshape(data, length(data))) + +convert(T::Type{UTF16String}, 
data::AbstractArray{Int16}) = + convert(T, reinterpret(UInt16, data)) + +function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) + isempty(bytes) && return UTF16String(UInt16[0]) + isodd(length(bytes)) && throw(ArgumentError("odd number of bytes")) + data = reinterpret(UInt16, bytes) + # check for byte-order mark (BOM): + if data[1] == 0xfeff # native byte order + d = Array(UInt16, length(data)) + copy!(d,1, data,2, length(data)-1) + elseif data[1] == 0xfffe # byte-swapped + d = Array(UInt16, length(data)) + for i = 2:length(data) + d[i-1] = bswap(data[i]) + end + else + d = Array(UInt16, length(data) + 1) + copy!(d,1, data,1, length(data)) # assume native byte order + end + d[end] = 0 # NULL terminate + !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data")) + UTF16String(d) +end + +utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len)) +utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len) +function utf16(p::Union(Ptr{UInt16}, Ptr{Int16})) + len = 0 + while unsafe_load(p, len+1) != 0; len += 1; end + utf16(p, len) +end + +function convert(::Type{UTF32String}, data::AbstractVector{Char}) + len = length(data) + @inbounds return UTF32String(setindex!(copy!(Vector{Char}(len+1),1,data,1,len),0,len+1)) +end + +convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) = + convert(UTF32String, reinterpret(Char, data)) + +convert{T<:AbstractString}(::Type{T}, v::AbstractVector{Char}) = convert(T, utf32(v)) + +# specialize for performance reasons: +function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char}) + s = IOBuffer(Array(UInt8,length(data)), true, true) + truncate(s,0) + for x in data + print(s, x) + end + convert(T, takebuf_string(s)) +end + +convert(::Type{Array{Char,1}}, s::UTF32String) = s.data +convert(::Type{Array{Char}}, s::UTF32String) = s.data + +reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) + +sizeof(s::UTF32String) = sizeof(s.data) - 
sizeof(Char) +unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) = + convert(Ptr{T}, pointer(s)) + +function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) + isempty(bytes) && return UTF32String(Char[0]) + length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes")) + data = reinterpret(Char, bytes) + # check for byte-order mark (BOM): + if data[1] == Char(0x0000feff) # native byte order + d = Array(Char, length(data)) + copy!(d,1, data, 2, length(data)-1) + elseif data[1] == Char(0xfffe0000) # byte-swapped + d = Array(Char, length(data)) + @inbounds for i = 2:length(data) ; d[i-1] = bswap(data[i]) ; end + else + d = Array(Char, length(data) + 1) + copy!(d, 1, data, 1, length(data)) # assume native byte order + end + d[end] = 0 # NULL terminate + UTF32String(d) +end + +function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32})) + for i=1:length(str) + @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end + end + return true +end +isvalid(str::Vector{Char}) = isvalid(UTF32String, str) +isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data) +isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data) + +utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len)) +utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len) +function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32})) + len = 0 + while unsafe_load(p, len+1) != 0; len += 1; end + utf32(p, len) +end + +function map(f, s::UTF32String) + d = s.data + out = similar(d) + out[end] = 0 + + @inbounds for i = 1:(length(d)-1) + c2 = f(d[i]) + if !isa(c2, Char) + throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead")) + end + out[i] = (c2::Char) + end + UTF32String(out) +end diff --git a/base/utf16.jl b/base/utf16.jl deleted 
file mode 100644 index 59c1e37cc799a..0000000000000 --- a/base/utf16.jl +++ /dev/null @@ -1,155 +0,0 @@ -# This file is a part of Julia. License is MIT: http://julialang.org/license - -immutable UTF16String <: AbstractString - data::Array{UInt16,1} # includes 16-bit NULL termination after string chars - function UTF16String(data::Vector{UInt16}) - if length(data) < 1 || data[end] != 0 - throw(ArgumentError("UTF16String data must be NULL-terminated")) - end - new(data) - end -end - -utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800 -utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00 -utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800 -utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail) - -function length(s::UTF16String) - d = s.data - len = length(d) - 1 - len == 0 && return 0 - cnum = 0 - for i = 1:len - @inbounds cnum += !utf16_is_trail(d[i]) - end - cnum -end - -function endof(s::UTF16String) - d = s.data - i = length(d) - 1 - i == 0 && return i - utf16_is_surrogate(d[i]) ? i-1 : i -end - -function next(s::UTF16String, i::Int) - if !utf16_is_surrogate(s.data[i]) - return Char(s.data[i]), i+1 - elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1]) - return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2 - end - throw(ArgumentError("invalid UTF-16 character index")) -end - -function reverseind(s::UTF16String, i::Integer) - j = length(s.data) - i - return Base.utf16_is_trail(s.data[j]) ? 
j-1 : j -end - -lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator - -function reverse(s::UTF16String) - d =s.data - out = similar(d) - out[end] = 0 # NULL termination - n = length(d) - for i = 1:n-1 - out[i] = d[n-i] - if Base.utf16_is_lead(out[i]) - out[i],out[i-1] = out[i-1],out[i] - end - end - return UTF16String(out) -end - -# TODO: optimize this -function encode16(s::AbstractString) - buf = UInt16[] - for ch in s - c = reinterpret(UInt32, ch) - if c < 0x10000 - push!(buf, UInt16(c)) - elseif c <= 0x10ffff - push!(buf, UInt16(0xd7c0 + (c>>10))) - push!(buf, UInt16(0xdc00 + (c & 0x3ff))) - else - throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)")) - end - end - push!(buf, 0) # NULL termination - UTF16String(buf) -end - -utf16(x) = convert(UTF16String, x) -convert(::Type{UTF16String}, s::UTF16String) = s -convert(::Type{UTF16String}, s::AbstractString) = encode16(s) -convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data -convert(::Type{Array{UInt16}}, s::UTF16String) = s.data - -# TODO: optimize this -convert(::Type{UTF8String}, s::UTF16String) = - sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end) - -sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) -unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) = - convert(Ptr{T}, pointer(s)) - -function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) - i = 1 - n = length(data) # this may include NULL termination; that's okay - while i < n # check for unpaired surrogates - if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1]) - i += 2 - elseif utf16_is_surrogate(data[i]) - return false - else - i += 1 - end - end - return i > n || !utf16_is_surrogate(data[i]) -end - -function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) - !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data")) - len = length(data) - d = Array(UInt16, len + 1) - d[end] = 0 # NULL terminate - UTF16String(copy!(d,1, 
data,1, len)) -end - -convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) = - convert(T, reshape(data, length(data))) - -convert(T::Type{UTF16String}, data::AbstractArray{Int16}) = - convert(T, reinterpret(UInt16, data)) - -function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) - isempty(bytes) && return UTF16String(UInt16[0]) - isodd(length(bytes)) && throw(ArgumentError("odd number of bytes")) - data = reinterpret(UInt16, bytes) - # check for byte-order mark (BOM): - if data[1] == 0xfeff # native byte order - d = Array(UInt16, length(data)) - copy!(d,1, data,2, length(data)-1) - elseif data[1] == 0xfffe # byte-swapped - d = Array(UInt16, length(data)) - for i = 2:length(data) - d[i-1] = bswap(data[i]) - end - else - d = Array(UInt16, length(data) + 1) - copy!(d,1, data,1, length(data)) # assume native byte order - end - d[end] = 0 # NULL terminate - !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data")) - UTF16String(d) -end - -utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len)) -utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len) -function utf16(p::Union(Ptr{UInt16}, Ptr{Int16})) - len = 0 - while unsafe_load(p, len+1) != 0; len += 1; end - utf16(p, len) -end diff --git a/base/utf32.jl b/base/utf32.jl deleted file mode 100644 index 419e104e33dfb..0000000000000 --- a/base/utf32.jl +++ /dev/null @@ -1,118 +0,0 @@ -# This file is a part of Julia. License is MIT: http://julialang.org/license - -## UTF-32 in the native byte order, i.e. 
plain old character arrays ## - -immutable UTF32String <: DirectIndexString - data::Vector{Char} # includes 32-bit NULL termination after string chars - - function UTF32String(a::Vector{Char}) - if length(a) < 1 || a[end] != Char(0) - throw(ArgumentError("UTF32String data must be NULL-terminated")) - end - new(a) - end -end -UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data)) - -next(s::UTF32String, i::Int) = (s.data[i], i+1) -endof(s::UTF32String) = length(s.data) - 1 -length(s::UTF32String) = length(s.data) - 1 - -utf32(x) = convert(UTF32String, x) -convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) -convert(::Type{UTF32String}, s::UTF32String) = s - -function convert(::Type{UTF32String}, s::AbstractString) - a = Array(Char, length(s) + 1) - i = 0 - for c in s - a[i += 1] = c - end - a[end] = Char(0) # NULL terminate - UTF32String(a) -end - -function convert(::Type{UTF32String}, data::AbstractVector{Char}) - len = length(data) - d = Array(Char, len + 1) - d[end] = Char(0) # NULL terminate - UTF32String(copy!(d,1, data,1, len)) -end - -convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) = - convert(UTF32String, reinterpret(Char, data)) - -convert{T<:AbstractString}(::Type{T}, v::AbstractVector{Char}) = convert(T, utf32(v)) - -# specialize for performance reasons: -function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char}) - s = IOBuffer(Array(UInt8,length(data)), true, true) - truncate(s,0) - for x in data - print(s, x) - end - convert(T, takebuf_string(s)) -end - -convert(::Type{Array{Char,1}}, s::UTF32String) = s.data -convert(::Type{Array{Char}}, s::UTF32String) = s.data - -reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) - -sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) -unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) = - convert(Ptr{T}, pointer(s)) - -function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) - 
isempty(bytes) && return UTF32String(Char[0]) - length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes")) - data = reinterpret(Char, bytes) - # check for byte-order mark (BOM): - if data[1] == Char(0x0000feff) # native byte order - d = Array(Char, length(data)) - copy!(d,1, data, 2, length(data)-1) - elseif data[1] == Char(0xfffe0000) # byte-swapped - d = Array(Char, length(data)) - for i = 2:length(data) - d[i-1] = bswap(data[i]) - end - else - d = Array(Char, length(data) + 1) - copy!(d, 1, data, 1, length(data)) # assume native byte order - end - d[end] = Char(0) # NULL terminate - UTF32String(d) -end - -function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32})) - for i=1:length(str) - @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end - end - return true -end -isvalid(str::Vector{Char}) = isvalid(UTF32String, str) -isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data) -isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data) - -utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len)) -utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len) -function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32})) - len = 0 - while unsafe_load(p, len+1) != 0; len += 1; end - utf32(p, len) -end - -function map(f, s::UTF32String) - d = s.data - out = similar(d) - out[end] = Char(0) - - for i = 1:(length(d)-1) - c2 = f(d[i]) - if !isa(c2, Char) - throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead")) - end - out[i] = (c2::Char) - end - UTF32String(out) -end diff --git a/test/strings.jl b/test/strings.jl index b8f1a42f76983..890bc74c84226 100644 --- a/test/strings.jl +++ b/test/strings.jl @@ -1638,3 +1638,152 @@ d = UTF32String(c) c[1] = 'A' @test d=="A" +# issue #11004 (#10959) + +function 
tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String) + @test utf16(strUTF8) == strUTF16 + @test utf32(strUTF8) == strUTF32 + @test utf8(strUTF16) == strUTF8 + @test utf32(strUTF16) == strUTF32 + @test utf8(strUTF32) == strUTF8 + @test utf16(strUTF32) == strUTF16 +end + +# Create some ASCII, UTF8, UTF16, and UTF32 strings +strAscii = "abcdefgh" +strA_UTF8 = ("abcdefgh\uff")[1:8] +strL_UTF8 = "abcdef\uff\uff" +str2_UTF8 = "abcd\uff\uff\u7ff\u7ff" +str3_UTF8 = "abcd\uff\uff\u7fff\u7fff" +str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff" +strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80") +strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000") +strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80") +strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0") + +strA_UTF16 = utf16(strA_UTF8) +strL_UTF16 = utf16(strL_UTF8) +str2_UTF16 = utf16(str2_UTF8) +str3_UTF16 = utf16(str3_UTF8) +str4_UTF16 = utf16(str4_UTF8) +strS_UTF16 = utf16(strS_UTF8) +strA_UTF32 = utf32(strA_UTF8) +strL_UTF32 = utf32(strL_UTF8) +str2_UTF32 = utf32(str2_UTF8) +str3_UTF32 = utf32(str3_UTF8) +str4_UTF32 = utf32(str4_UTF8) +strS_UTF32 = utf32(strS_UTF8) +@test utf8(strAscii) == strAscii +@test utf16(strAscii) == strAscii +@test utf32(strAscii) == strAscii +tstcvt(strA_UTF8,strA_UTF16,strA_UTF32) +tstcvt(strL_UTF8,strL_UTF16,strL_UTF32) +tstcvt(str2_UTF8,str2_UTF16,str2_UTF32) +tstcvt(str3_UTF8,str3_UTF16,str3_UTF32) +tstcvt(str4_UTF8,str4_UTF16,str4_UTF32) +# Test converting surrogate pairs +@test utf16(strS_UTF8) == strC_UTF8 +@test utf32(strS_UTF8) == strC_UTF8 +@test utf8(strS_UTF16) == strC_UTF8 +@test utf32(strS_UTF16) == strC_UTF8 +@test utf8(strS_UTF32) == strC_UTF8 +@test utf16(strS_UTF32) == strC_UTF8 + +# Test converting overlong \0 +# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! 
(in utf8.jl) +@test utf16(strZ_UTF8) == strz_UTF8 +@test utf32(strZ_UTF8) == strz_UTF8 + +# Test invalid sequences + +byt = 0x0 +for T in (UTF16String, UTF32String) + try + # Continuation byte not after lead + for byt in 0x80:0xbf + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt])) + end + + # Test lead bytes + for byt in 0xc0:0xff + # Single lead byte at end of string + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt])) + # Lead followed by non-continuation character < 0x80 + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0])) + # Lead followed by non-continuation character > 0xbf + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0xc0])) + end + + # Test overlong 2-byte + for byt in 0x81:0xbf + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xc0,byt])) + end + for byt in 0x80:0xbf + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xc1,byt])) + end + + # Test overlong 3-byte + for byt in 0x80:0x9f + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xe0,byt,0x80])) + end + + # Test overlong 4-byte + for byt in 0x80:0x8f + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80])) + end + + # Test 4-byte > 0x10ffff + for byt in 0x90:0xbf + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80])) + end + for byt in 0xf5:0xf7 + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80])) + end + + # Test 5-byte + for byt in 0xf8:0xfb + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80])) + end + + # Test 6-byte + for byt in 0xfc:0xfd + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80])) + end + + # Test 7-byte + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) + + # Three and above byte sequences + for byt in 0xe0:0xef + # Lead followed by only 1 continuation byte + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80])) + # Lead 
ended by non-continuation character < 0x80 + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0])) + # Lead ended by non-continuation character > 0xbf + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0xc0])) + end + + # 3-byte encoded surrogate character(s) + # Single surrogate + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xed,0xa0,0x80])) + # Not followed by surrogate + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) + # Trailing surrogate first + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) + # Followed by lead surrogate + @test_throws ArgumentError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) + + # Four byte sequences + for byt in 0xf0:0xf4 + # Lead followed by only 2 continuation bytes + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80])) + # Lead followed by non-continuation character < 0x80 + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0])) + # Lead followed by non-continuation character > 0xbf + @test_throws ArgumentError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0])) + end + catch exp ; + println("Error checking $T: $byt") + throw(exp) + end +end From 982cfbc1c626850e73eaa859a89f9b40848c979c Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Sat, 30 May 2015 14:28:58 +0200 Subject: [PATCH 2/3] Make changes based on various reviewers good advice --- base/utf.jl | 291 +++++++++++++++++++++++++--------------------------- 1 file changed, 139 insertions(+), 152 deletions(-) diff --git a/base/utf.jl b/base/utf.jl index b170973406b43..1e70d8a58fc79 100644 --- a/base/utf.jl +++ b/base/utf.jl @@ -23,10 +23,10 @@ const errMsgs = [ "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)", "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)", "invalid UTF-8 sequence, overlong encoding starting at index <<1>> 
(0x<<2>>)", - "not a leading Unicode surrogate character at index <<1>> (0x<<2>>)", - "not a trailing Unicode surrogate character at index <<1>> (0x<<2>>)", - "not a valid Unicode surrogate character at index <<1>> (0x<<2>>", - "missing trailing Unicode surrogate character after index <<1>> (0x<<2>>)", + "not a leading Unicode surrogate codepoint at index <<1>> (0x<<2>>)", + "not a trailing Unicode surrogate codepoint at index <<1>> (0x<<2>>)", + "not a valid Unicode surrogate codepoint at index <<1>> (0x<<2>>", + "missing trailing Unicode surrogate codepoint after index <<1>> (0x<<2>>)", "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)", "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)", "UTF16String data must be NULL-terminated", @@ -87,7 +87,7 @@ const empty_utf32 = UTF32String(UInt32[0]) is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800) is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00) -is_surrogate_char(c::Unsigned) = ((c & ~0x007ff) == 0xd800) +is_surrogate_codepoint(c::Unsigned) = ((c & ~0x007ff) == 0xd800) is_valid_continuation(c) = ((c & 0xc0) == 0x80) function length(s::UTF16String) @@ -105,14 +105,14 @@ function endof(s::UTF16String) d = s.data i = length(d) - 1 i == 0 && return i - return is_surrogate_char(d[i]) ? i-1 : i + return is_surrogate_codepoint(d[i]) ? 
i-1 : i end get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) function next(s::UTF16String, i::Int) ch = s.data[i] - !is_surrogate_char(ch) && return (Char(ch), i+1) + !is_surrogate_codepoint(ch) && return (Char(ch), i+1) # check length, account for terminating \0 i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)) !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch) @@ -160,15 +160,14 @@ const UTF_UNICODE4 = 16 # non-BMP characters present const UTF_SURROGATE = 32 # surrogate pairs present # Get a UTF-8 continuation byte, give error if invalid, and update position and character value -@inline function get_continuation(ch::UInt32, str, pos) - byt::UInt8 = str[pos += 1] +@inline function get_continuation(ch::UInt32, byt::UInt8, pos) !is_valid_continuation(byt) && utf_errfunc(UTF_ERR_CONT, pos, byt) - (ch << 6) | (byt & 0x3f), pos + (ch << 6) | (byt & 0x3f) end #= @doc """ -@brief Validates and calculates number of characters in a string +@brief Validates and calculates number of characters in a UTF-8 encoded vector of UInt8 @param[in] str Vector of UInt8 @param[in] options flags to determine error handling (default 0) @@ -178,9 +177,9 @@ end """ -> =# function check_string_utf8(dat::Vector{UInt8}, options::Integer=0) - local byt::UInt8 - local ch::UInt32, surr::UInt32 - local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0 + local byt::UInt8, ch::UInt32, surr::UInt32 + flags::UInt = 0 + totalchar = num2byte = num3byte = num4byte = 0 pos = 0 len = sizeof(dat) @inbounds while pos < len @@ -191,7 +190,7 @@ function check_string_utf8(dat::Vector{UInt8}, options::Integer=0) if ch < 0xe0 # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff) (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) - ch, pos = get_continuation(ch & 0x3f, dat, pos) + ch = get_continuation(ch & 0x3f, dat[pos += 1], pos) if ch > 0x7f num2byte += 1 flags |= (ch > 0xff) ? 
UTF_UNICODE2 : UTF_LATIN1 @@ -205,16 +204,17 @@ function check_string_utf8(dat::Vector{UInt8}, options::Integer=0) elseif ch < 0xf0 # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff) (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) - ch, pos = get_continuation(ch & 0x0f, dat, pos) - ch, pos = get_continuation(ch, dat, pos) + ch = get_continuation(ch & 0x0f, dat[pos += 1], pos) + ch = get_continuation(ch, dat[pos += 1], pos) # check for surrogate pairs, make sure correct - if is_surrogate_char(ch) + if is_surrogate_codepoint(ch) !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch) # next character *must* be a trailing surrogate character (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch) - byt = dat[pos += 1] ; (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt) - surr, pos = get_continuation(0x0000d, dat, pos) - surr, pos = get_continuation(surr, dat, pos) + byt = dat[pos += 1] + (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt) + surr = get_continuation(0x0000d, dat[pos += 1], pos) + surr = get_continuation(surr, dat[pos += 1], pos) !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr) (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr) flags |= UTF_SURROGATE @@ -230,14 +230,14 @@ function check_string_utf8(dat::Vector{UInt8}, options::Integer=0) elseif ch < 0xf5 # 4-byte UTF-8 sequence (i.e. 
characters > 0xffff) (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) - ch, pos = get_continuation(ch & 0x07, dat, pos) - ch, pos = get_continuation(ch, dat, pos) - ch, pos = get_continuation(ch, dat, pos) + ch = get_continuation(ch & 0x07, dat[pos += 1], pos) + ch = get_continuation(ch, dat[pos += 1], pos) + ch = get_continuation(ch, dat[pos += 1], pos) if ch > 0x10ffff utf_errfunc(UTF_ERR_INVALID, pos-3, ch) elseif ch > 0xffff num4byte += 1 - elseif is_surrogate_char(ch) + elseif is_surrogate_codepoint(ch) utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch) elseif (options & UTF_ACCEPT_LONG) != 0 # This is an overly long encode character @@ -255,12 +255,14 @@ function check_string_utf8(dat::Vector{UInt8}, options::Integer=0) end end end - totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte + num3byte != 0 && (flags |= UTF_UNICODE3) + num4byte != 0 && (flags |= UTF_UNICODE4) + return totalchar, flags, num4byte, num3byte, num2byte end #= @doc """ -@brief Validates and calculates number of characters in a UTF-16 string +@brief Validates and calculates number of characters in a UTF-16 encoded vector of UInt16 @param[in] dat Vector{UInt16} @param[in] options flags to determine error handling (default 0) @@ -271,8 +273,9 @@ end =# function check_string_utf16(dat::Vector{UInt16}, len::Int) local ch::UInt32 - local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0 - local pos = 0 + flags::UInt = 0 + totalchar = num2byte = num3byte = num4byte = 0 + pos = 0 @inbounds while pos < len ch = dat[pos += 1] totalchar += 1 @@ -283,7 +286,7 @@ function check_string_utf16(dat::Vector{UInt16}, len::Int) elseif ch < 0x800 num2byte += 1 flags |= UTF_UNICODE2 - elseif !is_surrogate_char(ch) + elseif !is_surrogate_codepoint(ch) num3byte += 1 elseif is_surrogate_lead(ch) pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) @@ -296,12 +299,14 @@ function check_string_utf16(dat::Vector{UInt16}, len::Int) 
end end end - totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 0 : UTF_UNICODE4), num4byte, num3byte, num2byte + num3byte != 0 && (flags |= UTF_UNICODE3) + num4byte != 0 && (flags |= UTF_UNICODE4) + return totalchar, flags, num4byte, num3byte, num2byte end #= @doc """ -@brief Validates and calculates number of characters in a UTF-32 string +@brief Validates and calculates number of characters in a UTF-32 encoded vector of UInt32 @param[in] dat Vector{UInt32} @param[in] options flags to determine error handling (default 0) @@ -312,8 +317,9 @@ end =# function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0) local ch::UInt32 - local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0 - local pos = 0 + flags::UInt = 0 + totalchar = num2byte = num3byte = num4byte = 0 + pos = 0 @inbounds while pos < len ch = dat[pos += 1] totalchar += 1 @@ -327,7 +333,7 @@ function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0) elseif ch > 0xffff (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) num4byte += 1 - elseif !is_surrogate_char(ch) + elseif !is_surrogate_codepoint(ch) num3byte += 1 elseif is_surrogate_lead(ch) pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) @@ -342,14 +348,17 @@ function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0) end end end - totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 
0 : UTF_UNICODE4), num4byte, num3byte, num2byte + num3byte != 0 && (flags |= UTF_UNICODE3) + num4byte != 0 && (flags |= UTF_UNICODE4) + return totalchar, flags, num4byte, num3byte, num2byte end function check_string_abs(str::AbstractString, options::Integer=0) local ch::UInt32 - local totalchar=0, num2byte=0, num3byte=0, num4byte=0, flags::UInt=0 - local pos = start(str) - local len = endof(str) + flags::UInt = 0 + totalchar = num2byte = num3byte = num4byte = 0 + pos = start(str) + len = endof(str) @inbounds while pos < len ch, pos = next(str, pos) totalchar += 1 @@ -363,7 +372,7 @@ function check_string_abs(str::AbstractString, options::Integer=0) elseif ch > 0xffff (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) num4byte += 1 - elseif !is_surrogate_char(ch) + elseif !is_surrogate_codepoint(ch) num3byte += 1 elseif is_surrogate_lead(ch) pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) @@ -378,84 +387,39 @@ function check_string_abs(str::AbstractString, options::Integer=0) end end end - totalchar, flags | (num3byte == 0 ? 0 : UTF_UNICODE3) | (num4byte == 0 ? 
0 : UTF_UNICODE4), num4byte, num3byte, num2byte + num3byte != 0 && (flags |= UTF_UNICODE3) + num4byte != 0 && (flags |= UTF_UNICODE4) + return totalchar, flags, num4byte, num3byte, num2byte end # Quickly copy and set trailing \0 -macro return_fast_utf_copy(T1, T2, len, dat) - quote - @inbounds return $(esc(T1))(setindex!(copy!(Vector{$(esc(T2))}($(esc(len))), $(esc(dat))), 0, $(esc(len)))) - end +@inline function fast_utf_copy(T::Type{UInt16}, len, dat) + @inbounds return UTF16String(setindex!(copy!(Vector{T}(len), dat), 0, len)) end - -# Get rest of character ch from 2-byte UTF-8 sequence in str, update pos and return character -macro get_utf8_2!(str, pos, ch) - quote - (($(esc(ch)) & 0x1f) << 6) | ($(esc(str))[$(esc(pos)) += 1] & 0x3f) - end +@inline function fast_utf_copy(T::Type{Char}, len, dat) + @inbounds return UTF32String(setindex!(copy!(Vector{T}(len), dat), 0, len)) end -# Get rest of character ch from 3-byte UTF-8 sequence in str, update pos and return character -macro get_utf8_3!(str, pos, ch) - quote - ($(esc(pos)) += 2 ; - (($(esc(ch)) & 0xf) << 12) - | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6) - | ($(esc(str))[$(esc(pos))] & 0x3f)) - end +# Get rest of character ch from 3-byte UTF-8 sequence in str +@inline function get_utf8_3(dat, pos, ch) + @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) end -# Get rest of character ch from 4-byte UTF-8 sequence in str, update pos and return character -macro get_utf8_4!(str, pos, ch) - quote - ($(esc(pos)) += 3 ; - (($(esc(ch)) & 0x7) << 18) - | (UInt32($(esc(str))[$(esc(pos))-2] & 0x3f) << 12) - | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6) - | ($(esc(str))[$(esc(pos))] & 0x3f)) - end +# Get rest of character ch from 4-byte UTF-8 sequence in dat, update pos and return character +@inline function get_utf8_4(dat, pos, ch) + @inbounds return (((ch & 0x7) << 18) + | (UInt32(dat[pos-2] & 0x3f) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) end 
-# Get the trailing surrogate character in UTF-8 from an array, update the position -macro get_utf8_surr!(str, pos) - quote - ($(esc(pos)) += 3 ; - ((UInt32($(esc(str))[$(esc(pos))-2] & 0xf) << 12) - | (UInt32($(esc(str))[$(esc(pos))-1] & 0x3f) << 6) - | ($(esc(str))[$(esc(pos))] & 0x3f))) - end -end - -# Output a character as a 2-byte UTF-8 sequence, update the position -macro output_utf8_2!(buf, out, ch) - quote - $(esc(buf))[$(esc(out)) += 1] = 0xc0 | ($(esc(ch)) >>> 6) - $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f) - end -end -# Output a character as a 3-byte UTF-8 sequence, update the position -macro output_utf8_3!(buf, out, ch) - quote - $(esc(buf))[$(esc(out)) += 1] = 0xe0 | (($(esc(ch)) >>> 12) & 0x3f) - $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 6) & 0x3f) - $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f) - end -end # Output a character as a 4-byte UTF-8 sequence, update the position -macro output_utf8_4!(buf, out, ch) - quote - $(esc(buf))[$(esc(out)) += 1] = 0xf0 | ($(esc(ch)) >>> 18) - $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 12) & 0x3f) - $(esc(buf))[$(esc(out)) += 1] = 0x80 | (($(esc(ch)) >>> 6) & 0x3f) - $(esc(buf))[$(esc(out)) += 1] = 0x80 | ($(esc(ch)) & 0x3f) - end -end - -# Output a UTF-16 surrogate pair, update the position -macro output_utf16_surr!(buf, out, ch) - quote - $(esc(buf))[$(esc(out)) += 1] = UInt16(0xd7c0 + ($(esc(ch)) >>> 10)) - $(esc(buf))[$(esc(out)) += 1] = UInt16(0xdc00 + ($(esc(ch)) & 0x3ff)) +@inline function output_utf8_4(buf, out, ch) + @inbounds begin + buf[out + 1] = 0xf0 | (ch >>> 18) + buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f) + buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out + 4] = 0x80 | (ch & 0x3f) end end @@ -475,11 +439,13 @@ function convert(::Type{UTF16String}, str::AbstractString) buf = Vector{UInt16}(len+num4byte+1) out = 0 @inbounds for ch in str - c = reinterpret(UInt32, ch) + c = UInt32(ch) if c < 0x10000 buf[out += 1] = UInt16(c) else - 
@output_utf16_surr!(buf, out, c) + # output surrogate pair + buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) + buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) end end @inbounds buf[out + 1] = 0 # NULL termination @@ -508,9 +474,10 @@ end #= @doc """ -@brief Converts a UTF-8 encoded string to UTF-16 encoding +@brief Converts a UTF8String to a UTF16String -@param[in] str::Vector{UInt8} +@param[in] ::Type{UTF16String} +@param[in] str::UTF8String @return ::UTF16String @throws ArgumentError @@ -524,11 +491,11 @@ function convert(::Type{UTF16String}, str::UTF8String) len, flags, num4byte = check_string_utf8(dat) len += num4byte buf = Vector{UInt16}(len+1) - buf[len+1] = 0 + @inbounds buf[len+1] = 0 # Optimize case where no characters > 0x7f flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) - out::UInt = 0 - pos::UInt = 0 + out = 0 + pos = 0 @inbounds while out < len ch::UInt32 = dat[pos += 1] # Handle ASCII characters @@ -536,14 +503,18 @@ function convert(::Type{UTF16String}, str::UTF8String) buf[out += 1] = ch # Handle range 0x80-0x7ff elseif ch < 0xe0 - buf[out += 1] = @get_utf8_2!(dat, pos, ch) + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) # Handle range 0x800-0xffff elseif ch < 0xf0 - buf[out += 1] = @get_utf8_3!(dat, pos, ch) + pos += 2 + buf[out += 1] = get_utf8_3(dat, pos, ch) # Handle range 0x10000-0x10ffff else - ch = @get_utf8_4!(dat, pos, ch) - @output_utf16_surr!(buf, out, ch) + pos += 3 + ch = get_utf8_4(dat, pos, ch) + # output surrogate pair + buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) + buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) end end UTF16String(buf) @@ -551,9 +522,10 @@ end #= @doc """ -@brief Reencodes a UTF-16 or UTF-32 encoded string using UTF-8 encoding +@brief Converts a UTF-16 encoded vector of UInt16 to a UTF8String -@param[in] str::Union(Vector{UInt16}, Vector{UInt32}) +@param[in] ::Type{UTF8String} +@param[in] dat::Vector{UInt16} @return ::UTF8String @throws ArgumentError @@ -573,6 +545,7 @@ end @doc """ @brief 
Converts a UTF16String to a UTF8String +@param[in] ::Type{UTF8String} @param[in] str::UTF16String @return ::UTF8String @@ -592,8 +565,9 @@ end #= @doc """ -@brief Encodes a vector of UInt32 to a UTF8String +@brief Encodes a UTF-32 encoded vector of UInt32 to a UTF8String +@param[in] ::Type{UTF8String} @param[in] dat::Vector{UInt32} @return ::UTF8String @@ -614,6 +588,7 @@ end @doc """ @brief Converts a UTF32String to a UTF8String +@param[in] ::Type{UTF8String} @param[in] str::UTF32String @return ::UTF8String @@ -633,7 +608,7 @@ end #= @doc """ -@brief Encodes an already validated vector of UInt16 or UInt32 as UTF-8 +@brief Converts an already validated vector of UInt16 or UInt32 to a UTF8String @param[in] T type (UInt16 or UInt32) @param[in] dat Vector{T} @@ -644,8 +619,8 @@ end =# function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len) buf = Vector{UInt8}(len) - out::UInt = 0 - pos::UInt = 0 + out = 0 + pos = 0 @inbounds while out < len ch::UInt32 = dat[pos += 1] # Handle ASCII characters @@ -653,17 +628,21 @@ function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len) buf[out += 1] = ch # Handle 0x80-0x7ff elseif ch < 0x800 - @output_utf8_2!(buf, out, ch) + buf[out += 1] = 0xc0 | (ch >>> 6) + buf[out += 1] = 0x80 | (ch & 0x3f) # Handle 0x10000-0x10ffff (if input is UInt32) elseif T == UInt32 && ch > 0xffff - @output_utf8_4!(buf, out, ch) + output_utf8_4(buf, out, ch) + out += 4 # Handle surrogate pairs - elseif is_surrogate_char(ch) - ch = get_supplementary(ch, dat[pos += 1]) - @output_utf8_4!(buf, out, ch) + elseif is_surrogate_codepoint(ch) + output_utf8_4(buf, out, get_supplementary(ch, dat[pos += 1])) + out += 4 # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters else - @output_utf8_3!(buf, out, ch) + buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) + buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out += 1] = 0x80 | (ch & 0x3f) end end UTF8String(buf) @@ -671,9 +650,10 @@ end #= """ -@brief Converts a UTF-8 encoded string to UTF-32 
encoding +@brief Converts a UTF8String to a UTF32String -@param[in] dat::Vector{UInt8} +@param[in] ::Type{UTF32String} +@param[in] str::UTF8String @return ::UTF32String @throws ArgumentError @@ -687,11 +667,11 @@ function convert(::Type{UTF32String}, str::UTF8String) len, flags = check_string_utf8(dat) # Optimize case where no characters > 0x7f totlen = len+1 - flags == 0 && @return_fast_utf_copy(UTF32String, Char, totlen, dat) + flags == 0 && return fast_utf_copy(Char, totlen, dat) # has multi-byte UTF-8 sequences buf = Vector{Char}(totlen) @inbounds buf[totlen] = 0 # NULL termination - local ch::UInt32 + local ch::UInt32, surr::UInt32 out = 0 pos = 0 @inbounds while out < len @@ -701,19 +681,25 @@ function convert(::Type{UTF32String}, str::UTF8String) buf[out += 1] = ch # Handle range 0x80-0x7ff elseif ch < 0xe0 - buf[out += 1] = @get_utf8_2!(dat, pos, ch) + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) # Handle range 0x800-0xffff elseif ch < 0xf0 - ch = @get_utf8_3!(dat, pos, ch) + pos += 2 + ch = get_utf8_3(dat, pos, ch) # Handle surrogate pairs (should have been encoded in 4 bytes) if is_surrogate_lead(ch) # Build up 32-bit character from ch and trailing surrogate in next 3 bytes - ch = get_supplementary(ch, @get_utf8_surr!(dat, pos)) + pos += 3 + surr = ((UInt32(dat[pos-2] & 0xf) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) + ch = get_supplementary(ch, surr) end buf[out += 1] = ch # Handle range 0x10000-0x10ffff else - buf[out += 1] = @get_utf8_4!(dat, pos, ch) + pos += 3 + buf[out += 1] = get_utf8_4(dat, pos, ch) end end UTF32String(buf) @@ -723,6 +709,7 @@ end """ @brief Converts a UTF16String to UTF32String +@param[in] ::Type{UTF32String} @param[in] str::UTF16String @return ::UTF32String @@ -753,8 +740,9 @@ end #= """ -@brief Converts a Vector of UInt32 to a UTF16String +@brief Converts a UTF-32 encoded vector of UInt32 to a UTF16String +@param[in] ::Type{UTF16String} @param[in] dat::Vector{UInt32} @return ::UTF16String 
@@ -769,7 +757,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32}) len, flags, num4byte = check_string_utf32(dat, len>>>2) len += num4byte + 1 # optimized path, no surrogates - num4byte == 0 && @return_fast_utf_copy(UTF16String, UInt16, len, dat) + num4byte == 0 && return fast_utf_copy(UInt16, len, dat) return encode_to_utf16(dat, len) end @@ -777,6 +765,7 @@ end """ @brief Converts a UTF32String to UTF16String +@param[in] ::Type{UTF16String} @param[in] str::UTF32String @return ::UTF16String @@ -797,10 +786,10 @@ end #= @doc """ -@brief Encodes an already validated Vector of UInt32 as UTF-16 +@brief Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String -@param[in] dat Vector{UInt32} -@param[in] len length of output in 16-bit words +@param[in] dat::Vector{UInt32} UTF-32 encoded data +@param[in] len length of output in 16-bit words @return ::UTF16String """ -> @@ -828,14 +817,12 @@ utf16(x) = convert(UTF16String, x) function convert(::Type{UTF16String}, str::ASCIIString) dat = str.data - len = length(dat)+1 - @return_fast_utf_copy(UTF16String, UInt16, len, dat) + fast_utf_copy(UInt16, length(dat)+1, dat) end function convert(::Type{UTF32String}, str::ASCIIString) dat = str.data - len = length(dat)+1 - @return_fast_utf_copy(UTF32String, Char, len, dat) + fast_utf_copy(Char, length(dat)+1, dat) end convert(::Type{UTF16String}, str::UTF16String) = str @@ -860,13 +847,13 @@ function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) @inbounds while i < n # check for unpaired surrogates if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1]) i += 2 - elseif is_surrogate_char(data[i]) + elseif is_surrogate_codepoint(data[i]) return false else i += 1 end end - return i > n || !is_surrogate_char(data[i]) + return i > n || !is_surrogate_codepoint(data[i]) end function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) @@ -931,8 +918,8 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char}) convert(T, 
takebuf_string(s)) end -convert(::Type{Array{Char,1}}, s::UTF32String) = s.data -convert(::Type{Array{Char}}, s::UTF32String) = s.data +convert(::Type{Vector{Char}}, str::UTF32String) = str.data +convert(::Type{Array{Char}}, str::UTF32String) = str.data reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) @@ -961,7 +948,7 @@ end function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32})) for i=1:length(str) - @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end + @inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end end return true end From b159907373f507d1b77d3c2fb4de4cbb8f639a18 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Tue, 2 Jun 2015 12:40:51 +0200 Subject: [PATCH 3/3] Reorganize UTF handling files --- base/sysimg.jl | 7 +- base/utf16.jl | 72 ++++++ base/utf32.jl | 33 +++ base/utfcheck.jl | 255 +++++++++++++++++++ base/{utf.jl => utfconvert.jl} | 436 +-------------------------------- base/utferror.jl | 51 ++++ base/utftype.jl | 39 +++ 7 files changed, 457 insertions(+), 436 deletions(-) create mode 100644 base/utf16.jl create mode 100644 base/utf32.jl create mode 100644 base/utfcheck.jl rename base/{utf.jl => utfconvert.jl} (51%) create mode 100644 base/utferror.jl create mode 100644 base/utftype.jl diff --git a/base/sysimg.jl b/base/sysimg.jl index b44340b0216a9..a8a6b1fb706bb 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -84,10 +84,15 @@ include("iterator.jl") include("osutils.jl") # strings & printing +include("utferror.jl") +include("utftype.jl") +include("utfcheck.jl") include("char.jl") include("ascii.jl") include("utf8.jl") -include("utf.jl") +include("utf16.jl") +include("utf32.jl") +include("utfconvert.jl") include("iobuffer.jl") include("string.jl") include("utf8proc.jl") diff --git a/base/utf16.jl b/base/utf16.jl new file mode 100644 index 0000000000000..dd3358f36dd9c --- /dev/null +++ b/base/utf16.jl @@ -0,0 +1,72 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +function length(s::UTF16String) + d = s.data + len = length(d) - 1 + len == 0 && return 0 + cnum = 0 + for i = 1:len + @inbounds cnum += !is_surrogate_trail(d[i]) + end + cnum +end + +function endof(s::UTF16String) + d = s.data + i = length(d) - 1 + i == 0 && return i + return is_surrogate_codepoint(d[i]) ? i-1 : i +end + +get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) + +function next(s::UTF16String, i::Int) + ch = s.data[i] + !is_surrogate_codepoint(ch) && return (Char(ch), i+1) + # check length, account for terminating \0 + i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)) + !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch) + ct = s.data[i+1] + !is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i, ch) + Char(get_supplementary(ch, ct)), i+2 +end + +function reverseind(s::UTF16String, i::Integer) + j = length(s.data) - i + return is_surrogate_trail(s.data[j]) ? 
j-1 : j +end + +lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator + +function reverse(s::UTF16String) + d = s.data + out = similar(d) + out[end] = 0 # NULL termination + n = length(d) + @inbounds for i = 1:n-1 + ch = d[n-i] + if is_surrogate_lead(ch) + out[i],out[i-1] = out[i-1],ch + else + out[i] = ch + end + end + UTF16String(out) +end + +sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) + +function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) + i = 1 + n = length(data) # this may include NULL termination; that's okay + @inbounds while i < n # check for unpaired surrogates + if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1]) + i += 2 + elseif is_surrogate_codepoint(data[i]) + return false + else + i += 1 + end + end + return i > n || !is_surrogate_codepoint(data[i]) +end diff --git a/base/utf32.jl b/base/utf32.jl new file mode 100644 index 0000000000000..444b4d1bab4fe --- /dev/null +++ b/base/utf32.jl @@ -0,0 +1,33 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +# UTF-32 basic functions +next(s::UTF32String, i::Int) = (s.data[i], i+1) +endof(s::UTF32String) = length(s.data) - 1 +length(s::UTF32String) = length(s.data) - 1 + +reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) + +sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) + +function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32})) + for i=1:length(str) + @inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end + end + return true +end +isvalid(str::Vector{Char}) = isvalid(UTF32String, str) + +function map(f, s::UTF32String) + d = s.data + out = similar(d) + out[end] = 0 + + @inbounds for i = 1:(length(d)-1) + c2 = f(d[i]) + if !isa(c2, Char) + throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead")) + end + out[i] = (c2::Char) + end + UTF32String(out) +end diff --git a/base/utfcheck.jl b/base/utfcheck.jl new file mode 100644 index 0000000000000..083ca2f09f299 --- /dev/null +++ b/base/utfcheck.jl @@ -0,0 +1,255 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +# Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings, +# and also to return information necessary to convert to other encodings + +is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800) +is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00) +is_surrogate_codepoint(c::Unsigned) = ((c & ~0x007ff) == 0xd800) +is_valid_continuation(c) = ((c & 0xc0) == 0x80) + +# Options for check_string_* functions + +const UTF_NO_LONG_NULL = 1 # don't accept 0xc0 0x80 for '\0' +const UTF_NO_SURROGATES = 2 # don't accept surrogate pairs in UTF-8/UTF-32 +const UTF_ACCEPT_LONG = 4 # accept long encodings (other than long null in UTF-8) + +const UTF_LONG = 1 # Long encodings are present +const UTF_LATIN1 = 2 # characters in range 0x80-0xFF present +const UTF_UNICODE2 = 4 # characters in range 0x100-0x7ff present +const UTF_UNICODE3 = 8 # characters in range 0x800-0xd7ff, 0xe000-0xffff +const UTF_UNICODE4 = 16 # non-BMP characters present +const UTF_SURROGATE = 32 # surrogate pairs present + +# Get a UTF-8 continuation byte, give error if invalid, and update position and character value +@inline function get_continuation(ch::UInt32, byt::UInt8, pos) + !is_valid_continuation(byt) && utf_errfunc(UTF_ERR_CONT, pos, byt) + (ch << 6) | (byt & 0x3f) +end + +#= +@doc """ +@brief Validates and calculates number of characters in a UTF-8 encoded vector of UInt8 + +@param[in] str Vector of UInt8 +@param[in] options flags to determine error handling (default 0) + +@return (total characters, flags, 4-byte, 3-byte, 2-byte) +@throws ArgumentError +""" -> +=# +function check_string_utf8(dat::Vector{UInt8}, options::Integer=0) + local byt::UInt8, ch::UInt32, surr::UInt32 + flags::UInt = 0 + totalchar = num2byte = num3byte = num4byte = 0 + pos = 0 + len = sizeof(dat) + @inbounds while pos < len + ch = dat[pos += 1] + totalchar += 1 + if ch > 0x7f + # Check UTF-8 encoding + if ch < 0xe0 + # 2-byte UTF-8 sequence (i.e. 
characters 0x80-0x7ff) + (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) + ch = get_continuation(ch & 0x3f, dat[pos += 1], pos) + if ch > 0x7f + num2byte += 1 + flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1 + elseif (options & UTF_ACCEPT_LONG) != 0 + flags |= UTF_LONG + elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0) + flags |= UTF_LONG + else + utf_errfunc(UTF_ERR_LONG, pos, ch) + end + elseif ch < 0xf0 + # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff) + (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) + ch = get_continuation(ch & 0x0f, dat[pos += 1], pos) + ch = get_continuation(ch, dat[pos += 1], pos) + # check for surrogate pairs, make sure correct + if is_surrogate_codepoint(ch) + !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch) + # next character *must* be a trailing surrogate character + (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch) + byt = dat[pos += 1] + (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt) + surr = get_continuation(0x0000d, dat[pos += 1], pos) + surr = get_continuation(surr, dat[pos += 1], pos) + !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr) + (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr) + flags |= UTF_SURROGATE + num4byte += 1 + elseif ch > 0x07ff + num3byte += 1 + elseif (options & UTF_ACCEPT_LONG) != 0 + flags |= UTF_LONG + num2byte += 1 + else + utf_errfunc(UTF_ERR_LONG, pos-2, ch) + end + elseif ch < 0xf5 + # 4-byte UTF-8 sequence (i.e. 
characters > 0xffff) + (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) + ch = get_continuation(ch & 0x07, dat[pos += 1], pos) + ch = get_continuation(ch, dat[pos += 1], pos) + ch = get_continuation(ch, dat[pos += 1], pos) + if ch > 0x10ffff + utf_errfunc(UTF_ERR_INVALID, pos-3, ch) + elseif ch > 0xffff + num4byte += 1 + elseif is_surrogate_codepoint(ch) + utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch) + elseif (options & UTF_ACCEPT_LONG) != 0 + # This is an overly long encode character + flags |= UTF_LONG + if ch > 0x7ff + num3byte += 1 + elseif ch > 0x7f + num2byte += 1 + end + else + utf_errfunc(UTF_ERR_LONG, pos-2, ch) + end + else + utf_errfunc(UTF_ERR_INVALID, pos, ch) + end + end + end + num3byte != 0 && (flags |= UTF_UNICODE3) + num4byte != 0 && (flags |= UTF_UNICODE4) + return totalchar, flags, num4byte, num3byte, num2byte +end + +#= +@doc """ +@brief Validates and calculates number of characters in a UTF-16 encoded vector of UInt16 + +@param[in] dat Vector{UInt16} +@param[in] options flags to determine error handling (default 0) + +@return (total characters, flags, 4-byte, 3-byte, 2-byte) +@throws ArgumentError +""" -> +=# +function check_string_utf16(dat::Vector{UInt16}, len::Int) + local ch::UInt32 + flags::UInt = 0 + totalchar = num2byte = num3byte = num4byte = 0 + pos = 0 + @inbounds while pos < len + ch = dat[pos += 1] + totalchar += 1 + if ch > 0x7f + if ch < 0x100 + num2byte += 1 + flags |= UTF_LATIN1 + elseif ch < 0x800 + num2byte += 1 + flags |= UTF_UNICODE2 + elseif !is_surrogate_codepoint(ch) + num3byte += 1 + elseif is_surrogate_lead(ch) + pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) + # next character *must* be a trailing surrogate character + ch = dat[pos += 1] + !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) + num4byte += 1 + else + utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) + end + end + end + num3byte != 0 && (flags |= UTF_UNICODE3) + num4byte != 0 && (flags |= UTF_UNICODE4) + return totalchar, flags, 
num4byte, num3byte, num2byte +end + +#= +@doc """ +@brief Validates and calculates number of characters in a UTF-32 encoded vector of UInt32 + +@param[in] dat Vector{UInt32} +@param[in] options flags to determine error handling (default 0) + +@return (total characters, flags, 4-byte, 3-byte, 2-byte) +@throws ArgumentError +""" -> +=# +function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0) + local ch::UInt32 + flags::UInt = 0 + totalchar = num2byte = num3byte = num4byte = 0 + pos = 0 + @inbounds while pos < len + ch = dat[pos += 1] + totalchar += 1 + if ch > 0x7f + if ch < 0x100 + num2byte += 1 + flags |= UTF_LATIN1 + elseif ch < 0x800 + num2byte += 1 + flags |= UTF_UNICODE2 + elseif ch > 0xffff + (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) + num4byte += 1 + elseif !is_surrogate_codepoint(ch) + num3byte += 1 + elseif is_surrogate_lead(ch) + pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) + # next character *must* be a trailing surrogate character + ch = dat[pos += 1] + !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) + num4byte += 1 + (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch) + flags |= UTF_SURROGATE + else + utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) + end + end + end + num3byte != 0 && (flags |= UTF_UNICODE3) + num4byte != 0 && (flags |= UTF_UNICODE4) + return totalchar, flags, num4byte, num3byte, num2byte +end + +function check_string_abs(str::AbstractString, options::Integer=0) + local ch::UInt32 + flags::UInt = 0 + totalchar = num2byte = num3byte = num4byte = 0 + pos = start(str) + len = endof(str) + @inbounds while pos < len + ch, pos = next(str, pos) + totalchar += 1 + if ch > 0x7f + if ch < 0x100 + num2byte += 1 + flags |= UTF_LATIN1 + elseif ch < 0x800 + num2byte += 1 + flags |= UTF_UNICODE2 + elseif ch > 0xffff + (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) + num4byte += 1 + elseif !is_surrogate_codepoint(ch) + num3byte += 1 + elseif 
is_surrogate_lead(ch) + pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) + # next character *must* be a trailing surrogate character + ch, pos = next(str, pos) + !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) + num4byte += 1 + (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch) + flags |= UTF_SURROGATE + else + utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) + end + end + end + num3byte != 0 && (flags |= UTF_UNICODE3) + num4byte != 0 && (flags |= UTF_UNICODE4) + return totalchar, flags, num4byte, num3byte, num2byte +end diff --git a/base/utf.jl b/base/utfconvert.jl similarity index 51% rename from base/utf.jl rename to base/utfconvert.jl index 1e70d8a58fc79..60b08a8da7c9e 100644 --- a/base/utf.jl +++ b/base/utfconvert.jl @@ -1,396 +1,6 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license -#= -@doc """ -@brief Error messages for Unicode / UTF support -""" -> -=# - -const UTF_ERR_SHORT = 1 -const UTF_ERR_CONT = 2 -const UTF_ERR_LONG = 3 -const UTF_ERR_NOT_LEAD = 4 -const UTF_ERR_NOT_TRAIL = 5 -const UTF_ERR_NOT_SURROGATE = 6 -const UTF_ERR_MISSING_SURROGATE = 7 -const UTF_ERR_INVALID = 8 -const UTF_ERR_SURROGATE = 9 -const UTF_ERR_NULL_16_TERMINATE = 10 -const UTF_ERR_NULL_32_TERMINATE = 11 -const UTF_ERR_MAX = 11 - -const errMsgs = [ - "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)", - "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)", - "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)", - "not a leading Unicode surrogate codepoint at index <<1>> (0x<<2>>)", - "not a trailing Unicode surrogate codepoint at index <<1>> (0x<<2>>)", - "not a valid Unicode surrogate codepoint at index <<1>> (0x<<2>>", - "missing trailing Unicode surrogate codepoint after index <<1>> (0x<<2>>)", - "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)", - "surrogate encoding not 
allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)", - "UTF16String data must be NULL-terminated", - "UTF32String data must be NULL-terminated" -] -#= -@doc """ -@brief Throws ArgumentError with information about the specific error, location, and character - -@param[in] errcode Error code for Unicode error (one of UTF_ERR_*) -@param[in] charpos Index of invalid byte or character -@param[in] invchar Invalid byte or character - -@throws never returns, always throws ArgumentError -""" -> -=# -function utf_errfunc(errcode::Integer, charpos, invchar) - if errcode < 1 || errcode > UTF_ERR_MAX - throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar")) - end - throw(ArgumentError(replace(replace(errMsgs[errcode],"<<1>>",string(charpos)),"<<2>>",hex(invchar)))) -end - -#= -@doc """ -@brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order -""" -> -=# -immutable UTF16String <: AbstractString - data::Vector{UInt16} # includes 16-bit NULL termination after string chars - function UTF16String(data::Vector{UInt16}) - if length(data) < 1 || data[end] != 0 - utf_errfunc(UTF_ERR_NULL_16_TERMINATE, 0, 0) - end - new(data) - end -end - -#= -@doc """ -@brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order -""" -> -=# -immutable UTF32String <: DirectIndexString - data::Vector{Char} # includes 32-bit NULL termination after string chars - - function UTF32String(data::Vector{Char}) - if length(data) < 1 || data[end] != Char(0) - utf_errfunc(UTF_ERR_NULL_32_TERMINATE, 0, 0) - end - new(data) - end -end -UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data)) - -const empty_utf16 = UTF16String(UInt16[0]) -const empty_utf32 = UTF32String(UInt32[0]) - -is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800) -is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00) -is_surrogate_codepoint(c::Unsigned) = ((c & ~0x007ff) == 0xd800) 
-is_valid_continuation(c) = ((c & 0xc0) == 0x80) - -function length(s::UTF16String) - d = s.data - len = length(d) - 1 - len == 0 && return 0 - cnum = 0 - for i = 1:len - @inbounds cnum += !is_surrogate_trail(d[i]) - end - cnum -end - -function endof(s::UTF16String) - d = s.data - i = length(d) - 1 - i == 0 && return i - return is_surrogate_codepoint(d[i]) ? i-1 : i -end - -get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) - -function next(s::UTF16String, i::Int) - ch = s.data[i] - !is_surrogate_codepoint(ch) && return (Char(ch), i+1) - # check length, account for terminating \0 - i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)) - !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch) - ct = s.data[i+1] - !is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i, ch) - Char(get_supplementary(ch, ct)), i+2 -end - -function reverseind(s::UTF16String, i::Integer) - j = length(s.data) - i - return is_surrogate_trail(s.data[j]) ? 
j-1 : j -end - -lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator - -function reverse(s::UTF16String) - d = s.data - out = similar(d) - out[end] = 0 # NULL termination - n = length(d) - @inbounds for i = 1:n-1 - ch = d[n-i] - if is_surrogate_lead(ch) - out[i],out[i-1] = out[i-1],ch - else - out[i] = ch - end - end - UTF16String(out) -end - -next(s::UTF32String, i::Int) = (s.data[i], i+1) -endof(s::UTF32String) = length(s.data) - 1 -length(s::UTF32String) = length(s.data) - 1 - -const UTF_NO_LONG_NULL = 1 # don't accept 0xc0 0x80 for '\0' -const UTF_NO_SURROGATES = 2 # don't accept surrogate pairs in UTF-8/UTF-32 -const UTF_ACCEPT_LONG = 4 # accept long encodings (other than long null in UTF-8) - -const UTF_LONG = 1 # Long encodings are present -const UTF_LATIN1 = 2 # characters in range 0x80-0xFF present -const UTF_UNICODE2 = 4 # characters in range 0x100-0x7ff present -const UTF_UNICODE3 = 8 # characters in range 0x800-0xd7ff, 0xe000-0xffff -const UTF_UNICODE4 = 16 # non-BMP characters present -const UTF_SURROGATE = 32 # surrogate pairs present - -# Get a UTF-8 continuation byte, give error if invalid, and update position and character value -@inline function get_continuation(ch::UInt32, byt::UInt8, pos) - !is_valid_continuation(byt) && utf_errfunc(UTF_ERR_CONT, pos, byt) - (ch << 6) | (byt & 0x3f) -end - -#= -@doc """ -@brief Validates and calculates number of characters in a UTF-8 encoded vector of UInt8 - -@param[in] str Vector of UInt8 -@param[in] options flags to determine error handling (default 0) - -@return (total characters, flags, 4-byte, 3-byte, 2-byte) -@throws ArgumentError -""" -> -=# -function check_string_utf8(dat::Vector{UInt8}, options::Integer=0) - local byt::UInt8, ch::UInt32, surr::UInt32 - flags::UInt = 0 - totalchar = num2byte = num3byte = num4byte = 0 - pos = 0 - len = sizeof(dat) - @inbounds while pos < len - ch = dat[pos += 1] - totalchar += 1 - if ch > 0x7f - # Check UTF-8 encoding - if ch < 0xe0 - # 2-byte 
UTF-8 sequence (i.e. characters 0x80-0x7ff) - (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) - ch = get_continuation(ch & 0x3f, dat[pos += 1], pos) - if ch > 0x7f - num2byte += 1 - flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1 - elseif (options & UTF_ACCEPT_LONG) != 0 - flags |= UTF_LONG - elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0) - flags |= UTF_LONG - else - utf_errfunc(UTF_ERR_LONG, pos, ch) - end - elseif ch < 0xf0 - # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff) - (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) - ch = get_continuation(ch & 0x0f, dat[pos += 1], pos) - ch = get_continuation(ch, dat[pos += 1], pos) - # check for surrogate pairs, make sure correct - if is_surrogate_codepoint(ch) - !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch) - # next character *must* be a trailing surrogate character - (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch) - byt = dat[pos += 1] - (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt) - surr = get_continuation(0x0000d, dat[pos += 1], pos) - surr = get_continuation(surr, dat[pos += 1], pos) - !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr) - (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr) - flags |= UTF_SURROGATE - num4byte += 1 - elseif ch > 0x07ff - num3byte += 1 - elseif (options & UTF_ACCEPT_LONG) != 0 - flags |= UTF_LONG - num2byte += 1 - else - utf_errfunc(UTF_ERR_LONG, pos-2, ch) - end - elseif ch < 0xf5 - # 4-byte UTF-8 sequence (i.e. 
characters > 0xffff) - (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch) - ch = get_continuation(ch & 0x07, dat[pos += 1], pos) - ch = get_continuation(ch, dat[pos += 1], pos) - ch = get_continuation(ch, dat[pos += 1], pos) - if ch > 0x10ffff - utf_errfunc(UTF_ERR_INVALID, pos-3, ch) - elseif ch > 0xffff - num4byte += 1 - elseif is_surrogate_codepoint(ch) - utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch) - elseif (options & UTF_ACCEPT_LONG) != 0 - # This is an overly long encode character - flags |= UTF_LONG - if ch > 0x7ff - num3byte += 1 - elseif ch > 0x7f - num2byte += 1 - end - else - utf_errfunc(UTF_ERR_LONG, pos-2, ch) - end - else - utf_errfunc(UTF_ERR_INVALID, pos, ch) - end - end - end - num3byte != 0 && (flags |= UTF_UNICODE3) - num4byte != 0 && (flags |= UTF_UNICODE4) - return totalchar, flags, num4byte, num3byte, num2byte -end - -#= -@doc """ -@brief Validates and calculates number of characters in a UTF-16 encoded vector of UInt16 - -@param[in] dat Vector{UInt16} -@param[in] options flags to determine error handling (default 0) - -@return (total characters, flags, 4-byte, 3-byte, 2-byte) -@throws ArgumentError -""" -> -=# -function check_string_utf16(dat::Vector{UInt16}, len::Int) - local ch::UInt32 - flags::UInt = 0 - totalchar = num2byte = num3byte = num4byte = 0 - pos = 0 - @inbounds while pos < len - ch = dat[pos += 1] - totalchar += 1 - if ch > 0x7f - if ch < 0x100 - num2byte += 1 - flags |= UTF_LATIN1 - elseif ch < 0x800 - num2byte += 1 - flags |= UTF_UNICODE2 - elseif !is_surrogate_codepoint(ch) - num3byte += 1 - elseif is_surrogate_lead(ch) - pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) - # next character *must* be a trailing surrogate character - ch = dat[pos += 1] - !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) - num4byte += 1 - else - utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) - end - end - end - num3byte != 0 && (flags |= UTF_UNICODE3) - num4byte != 0 && (flags |= UTF_UNICODE4) - return totalchar, flags, 
num4byte, num3byte, num2byte -end - -#= -@doc """ -@brief Validates and calculates number of characters in a UTF-32 encoded vector of UInt32 - -@param[in] dat Vector{UInt32} -@param[in] options flags to determine error handling (default 0) - -@return (total characters, flags, 4-byte, 3-byte, 2-byte) -@throws ArgumentError -""" -> -=# -function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0) - local ch::UInt32 - flags::UInt = 0 - totalchar = num2byte = num3byte = num4byte = 0 - pos = 0 - @inbounds while pos < len - ch = dat[pos += 1] - totalchar += 1 - if ch > 0x7f - if ch < 0x100 - num2byte += 1 - flags |= UTF_LATIN1 - elseif ch < 0x800 - num2byte += 1 - flags |= UTF_UNICODE2 - elseif ch > 0xffff - (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) - num4byte += 1 - elseif !is_surrogate_codepoint(ch) - num3byte += 1 - elseif is_surrogate_lead(ch) - pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) - # next character *must* be a trailing surrogate character - ch = dat[pos += 1] - !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) - num4byte += 1 - (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch) - flags |= UTF_SURROGATE - else - utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) - end - end - end - num3byte != 0 && (flags |= UTF_UNICODE3) - num4byte != 0 && (flags |= UTF_UNICODE4) - return totalchar, flags, num4byte, num3byte, num2byte -end - -function check_string_abs(str::AbstractString, options::Integer=0) - local ch::UInt32 - flags::UInt = 0 - totalchar = num2byte = num3byte = num4byte = 0 - pos = start(str) - len = endof(str) - @inbounds while pos < len - ch, pos = next(str, pos) - totalchar += 1 - if ch > 0x7f - if ch < 0x100 - num2byte += 1 - flags |= UTF_LATIN1 - elseif ch < 0x800 - num2byte += 1 - flags |= UTF_UNICODE2 - elseif ch > 0xffff - (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) - num4byte += 1 - elseif !is_surrogate_codepoint(ch) - num3byte += 1 - elseif 
is_surrogate_lead(ch) - pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch) - # next character *must* be a trailing surrogate character - ch, pos = next(str, pos) - !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) - num4byte += 1 - (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch) - flags |= UTF_SURROGATE - else - utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) - end - end - end - num3byte != 0 && (flags |= UTF_UNICODE3) - num4byte != 0 && (flags |= UTF_UNICODE4) - return totalchar, flags, num4byte, num3byte, num2byte -end +# Functions to convert to different UTF encodings # Quickly copy and set trailing \0 @inline function fast_utf_copy(T::Type{UInt16}, len, dat) @@ -837,25 +447,9 @@ convert(::Type{UTF32String}, str::UTF32String) = str convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) -sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) = convert(Ptr{T}, pointer(s)) -function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) - i = 1 - n = length(data) # this may include NULL termination; that's okay - @inbounds while i < n # check for unpaired surrogates - if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1]) - i += 2 - elseif is_surrogate_codepoint(data[i]) - return false - else - i += 1 - end - end - return i > n || !is_surrogate_codepoint(data[i]) -end - function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data")) len = length(data) @@ -921,9 +515,6 @@ end convert(::Type{Vector{Char}}, str::UTF32String) = str.data convert(::Type{Array{Char}}, str::UTF32String) = str.data -reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) - -sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) = convert(Ptr{T}, pointer(s)) @@ -946,16 +537,6 
@@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) UTF32String(d) end -function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32})) - for i=1:length(str) - @inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end - end - return true -end -isvalid(str::Vector{Char}) = isvalid(UTF32String, str) -isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data) -isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data) - utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len)) utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len) function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32})) @@ -963,18 +544,3 @@ function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32})) while unsafe_load(p, len+1) != 0; len += 1; end utf32(p, len) end - -function map(f, s::UTF32String) - d = s.data - out = similar(d) - out[end] = 0 - - @inbounds for i = 1:(length(d)-1) - c2 = f(d[i]) - if !isa(c2, Char) - throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead")) - end - out[i] = (c2::Char) - end - UTF32String(out) -end diff --git a/base/utferror.jl b/base/utferror.jl new file mode 100644 index 0000000000000..796a12187dd3c --- /dev/null +++ b/base/utferror.jl @@ -0,0 +1,51 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +#= +@doc """ +@brief Error codes (UTF_ERR_*) and message templates for Unicode / UTF support; in a template <<1>> is replaced by the position and <<2>> by the hex value of the invalid byte or character +""" -> +=# + +const UTF_ERR_SHORT = 1 +const UTF_ERR_CONT = 2 +const UTF_ERR_LONG = 3 +const UTF_ERR_NOT_LEAD = 4 +const UTF_ERR_NOT_TRAIL = 5 +const UTF_ERR_NOT_SURROGATE = 6 +const UTF_ERR_MISSING_SURROGATE = 7 +const UTF_ERR_INVALID = 8 +const UTF_ERR_SURROGATE = 9 +const UTF_ERR_NULL_16_TERMINATE = 10 +const UTF_ERR_NULL_32_TERMINATE = 11 +const UTF_ERR_MAX = 11 # upper bound checked by utf_errfunc before indexing errMsgs + +const errMsgs = [ + "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)", + "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)", + "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)", + "not a leading Unicode surrogate codepoint at index <<1>> (0x<<2>>)", + "not a trailing Unicode surrogate codepoint at index <<1>> (0x<<2>>)", + "not a valid Unicode surrogate codepoint at index <<1>> (0x<<2>>)", + "missing trailing Unicode surrogate codepoint after index <<1>> (0x<<2>>)", + "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)", + "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)", + "UTF16String data must be NULL-terminated", + "UTF32String data must be NULL-terminated" +] +#= +@doc """ +@brief Throws ArgumentError built from the errMsgs template selected by errcode, with the position and the hex value of the invalid byte or character filled in + +@param[in] errcode Error code for Unicode error (one of UTF_ERR_*, i.e. 1 through UTF_ERR_MAX; any other value throws an ArgumentError reporting the invalid code) +@param[in] charpos Index of invalid byte or character (substituted for <<1>> via string()) +@param[in] invchar Invalid byte or character (substituted for <<2>> via hex()) + +@throws never returns, always throws ArgumentError +""" -> +=# +function utf_errfunc(errcode::Integer, charpos, invchar) + if errcode < 1 || errcode > UTF_ERR_MAX + throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar")) + end + throw(ArgumentError(replace(replace(errMsgs[errcode],"<<1>>",string(charpos)),"<<2>>",hex(invchar)))) +end diff --git
a/base/utftype.jl b/base/utftype.jl new file mode 100644 index 0000000000000..019e496b1cb46 --- /dev/null +++ b/base/utftype.jl @@ -0,0 +1,39 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +#= +@doc """ +@brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order; the inner constructor calls utf_errfunc(UTF_ERR_NULL_16_TERMINATE, 0, 0) when data is empty or its last element is not 0 +""" -> +=# +immutable UTF16String <: AbstractString + data::Vector{UInt16} # includes 16-bit NULL termination after string chars + function UTF16String(data::Vector{UInt16}) + if length(data) < 1 || data[end] != 0 # empty vector or missing terminator + utf_errfunc(UTF_ERR_NULL_16_TERMINATE, 0, 0) # position and value are both passed as 0 + end + new(data) # stores the given vector itself, no copy is made here + end +end + +#= +@doc """ +@brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order; the inner constructor calls utf_errfunc(UTF_ERR_NULL_32_TERMINATE, 0, 0) when data is empty or its last element is not Char(0) +""" -> +=# +immutable UTF32String <: DirectIndexString + data::Vector{Char} # includes 32-bit NULL termination after string chars + + function UTF32String(data::Vector{Char}) + if length(data) < 1 || data[end] != Char(0) # empty vector or missing terminator + utf_errfunc(UTF_ERR_NULL_32_TERMINATE, 0, 0) # position and value are both passed as 0 + end + new(data) # stores the given vector itself, no copy is made here + end +end +UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data)) # reinterprets the UInt32 vector as Char, then uses the checked constructor above + +const empty_utf16 = UTF16String(UInt16[0]) # data is only the NULL terminator +const empty_utf32 = UTF32String(UInt32[0]) # data is only the NULL terminator + +isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data) # forwards to isvalid on the string's data vector +isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data) # same check when the type is passed explicitly