Rename the is_valid_* predicates to isvalid methods (#11241)

Replaces is_valid_char, is_valid_ascii, is_valid_utf8, is_valid_utf16 and
is_valid_utf32 with methods of isvalid(T, value) / isvalid(value).  The old
names are kept as deprecations in base/deprecated.jl and dropped from
base/exports.jl; the manual, the stdlib reference and test/strings.jl are
updated to the new spelling.

Review changes relative to the submitted patch:
  * base/utf8proc.jl: isvalid(Char, ch::Integer) returns false for negative
    input instead of converting it with Unsigned(ch) first (a validity
    predicate should answer, not raise).  Covered by a new (-1, false) case.
  * base/utf32.jl: add isvalid(UTF32String, ::UTF32String), mirroring the
    UTF16String method and matching what doc/stdlib/strings.rst documents.
    Covered by one new test.
  * test/strings.jl: the UTF-32 edge table now uses 0x10000 (the boundary
    also used in the Char table) instead of 0x100000.
  * test/strings.jl: the "seven-byte sequences" test now passes the lead
    byte 0xfe plus six continuation bytes (it previously passed six bytes
    in total).
  * doc/stdlib/strings.rst: terminate the sentences so they do not run
    together, and say that Char values may be given as any integer.

Hunk headers are adjusted for the two added lines (utf32.jl, test/strings.jl).
The post-image blob ids on the "index" lines of the files touched by the
review changes are those of the submitted patch and are therefore stale.
NOTE(review): leading whitespace of context lines was reconstructed; confirm
it against the target tree before applying.

diff --git a/base/ascii.jl b/base/ascii.jl
index da70742cc61d6..940f957f8e948 100644
--- a/base/ascii.jl
+++ b/base/ascii.jl
@@ -100,7 +100,7 @@ ascii(x) = convert(ASCIIString, x)
 convert(::Type{ASCIIString}, s::ASCIIString) = s
 convert(::Type{ASCIIString}, s::UTF8String) = ascii(s.data)
 convert(::Type{ASCIIString}, a::Vector{UInt8}) = begin
-    is_valid_ascii(a) || throw(ArgumentError("invalid ASCII sequence"))
+    isvalid(ASCIIString,a) || throw(ArgumentError("invalid ASCII sequence"))
     return ASCIIString(a)
 end
 
diff --git a/base/deprecated.jl b/base/deprecated.jl
index 4d2c645230bb4..c10cbd3185041 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -443,3 +443,16 @@ export float32_isvalid, float64_isvalid
 @deprecate (&)(x::Char, y::Char) Char(UInt32(x) & UInt32(y))
 @deprecate (|)(x::Char, y::Char) Char(UInt32(x) | UInt32(y))
 @deprecate ($)(x::Char, y::Char) Char(UInt32(x) $ UInt32(y))
+
+# 11241
+
+@deprecate is_valid_char(ch::Char) isvalid(ch)
+@deprecate is_valid_char(ch::Union(Unsigned, Integer)) isvalid(Char, ch)
+@deprecate is_valid_ascii(str::ASCIIString) isvalid(str)
+@deprecate is_valid_ascii(str::Union(AbstractArray{UInt8}, UTF8String)) isvalid(ASCIIString, str)
+@deprecate is_valid_utf8(str::UTF8String) isvalid(str)
+@deprecate is_valid_utf8(str::Union(AbstractArray{UInt8}, ASCIIString)) isvalid(UTF8String, str)
+@deprecate is_valid_utf16(str::UTF16String) isvalid(str)
+@deprecate is_valid_utf16(str::AbstractArray{UInt16}) isvalid(UTF16String, str)
+@deprecate is_valid_utf32(str::UTF32String) isvalid(str)
+@deprecate is_valid_utf32(str::AbstractArray{UInt32}) isvalid(UTF32String, str)
diff --git a/base/exports.jl b/base/exports.jl
index c597161bd1d62..3ebf2be2733d2 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -820,11 +820,6 @@ export
     ind2chr,
     info,
     is_assigned_char,
-    is_valid_ascii,
-    is_valid_char,
-    is_valid_utf8,
-    is_valid_utf16,
-    is_valid_utf32,
     isalnum,
     isalpha,
     isascii,
diff --git a/base/io.jl b/base/io.jl
index c8023d7d77df3..7d0eb16d67c75 100644
--- a/base/io.jl
+++ b/base/io.jl
@@ -246,7 +246,7 @@ end
 
 function readall(s::IO)
     b = readbytes(s)
-    return is_valid_ascii(b) ? ASCIIString(b) : UTF8String(b)
+    return isvalid(ASCIIString, b) ? ASCIIString(b) : UTF8String(b)
 end
 
 readall(filename::AbstractString) = open(readall, filename)
diff --git a/base/string.jl b/base/string.jl
index 8a7e83f4e0cbd..7295a231eea89 100644
--- a/base/string.jl
+++ b/base/string.jl
@@ -968,8 +968,8 @@ byte_string_classify(s::ByteString) = byte_string_classify(s.data)
     # 1: valid ASCII
     # 2: valid UTF-8
 
-is_valid_ascii(s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) == 1
-is_valid_utf8(s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) != 0
+isvalid(::Type{ASCIIString}, s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) == 1
+isvalid(::Type{UTF8String}, s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) != 0
 
 ## multiline strings ##
 
diff --git a/base/utf16.jl b/base/utf16.jl
index 2a6ec9b571167..2111a3f035f85 100644
--- a/base/utf16.jl
+++ b/base/utf16.jl
@@ -95,7 +95,8 @@ sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
 unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
     convert(Ptr{T}, pointer(s))
 
-function is_valid_utf16(data::AbstractArray{UInt16})
+isvalid(::Type{UTF16String}, str::UTF16String) = isvalid(UTF16String, str.data)
+function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
     i = 1
     n = length(data) # this may include NULL termination; that's okay
     while i < n # check for unpaired surrogates
@@ -110,10 +111,8 @@ function is_valid_utf16(data::AbstractArray{UInt16})
     return i > n || !utf16_is_surrogate(data[i])
 end
 
-is_valid_utf16(s::UTF16String) = is_valid_utf16(s.data)
-
 function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
-    !is_valid_utf16(data) && throw(ArgumentError("invalid UTF16 data"))
+    !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
     len = length(data)
     d = Array(UInt16, len + 1)
     d[end] = 0 # NULL terminate
@@ -144,7 +143,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
         copy!(d,1, data,1, length(data)) # assume native byte order
     end
     d[end] = 0 # NULL terminate
-    !is_valid_utf16(d) && throw(ArgumentError("invalid UTF16 data"))
+    !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
     UTF16String(d)
 end
 
diff --git a/base/utf32.jl b/base/utf32.jl
index 5415b222888a7..306abcb24a95d 100644
--- a/base/utf32.jl
+++ b/base/utf32.jl
@@ -92,13 +92,15 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
     UTF32String(d)
 end
 
-function is_valid_utf32(s::Union(Vector{Char}, Vector{UInt32}))
-    for i=1:length(s)
-        @inbounds if !is_valid_char(reinterpret(UInt32, s[i])) ; return false ; end
+function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
+    for i=1:length(str)
+        @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
     end
     return true
 end
-is_valid_utf32(s::UTF32String) = is_valid_utf32(s.data)
+isvalid(::Type{UTF32String}, str::UTF32String) = isvalid(UTF32String, str.data)
+isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
 
 utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
 utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
diff --git a/base/utf8.jl b/base/utf8.jl
index 07859349e1784..f288a1c0fa3dd 100644
--- a/base/utf8.jl
+++ b/base/utf8.jl
@@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data)
 utf8(x) = convert(UTF8String, x)
 convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = is_valid_utf8(a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
+convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
 function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
     l = length(a)
     idx = 1
diff --git a/base/utf8proc.jl b/base/utf8proc.jl
index 842a79429f059..fc60350f8545a 100644
--- a/base/utf8proc.jl
+++ b/base/utf8proc.jl
@@ -3,19 +3,21 @@
 # Various Unicode functionality from the utf8proc library
 module UTF8proc
 
-import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert
+import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert, isvalid
 
 export isgraphemebreak
 
 # also exported by Base:
-export normalize_string, graphemes, is_valid_char, is_assigned_char, charwidth,
+export normalize_string, graphemes, is_assigned_char, charwidth, isvalid,
    islower, isupper, isalpha, isdigit, isnumber, isalnum,
    iscntrl, ispunct, isspace, isprint, isgraph, isblank
 
 # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
-is_valid_char(ch::Unsigned) = !Bool((ch-0xd800<0x800)|(ch>0x10ffff))
-is_valid_char(ch::Integer) = is_valid_char(Unsigned(ch))
-is_valid_char(ch::Char) = is_valid_char(UInt32(ch))
+isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff))
+isvalid(::Type{Char}, ch::Integer) = ch >= 0 && isvalid(Char, Unsigned(ch))
+isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch))
+
+isvalid(ch::Char) = isvalid(Char, ch)
 
 # utf8 category constants
 const UTF8PROC_CATEGORY_CN = 0
diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst
index 15fcfe8859e5b..8969095c609ce 100644
--- a/doc/manual/strings.rst
+++ b/doc/manual/strings.rst
@@ -99,14 +99,14 @@ convert an integer value back to a :obj:`Char` just as easily:
 Not all integer values are valid Unicode code points, but for
 performance, the :func:`Char` conversion does not check that every character
 value is valid. If you want to check that each converted value is a
-valid code point, use the :func:`is_valid_char` function:
+valid code point, use the :func:`isvalid` function:
 
 .. doctest::
 
     julia> Char(0x110000)
     '\U110000'
 
-    julia> is_valid_char(0x110000)
+    julia> isvalid(Char, 0x110000)
     false
 
 As of this writing, the valid Unicode code points are ``U+00`` through
diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst
index 4a88a4fadbf43..b05d50f3fb72f 100644
--- a/doc/stdlib/strings.rst
+++ b/doc/stdlib/strings.rst
@@ -109,17 +109,19 @@
    even though they may contain more than one codepoint; for example a letter
    combined with an accent mark is a single grapheme.)
 
-.. function:: is_valid_ascii(s) -> Bool
+.. function:: isvalid(value) -> Bool
 
-   Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.
+   Returns true if the given value is valid for its type,
+   which currently can be one of ``Char``, ``ASCIIString``, ``UTF8String``, ``UTF16String``, or ``UTF32String``.
 
-.. function:: is_valid_utf8(s) -> Bool
+.. function:: isvalid(T, value) -> Bool
 
-   Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid UTF-8, false otherwise.
-
-.. function:: is_valid_char(c) -> Bool
-
-   Returns true if the given char or integer is a valid Unicode code point.
+   Returns true if the given value is valid for that type.
+   Types currently can be ``Char``, ``ASCIIString``, ``UTF8String``, ``UTF16String``, or ``UTF32String``.
+   Values for ``Char`` can be of type ``Char`` or an integer.
+   Values for ``ASCIIString`` and ``UTF8String`` can be of that type, or ``Vector{UInt8}``.
+   Values for ``UTF16String`` can be ``UTF16String`` or ``Vector{UInt16}``.
+   Values for ``UTF32String`` can be ``UTF32String``, ``Vector{Char}`` or ``Vector{UInt32}``.
 
 .. function:: is_assigned_char(c) -> Bool
 
@@ -379,10 +381,6 @@
    Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the pointer can be safely freed.
    If ``length`` is specified, the string does not have to be NUL-terminated.
 
-.. function:: is_valid_utf16(s) -> Bool
-
-   Returns true if the argument (``UTF16String`` or ``UInt16`` array) is valid UTF-16.
-
 .. function:: utf32(s)
 
    Create a UTF-32 string from a byte array, array of ``UInt32``, or
diff --git a/test/strings.jl b/test/strings.jl
index 9e0ae6a0c5135..1f520aa7d4eaf 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -1281,44 +1281,107 @@ end
 @test isxdigit("a") == true
 @test isxdigit("g") == false
 
-@test is_valid_ascii("is_valid_ascii") == true
-@test is_valid_ascii("Σ_not_valid_ascii") == false
-@test is_valid_char('a') == true
-@test is_valid_char('\x00') == true
-@test is_valid_char(0xd800) == false
-
-@test is_valid_utf16(utf16("a")) == true
-@test is_valid_utf16(UInt16[0xd800,0]) == false
-# TODO is_valid_utf8
-
 # Issue #11140
-@test is_valid_utf32(utf32("a")) == true
-@test is_valid_utf32(utf32("\x00")) == true
-@test is_valid_utf32(UInt32[0xd800,0]) == false
+@test isvalid(utf32("a")) == true
+@test isvalid(utf32("\x00")) == true
+@test isvalid(UTF32String, UInt32[0xd800,0]) == false
+@test isvalid(UTF32String, utf32("a")) == true
+
+# Issue #11241
+
+@test isvalid(ASCIIString, "is_valid_ascii") == true
+@test isvalid(ASCIIString, "Σ_not_valid_ascii") == false
+
+# test all edge conditions
+for (val, pass) in (
+        (-1, false), (0, true), (0xd7ff, true),
+        (0xd800, false), (0xdfff, false),
+        (0xe000, true), (0xffff, true),
+        (0x10000, true), (0x10ffff, true),
+        (0x110000, false)
+    )
+    @test isvalid(Char, val) == pass
+end
+for (val, pass) in (
+        (b"\x00", true),
+        (b"\x7f", true),
+        (b"\x80", false),
+        (b"\xbf", false),
+        (b"\xc0", false),
+        (b"\xff", false),
+        (b"\xc0\x80", false),
+        (b"\xc1\x80", false),
+        (b"\xc2\x80", true),
+        (b"\xc2\xc0", false),
+        (b"\xed\x9f\xbf", true),
+        (b"\xed\xa0\x80", false),
+        (b"\xed\xbf\xbf", false),
+        (b"\xee\x80\x80", true),
+        (b"\xef\xbf\xbf", true),
+        (b"\xf0\x90\x80\x80", true),
+        (b"\xf4\x8f\xbf\xbf", true),
+        (b"\xf4\x90\x80\x80", false),
+        (b"\xf5\x80\x80\x80", false),
+        (b"\ud800\udc00", false),
+        (b"\udbff\udfff", false),
+        (b"\ud800\u0100", false),
+        (b"\udc00\u0100", false),
+        (b"\udc00\ud800", false)
+    )
+    @test isvalid(UTF8String, val) == pass
+end
+for (val, pass) in (
+        (UInt16[0x0000], true),
+        (UInt16[0xd7ff,0], true),
+        (UInt16[0xd800,0], false),
+        (UInt16[0xdfff,0], false),
+        (UInt16[0xe000,0], true),
+        (UInt16[0xffff,0], true),
+        (UInt16[0xd800,0xdc00,0], true),
+        (UInt16[0xdbff,0xdfff,0], true),
+        (UInt16[0xd800,0x0100,0], false),
+        (UInt16[0xdc00,0x0100,0], false),
+        (UInt16[0xdc00,0xd800,0], false)
+    )
+    @test isvalid(UTF16String, val) == pass
+end
+for (val, pass) in (
+        (UInt32[0x0000], true),
+        (UInt32[0xd7ff,0], true),
+        (UInt32[0xd800,0], false),
+        (UInt32[0xdfff,0], false),
+        (UInt32[0xe000,0], true),
+        (UInt32[0xffff,0], true),
+        (UInt32[0x10000,0], true),
+        (UInt32[0x10ffff,0], true),
+        (UInt32[0x110000,0], false),
+    )
+    @test isvalid(UTF32String, val) == pass
+end
 
 # Issue #11203
-@test is_valid_ascii(UInt8[]) == true
-@test is_valid_utf8(UInt8[]) == true
-@test is_valid_utf16(UInt16[]) == true
-@test is_valid_utf32(UInt32[]) == true
+@test isvalid(ASCIIString,UInt8[]) == true
+@test isvalid(UTF8String, UInt8[]) == true
+@test isvalid(UTF16String,UInt16[]) == true
+@test isvalid(UTF32String,UInt32[]) == true
 
 # Check UTF-8 characters
 # Check ASCII range (true),
 # then single continuation bytes and lead bytes with no following continuation bytes (false)
 for (rng,flg) in ((0:0x7f, true), (0x80:0xff, false))
     for byt in rng
-        @test is_valid_utf8(UInt8[byt]) == flg
+        @test isvalid(UTF8String, UInt8[byt]) == flg
     end
 end
 # Check overlong lead bytes for 2-character sequences (false)
 for byt = 0xc0:0xc1
-    @test is_valid_utf8(UInt8[byt,0x80]) == false
+    @test isvalid(UTF8String, UInt8[byt,0x80]) == false
 end
 # Check valid lead-in to two-byte sequences (true)
 for byt = 0xc2:0xdf
     for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
         for cont in rng
-            @test is_valid_utf8(UInt8[byt, cont]) == flg
+            @test isvalid(UTF8String, UInt8[byt, cont]) == flg
         end
     end
 end
@@ -1326,11 +1389,11 @@ end
 for r1 in (0xe0:0xec, 0xee:0xef)
     for byt = r1
         # Check for short sequence
-        @test is_valid_utf8(UInt8[byt]) == false
+        @test isvalid(UTF8String, UInt8[byt]) == false
         for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
             for cont in rng
-                @test is_valid_utf8(UInt8[byt, cont]) == false
-                @test is_valid_utf8(UInt8[byt, cont, 0x80]) == flg
+                @test isvalid(UTF8String, UInt8[byt, cont]) == false
+                @test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == flg
             end
         end
     end
@@ -1339,8 +1402,8 @@ end
 # Check for short sequence, or start of surrogate pair
 for (rng,flg) in ((0x00:0x7f, false), (0x80:0x9f, true), (0xa0:0xff, false))
     for cont in rng
-        @test is_valid_utf8(UInt8[0xed, cont]) == false
-        @test is_valid_utf8(UInt8[0xed, cont, 0x80]) == flg
+        @test isvalid(UTF8String, UInt8[0xed, cont]) == false
+        @test isvalid(UTF8String, UInt8[0xed, cont, 0x80]) == flg
     end
 end
 # Check valid four-byte sequences
@@ -1354,22 +1417,22 @@ for byt = 0xf0:0xf4
     end
     for (rng,flg) in r0
         for cont in rng
-            @test is_valid_utf8(UInt8[byt, cont]) == false
-            @test is_valid_utf8(UInt8[byt, cont, 0x80]) == false
-            @test is_valid_utf8(UInt8[byt, cont, 0x80, 0x80]) == flg
+            @test isvalid(UTF8String, UInt8[byt, cont]) == false
+            @test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == false
+            @test isvalid(UTF8String, UInt8[byt, cont, 0x80, 0x80]) == flg
         end
     end
 end
 # Check five-byte sequences, should be invalid
 for byt = 0xf8:0xfb
-    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
+    @test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
 end
 # Check six-byte sequences, should be invalid
 for byt = 0xfc:0xfd
-    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
+    @test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
 end
 # Check seven-byte sequences, should be invalid
-@test is_valid_utf8(UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
+@test isvalid(UTF8String, UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
 
 # This caused JuliaLang/JSON.jl#82
 @test first('\x00':'\x7f') === '\x00'