Skip to content

Commit

Permalink
Add isvalid(Type, value) methods, to replace is_valid_*
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed May 19, 2015
1 parent ca2ca31 commit 81dba36
Show file tree
Hide file tree
Showing 12 changed files with 139 additions and 69 deletions.
2 changes: 1 addition & 1 deletion base/ascii.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ ascii(x) = convert(ASCIIString, x)
convert(::Type{ASCIIString}, s::ASCIIString) = s
convert(::Type{ASCIIString}, s::UTF8String) = ascii(s.data)
convert(::Type{ASCIIString}, a::Vector{UInt8}) = begin
is_valid_ascii(a) || throw(ArgumentError("invalid ASCII sequence"))
isvalid(ASCIIString,a) || throw(ArgumentError("invalid ASCII sequence"))
return ASCIIString(a)
end

Expand Down
13 changes: 13 additions & 0 deletions base/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -443,3 +443,16 @@ export float32_isvalid, float64_isvalid
@deprecate (&)(x::Char, y::Char) Char(UInt32(x) & UInt32(y))
@deprecate (|)(x::Char, y::Char) Char(UInt32(x) | UInt32(y))
@deprecate ($)(x::Char, y::Char) Char(UInt32(x) $ UInt32(y))

# 11241

# Deprecated `is_valid_*` predicates (#11241). Each one forwards to the matching
# `isvalid` method: the one-argument form when the argument already has the type
# being validated, otherwise the two-argument form with the target type explicit.
@deprecate is_valid_char(ch::Char) isvalid(ch)
# `Unsigned <: Integer`, so `Integer` alone covers every integer argument.
@deprecate is_valid_char(ch::Integer) isvalid(Char, ch)
@deprecate is_valid_ascii(str::ASCIIString) isvalid(str)
@deprecate is_valid_ascii(str::Union(AbstractArray{UInt8}, UTF8String)) isvalid(ASCIIString, str)
@deprecate is_valid_utf8(str::UTF8String) isvalid(str)
@deprecate is_valid_utf8(str::Union(AbstractArray{UInt8}, ASCIIString)) isvalid(UTF8String, str)
@deprecate is_valid_utf16(str::UTF16String) isvalid(str)
@deprecate is_valid_utf16(str::AbstractArray{UInt16}) isvalid(UTF16String, str)
@deprecate is_valid_utf32(str::UTF32String) isvalid(str)
# The removed `is_valid_utf32` also accepted `Vector{Char}`; keep that call working
# (with a deprecation warning) instead of letting it fail with a MethodError.
@deprecate is_valid_utf32(str::Union(AbstractArray{UInt32}, Vector{Char})) isvalid(UTF32String, str)
5 changes: 0 additions & 5 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -820,11 +820,6 @@ export
ind2chr,
info,
is_assigned_char,
is_valid_ascii,
is_valid_char,
is_valid_utf8,
is_valid_utf16,
is_valid_utf32,
isalnum,
isalpha,
isascii,
Expand Down
2 changes: 1 addition & 1 deletion base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ end

function readall(s::IO)
b = readbytes(s)
return is_valid_ascii(b) ? ASCIIString(b) : UTF8String(b)
return isvalid(ASCIIString, b) ? ASCIIString(b) : UTF8String(b)
end
readall(filename::AbstractString) = open(readall, filename)

Expand Down
4 changes: 2 additions & 2 deletions base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -968,8 +968,8 @@ byte_string_classify(s::ByteString) = byte_string_classify(s.data)
# 1: valid ASCII
# 2: valid UTF-8

is_valid_ascii(s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) == 1
is_valid_utf8(s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) != 0
# Validity checks for byte data, built on `byte_string_classify`, which reports
# 1 for valid ASCII and 2 for valid UTF-8 (0 is presumably the "invalid" result;
# its description is outside this excerpt).

# ASCII validity requires the classification to be exactly 1.
function isvalid(::Type{ASCIIString}, s::Union(Array{UInt8,1},ByteString))
    return byte_string_classify(s) == 1
end

# Any nonzero classification is accepted as valid UTF-8.
function isvalid(::Type{UTF8String}, s::Union(Array{UInt8,1},ByteString))
    return byte_string_classify(s) != 0
end

## multiline strings ##

Expand Down
9 changes: 4 additions & 5 deletions base/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
convert(Ptr{T}, pointer(s))

function is_valid_utf16(data::AbstractArray{UInt16})
# Validate a `UTF16String` by delegating to the array method with its `data` field.
function isvalid(::Type{UTF16String}, str::UTF16String)
    return isvalid(UTF16String, str.data)
end
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
i = 1
n = length(data) # this may include NULL termination; that's okay
while i < n # check for unpaired surrogates
Expand All @@ -110,10 +111,8 @@ function is_valid_utf16(data::AbstractArray{UInt16})
return i > n || !utf16_is_surrogate(data[i])
end

is_valid_utf16(s::UTF16String) = is_valid_utf16(s.data)

function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
!is_valid_utf16(data) && throw(ArgumentError("invalid UTF16 data"))
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
len = length(data)
d = Array(UInt16, len + 1)
d[end] = 0 # NULL terminate
Expand Down Expand Up @@ -144,7 +143,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
copy!(d,1, data,1, length(data)) # assume native byte order
end
d[end] = 0 # NULL terminate
!is_valid_utf16(d) && throw(ArgumentError("invalid UTF16 data"))
!isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
UTF16String(d)
end

Expand Down
9 changes: 5 additions & 4 deletions base/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,14 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
UTF32String(d)
end

function is_valid_utf32(s::Union(Vector{Char}, Vector{UInt32}))
for i=1:length(s)
@inbounds if !is_valid_char(reinterpret(UInt32, s[i])) ; return false ; end
# Return `true` when every element of `str` passes `isvalid(Char, ::UInt32)`;
# an empty array is valid.
#
# Accepts any `AbstractArray` of `Char` or `UInt32`, not only `Vector`. This matches
# the `AbstractArray{UInt16}` signature of the UTF-16 array check and the
# `AbstractArray{UInt32}` arguments that the deprecated `is_valid_utf32` forwards here.
function isvalid(::Type{UTF32String}, str::Union(AbstractArray{Char}, AbstractArray{UInt32}))
    # Iterate the elements directly: no index arithmetic, so no `@inbounds`
    # assumption is needed for array types other than `Vector`.
    for ch in str
        isvalid(Char, reinterpret(UInt32, ch)) || return false
    end
    return true
end
is_valid_utf32(s::UTF32String) = is_valid_utf32(s.data)
# A bare `Vector{Char}` is validated as UTF-32 data.
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end

# One-argument form for the listed string types: validate the string's `data`
# field against the string's own type `T`.
function isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T)
    return isvalid(T, str.data)
end

utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
Expand Down
2 changes: 1 addition & 1 deletion base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data)
utf8(x) = convert(UTF8String, x)
convert(::Type{UTF8String}, s::UTF8String) = s
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
convert(::Type{UTF8String}, a::Array{UInt8,1}) = is_valid_utf8(a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
l = length(a)
idx = 1
Expand Down
12 changes: 7 additions & 5 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,21 @@
# Various Unicode functionality from the utf8proc library
module UTF8proc

import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert
import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert, isvalid

export isgraphemebreak

# also exported by Base:
export normalize_string, graphemes, is_valid_char, is_assigned_char, charwidth,
export normalize_string, graphemes, is_assigned_char, charwidth, isvalid,
islower, isupper, isalpha, isdigit, isnumber, isalnum,
iscntrl, ispunct, isspace, isprint, isgraph, isblank

# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
is_valid_char(ch::Unsigned) = !Bool((ch-0xd800<0x800)|(ch>0x10ffff))
is_valid_char(ch::Integer) = is_valid_char(Unsigned(ch))
is_valid_char(ch::Char) = is_valid_char(UInt32(ch))
# Core check on an unsigned code point: valid when it is neither a surrogate
# (0xd800-0xdfff) nor above 0x10ffff.
function isvalid(::Type{Char}, ch::Unsigned)
    # Unsigned subtraction wraps around, so `offset` is below 0x800 only when
    # `ch` lies inside the surrogate block 0xd800:0xdfff.
    offset = ch - 0xd800
    return (offset >= 0x800) & (ch <= 0x10ffff)
end

# Other integers are converted to `Unsigned` first and then checked as above.
function isvalid(::Type{Char}, ch::Integer)
    return isvalid(Char, Unsigned(ch))
end

# A `Char` is checked through its `UInt32` code point value.
function isvalid(::Type{Char}, ch::Char)
    return isvalid(Char, UInt32(ch))
end

# One-argument convenience form for characters.
function isvalid(ch::Char)
    return isvalid(Char, ch)
end

# utf8 category constants
const UTF8PROC_CATEGORY_CN = 0
Expand Down
4 changes: 2 additions & 2 deletions doc/manual/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,14 @@ convert an integer value back to a :obj:`Char` just as easily:
Not all integer values are valid Unicode code points, but for
performance, the :func:`Char` conversion does not check that every character
value is valid. If you want to check that each converted value is a
valid code point, use the :func:`is_valid_char` function:
valid code point, use the :func:`isvalid` function:

.. doctest::

julia> Char(0x110000)
'\U110000'

julia> is_valid_char(0x110000)
julia> isvalid(Char, 0x110000)
false

As of this writing, the valid Unicode code points are ``U+00`` through
Expand Down
22 changes: 10 additions & 12 deletions doc/stdlib/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,19 @@
even though they may contain more than one codepoint; for example
a letter combined with an accent mark is a single grapheme.)

.. function:: is_valid_ascii(s) -> Bool
.. function:: isvalid(value) -> Bool

Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.
Returns true if the given value is valid for its type,
which currently can be one of ``Char``, ``ASCIIString``, ``UTF8String``, ``UTF16String``, or ``UTF32String``.

.. function:: is_valid_utf8(s) -> Bool
.. function:: isvalid(T, value) -> Bool

Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid UTF-8, false otherwise.

.. function:: is_valid_char(c) -> Bool

Returns true if the given char or integer is a valid Unicode code point.
Returns true if the given value is valid for that type.
Types currently can be ``Char``, ``ASCIIString``, ``UTF8String``, ``UTF16String``, or ``UTF32String``.
Values for ``Char`` can be of type ``Char`` or an integer type such as ``UInt32``.
Values for ``ASCIIString`` and ``UTF8String`` can be either of those string types, or ``Vector{UInt8}``.
Values for ``UTF16String`` can be ``UTF16String`` or ``Vector{UInt16}``.
Values for ``UTF32String`` can be ``UTF32String``, ``Vector{Char}`` or ``Vector{UInt32}``.

.. function:: is_assigned_char(c) -> Bool

Expand Down Expand Up @@ -379,10 +381,6 @@

Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the pointer can be safely freed. If ``length`` is specified, the string does not have to be NUL-terminated.

.. function:: is_valid_utf16(s) -> Bool

Returns true if the argument (``UTF16String`` or ``UInt16`` array) is valid UTF-16.

.. function:: utf32(s)

Create a UTF-32 string from a byte array, array of ``UInt32``, or
Expand Down
124 changes: 93 additions & 31 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1281,56 +1281,118 @@ end
@test isxdigit("a") == true
@test isxdigit("g") == false

@test is_valid_ascii("is_valid_ascii") == true
@test is_valid_ascii("Σ_not_valid_ascii") == false
@test is_valid_char('a') == true
@test is_valid_char('\x00') == true
@test is_valid_char(0xd800) == false

@test is_valid_utf16(utf16("a")) == true
@test is_valid_utf16(UInt16[0xd800,0]) == false
# TODO is_valid_utf8

# Issue #11140
@test is_valid_utf32(utf32("a")) == true
@test is_valid_utf32(utf32("\x00")) == true
@test is_valid_utf32(UInt32[0xd800,0]) == false
@test isvalid(utf32("a")) == true
@test isvalid(utf32("\x00")) == true
@test isvalid(UTF32String, UInt32[0xd800,0]) == false

# Issue #11241

@test isvalid(ASCIIString, "is_valid_ascii") == true
@test isvalid(ASCIIString, "Σ_not_valid_ascii") == false

# Boundary code points for `isvalid(Char, ...)`: the edges of the surrogate block
# and of the overall Unicode range, each paired with the expected result.
for (codepoint, expected) in (
        (0,        true),   # lowest code point
        (0xd7ff,   true),   # last code point before the surrogate block
        (0xd800,   false),  # first surrogate
        (0xdfff,   false),  # last surrogate
        (0xe000,   true),   # first code point after the surrogate block
        (0xffff,   true),   # last 16-bit code point
        (0x10000,  true),   # first code point above 16 bits
        (0x10ffff, true),   # highest accepted code point
        (0x110000, false),  # one past the accepted range
    )
    @test isvalid(Char, codepoint) == expected
end
# Edge cases for `isvalid(UTF8String, bytes)`: each entry pairs a byte sequence
# with the expected validity.
for (val, pass) in (
# single bytes: both ends of the ASCII range are valid
(b"\x00", true),
(b"\x7f", true),
# lone continuation bytes (0x80-0xbf) are invalid
(b"\x80", false),
(b"\xbf", false),
# lone lead byte, and 0xff which never occurs in UTF-8
(b"\xc0", false),
(b"\xff", false),
# overlong two-byte forms (lead bytes 0xc0 and 0xc1)
(b"\xc0\x80", false),
(b"\xc1\x80", false),
# smallest valid two-byte sequence (U+0080)
(b"\xc2\x80", true),
# two-byte lead followed by a non-continuation byte
(b"\xc2\xc0", false),
# three-byte sequences around the surrogate block:
# U+D7FF valid, U+D800 and U+DFFF invalid, U+E000 and U+FFFF valid
(b"\xed\x9f\xbf", true),
(b"\xed\xa0\x80", false),
(b"\xed\xbf\xbf", false),
(b"\xee\x80\x80", true),
(b"\xef\xbf\xbf", true),
# four-byte sequences: U+10000 and U+10FFFF valid; 0xf4 0x90... encodes a value
# past U+10FFFF, and 0xf5 is not a permitted lead byte
(b"\xf0\x90\x80\x80", true),
(b"\xf4\x8f\xbf\xbf", true),
(b"\xf4\x90\x80\x80", false),
(b"\xf5\x80\x80\x80", false),
# surrogate code points written with `\u` escapes inside byte literals
# NOTE(review): presumably each escape expands to the bytes encoding that
# surrogate, which is never valid UTF-8 even as a high/low pair -- confirm
# how `b"..."` handles `\u` escapes in this Julia version
(b"\ud800\udc00", false),
(b"\udbff\udfff", false),
(b"\ud800\u0100", false),
(b"\udc00\u0100", false),
(b"\udc00\ud800", false)
)
@test isvalid(UTF8String, val) == pass
end
# Edge cases for `isvalid(UTF16String, units)`: lone surrogates must be rejected,
# correctly ordered surrogate pairs accepted. Most sequences end with a 0 code unit.
for (units, expected) in (
        (UInt16[0x0000],          true),   # single zero code unit
        (UInt16[0xd7ff,0],        true),   # last code unit before the surrogate block
        (UInt16[0xd800,0],        false),  # unpaired high surrogate
        (UInt16[0xdfff,0],        false),  # unpaired low surrogate
        (UInt16[0xe000,0],        true),   # first code unit after the surrogate block
        (UInt16[0xffff,0],        true),   # largest code unit
        (UInt16[0xd800,0xdc00,0], true),   # lowest surrogate pair
        (UInt16[0xdbff,0xdfff,0], true),   # highest surrogate pair
        (UInt16[0xd800,0x0100,0], false),  # high surrogate followed by a non-surrogate
        (UInt16[0xdc00,0x0100,0], false),  # low surrogate with no preceding high surrogate
        (UInt16[0xdc00,0xd800,0], false),  # surrogates in reversed order
    )
    @test isvalid(UTF16String, units) == expected
end
# Edge cases for `isvalid(UTF32String, values)`, mirroring the boundary code points
# used for the `isvalid(Char, ...)` checks. Most arrays end with a 0 element.
for (val, pass) in (
        (UInt32[0x0000],     true),   # single zero element
        (UInt32[0xd7ff,0],   true),   # last value before the surrogate block
        (UInt32[0xd800,0],   false),  # first surrogate
        (UInt32[0xdfff,0],   false),  # last surrogate
        (UInt32[0xe000,0],   true),   # first value after the surrogate block
        (UInt32[0xffff,0],   true),   # last 16-bit value
        (UInt32[0x10000,0],  true),   # first value above 16 bits (boundary case added)
        (UInt32[0x100000,0], true),   # value well inside the upper range (kept from the original table)
        (UInt32[0x10ffff,0], true),   # highest accepted value
        (UInt32[0x110000,0], false),  # one past the accepted range
    )
    @test isvalid(UTF32String, val) == pass
end

# Issue #11203
@test is_valid_ascii(UInt8[]) == true
@test is_valid_utf8(UInt8[]) == true
@test is_valid_utf16(UInt16[]) == true
@test is_valid_utf32(UInt32[]) == true
@test isvalid(ASCIIString,UInt8[]) == true
@test isvalid(UTF8String, UInt8[]) == true
@test isvalid(UTF16String,UInt16[]) == true
@test isvalid(UTF32String,UInt32[]) == true

# Check UTF-8 characters
# Check ASCII range (true),
# then single continuation bytes and lead bytes with no following continuation bytes (false)
for (rng,flg) in ((0:0x7f, true), (0x80:0xff, false))
for byt in rng
@test is_valid_utf8(UInt8[byt]) == flg
@test isvalid(UTF8String, UInt8[byt]) == flg
end
end
# Check overlong lead bytes for 2-character sequences (false)
for byt = 0xc0:0xc1
@test is_valid_utf8(UInt8[byt,0x80]) == false
@test isvalid(UTF8String, UInt8[byt,0x80]) == false
end
# Check valid lead-in to two-byte sequences (true)
for byt = 0xc2:0xdf
for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
for cont in rng
@test is_valid_utf8(UInt8[byt, cont]) == flg
@test isvalid(UTF8String, UInt8[byt, cont]) == flg
end
end
end
# Check three-byte sequences
for r1 in (0xe0:0xec, 0xee:0xef)
for byt = r1
# Check for short sequence
@test is_valid_utf8(UInt8[byt]) == false
@test isvalid(UTF8String, UInt8[byt]) == false
for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
for cont in rng
@test is_valid_utf8(UInt8[byt, cont]) == false
@test is_valid_utf8(UInt8[byt, cont, 0x80]) == flg
@test isvalid(UTF8String, UInt8[byt, cont]) == false
@test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == flg
end
end
end
Expand All @@ -1339,8 +1401,8 @@ end
# Check for short sequence, or start of surrogate pair
for (rng,flg) in ((0x00:0x7f, false), (0x80:0x9f, true), (0xa0:0xff, false))
for cont in rng
@test is_valid_utf8(UInt8[0xed, cont]) == false
@test is_valid_utf8(UInt8[0xed, cont, 0x80]) == flg
@test isvalid(UTF8String, UInt8[0xed, cont]) == false
@test isvalid(UTF8String, UInt8[0xed, cont, 0x80]) == flg
end
end
# Check valid four-byte sequences
Expand All @@ -1354,22 +1416,22 @@ for byt = 0xf0:0xf4
end
for (rng,flg) in r0
for cont in rng
@test is_valid_utf8(UInt8[byt, cont]) == false
@test is_valid_utf8(UInt8[byt, cont, 0x80]) == false
@test is_valid_utf8(UInt8[byt, cont, 0x80, 0x80]) == flg
@test isvalid(UTF8String, UInt8[byt, cont]) == false
@test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == false
@test isvalid(UTF8String, UInt8[byt, cont, 0x80, 0x80]) == flg
end
end
end
# Check five-byte sequences, should be invalid
for byt = 0xf8:0xfb
@test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
@test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
end
# Check six-byte sequences, should be invalid
for byt = 0xfc:0xfd
@test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
@test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
end
# Check seven-byte sequences, should be invalid
@test is_valid_utf8(UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
# Lead byte 0xfe plus six continuation bytes: a full seven-byte sequence.
@test isvalid(UTF8String, UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80]) == false

# This caused JuliaLang/JSON.jl#82
@test first('\x00':'\x7f') === '\x00'
Expand Down

0 comments on commit 81dba36

Please sign in to comment.