From 59dba6ff67885ab64f603c9d8086d0a8fe585949 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Wed, 12 Sep 2018 14:23:30 -0400 Subject: [PATCH 1/2] base/char.jl: tweak doc strings --- base/char.jl | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/base/char.jl b/base/char.jl index bec67add95b71..749d561762b2c 100644 --- a/base/char.jl +++ b/base/char.jl @@ -2,7 +2,7 @@ """ The `AbstractChar` type is the supertype of all character implementations -in Julia. A character represents a Unicode code point, and can be converted +in Julia. A character represents a Unicode code point, and can be converted to an integer via the [`codepoint`](@ref) function in order to obtain the numerical value of the code point, or constructed from the same integer. These numerical values determine how characters are compared with `<` and `==`, @@ -11,7 +11,7 @@ method and a `T(::UInt32)` constructor, at minimum. A given `AbstractChar` subtype may be capable of representing only a subset of Unicode, in which case conversion from an unsupported `UInt32` value -may throw an error. Conversely, the built-in [`Char`](@ref) type represents +may throw an error. Conversely, the built-in [`Char`](@ref) type represents a *superset* of Unicode (in order to losslessly encode invalid byte streams), in which case conversion of a non-Unicode value *to* `UInt32` throws an error. The [`isvalid`](@ref) function can be used to check which codepoints are @@ -34,7 +34,7 @@ AbstractChar Char(c::Union{Number,AbstractChar}) `Char` is a 32-bit [`AbstractChar`](@ref) type that is the default representation -of characters in Julia. `Char` is the type used for character literals like `'x'` +of characters in Julia. `Char` is the type used for character literals like `'x'` and it is also the element type of [`String`](@ref). In order to losslessly represent arbitrary byte streams stored in a `String`, @@ -50,18 +50,18 @@ Char (::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x)) (::Type{T})(x::T) where {T<:AbstractChar} = x -codepoint(c::Char) = UInt32(c) - """ - codepoint(c::AbstractChar) + codepoint(c::AbstractChar) -> Integer Return the Unicode codepoint (an unsigned integer) corresponding to the character `c` (or throw an exception if `c` does not represent -a valid character). For `Char`, this is a `UInt32` value, but +a valid character). For `Char`, this is a `UInt32` value, but `AbstractChar` types that represent only a subset of Unicode may return a different-sized integer (e.g. `UInt8`). """ -codepoint # defined for Char in boot.jl +function codepoint end + +codepoint(c::Char) = UInt32(c) struct InvalidCharError{T<:AbstractChar} <: Exception char::T @@ -91,7 +91,7 @@ end # not to support malformed or overlong encodings. """ - ismalformed(c::AbstractChar) + ismalformed(c::AbstractChar) -> Bool Return `true` if `c` represents malformed (non-Unicode) data according to the encoding used by `c`. Defaults to `false` for non-`Char` types. See also @@ -100,9 +100,9 @@ encoding used by `c`. Defaults to `false` for non-`Char` types. See also ismalformed(c::AbstractChar) = false """ - isoverlong(c::AbstractChar) + isoverlong(c::AbstractChar) -> Bool -Return `true` if `c` represents an overlong UTF-8 sequence. Defaults +Return `true` if `c` represents an overlong UTF-8 sequence. Defaults to `false` for non-`Char` types. See also [`decode_overlong`](@ref) and [`show_invalid`](@ref). """ @@ -123,6 +123,15 @@ function UInt32(c::Char) ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6) end +""" + decode_overlong(c::AbstractChar) -> Integer + +When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns +the Unicode codepoint value of `c`. `AbstractChar` implementations +that support overlong encodings should implement `Base.decode_overlong`. +""" +function decode_overlong end + function decode_overlong(c::Char) u = reinterpret(UInt32, c) l1 = leading_ones(u) @@ -133,15 +142,6 @@ function decode_overlong(c::Char) ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6) end -""" - decode_overlong(c::AbstractChar) - -When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns -the Unicode codepoint value of `c`. `AbstractChar` implementations -that support overlong encodings should implement `Base.decode_overlong`. -""" -decode_overlong - function Char(u::UInt32) u < 0x80 && return reinterpret(Char, u << 24) u < 0x00200000 || code_point_err(u)::Union{} @@ -270,7 +270,7 @@ function show(io::IO, c::AbstractChar) write(io, 0x27) else # unprintable, well-formed, non-overlong Unicode u = codepoint(c) - write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55) + write(io, 0x27, 0x5c, u <= 0x7f ? 0x78 : u <= 0xffff ? 0x75 : 0x55) d = max(2, 8 - (leading_zeros(u) >> 2)) while 0 < d write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1]) From d4d577e21f7c6c3c75fef75d3d0bdd1cea251330 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Wed, 12 Sep 2018 14:25:02 -0400 Subject: [PATCH 2/2] define ncodeunits(c::Char) as fast equivalent of ncodeunits(string(c)) There was a non-public `codelen(c::Char)` method which previously did this. This also replaces internal uses of this with `ncodeunits(c)`. --- base/char.jl | 10 +++++++++- base/io.jl | 2 +- base/strings/substring.jl | 4 ++-- test/char.jl | 23 +++++++++++++++++++++++ test/iostream.jl | 4 ++-- 5 files changed, 37 insertions(+), 6 deletions(-) diff --git a/base/char.jl b/base/char.jl index 749d561762b2c..95c3d624b5c1a 100644 --- a/base/char.jl +++ b/base/char.jl @@ -50,6 +50,15 @@ Char (::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x)) (::Type{T})(x::T) where {T<:AbstractChar} = x +""" + ncodeunits(c::Char) -> Int + +Return the number of code units required to encode a character as UTF-8. +This is the number of bytes which will be printed if the character is written +to an output stream, or `ncodeunits(string(c))` but computed efficiently. +""" +ncodeunits(c::Char) = write(devnull, c) # this is surprisingly efficient + """ codepoint(c::AbstractChar) -> Integer @@ -197,7 +206,6 @@ hash(x::Char, h::UInt) = hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8 -codelen(c::Char) = 4 - (trailing_zeros(0xff000000 | reinterpret(UInt32, c)) >> 3) # fallbacks: isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y)) diff --git a/base/io.jl b/base/io.jl index 06bb38645ada4..187a8b6e0c42e 100644 --- a/base/io.jl +++ b/base/io.jl @@ -974,7 +974,7 @@ function skipchars(predicate, io::IO; linecomment=nothing) if c === linecomment readline(io) elseif !predicate(c) - skip(io, -codelen(c)) + skip(io, -ncodeunits(c)) break end end diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 9b9e0d1e7424b..005e1de8eb2c2 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -149,7 +149,7 @@ function string(a::Union{Char, String, SubString{String}}...) n = 0 for v in a if v isa Char - n += codelen(v) + n += ncodeunits(v) else n += sizeof(v) end @@ -159,7 +159,7 @@ function string(a::Union{Char, String, SubString{String}}...) for v in a if v isa Char x = bswap(reinterpret(UInt32, v)) - for j in 1:codelen(v) + for j in 1:ncodeunits(v) unsafe_store!(pointer(out, offs), x % UInt8) offs += 1 x >>= 8 diff --git a/test/char.jl b/test/char.jl index 0793ba583b94f..64f9a80a1f047 100644 --- a/test/char.jl +++ b/test/char.jl @@ -256,3 +256,26 @@ Base.codepoint(c::ASCIIChar) = reinterpret(UInt8, c) @test_throws MethodError write(IOBuffer(), ASCIIChar('x')) @test_throws MethodError read(IOBuffer('x'), ASCIIChar) end + +@testset "ncodeunits(::Char)" begin + # valid encodings + @test ncodeunits('\0') == 1 + @test ncodeunits('\x1') == 1 + @test ncodeunits('\x7f') == 1 + @test ncodeunits('\u80') == 2 + @test ncodeunits('\uff') == 2 + @test ncodeunits('\u7ff') == 2 + @test ncodeunits('\u800') == 3 + @test ncodeunits('\uffff') == 3 + @test ncodeunits('\U10000') == 4 + @test ncodeunits('\U10ffff') == 4 + # invalid encodings + @test ncodeunits(reinterpret(Char, 0x80_00_00_00)) == 1 + @test ncodeunits(reinterpret(Char, 0x81_00_00_00)) == 1 + @test ncodeunits(reinterpret(Char, 0x80_80_00_00)) == 2 + @test ncodeunits(reinterpret(Char, 0x80_01_00_00)) == 2 + @test ncodeunits(reinterpret(Char, 0x80_00_80_00)) == 3 + @test ncodeunits(reinterpret(Char, 0x80_00_01_00)) == 3 + @test ncodeunits(reinterpret(Char, 0x80_00_00_80)) == 4 + @test ncodeunits(reinterpret(Char, 0x80_00_00_01)) == 4 +end diff --git a/test/iostream.jl b/test/iostream.jl index 08f68af7bb253..66b19d63c0191 100644 --- a/test/iostream.jl +++ b/test/iostream.jl @@ -30,9 +30,9 @@ @test read(file, Char) == 'n' # test it correctly handles unicode - for (byte,char) in zip(1:4, ('@','߷','࿊','𐋺')) + for (byte, char) in zip(1:4, ('@','߷','࿊','𐋺')) append_to_file("abcdef$char") - @test Base.codelen(char) == byte + @test ncodeunits(char) == byte @test !eof(skipchars(isletter, file)) @test read(file, Char) == char end