From 59dba6ff67885ab64f603c9d8086d0a8fe585949 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Wed, 12 Sep 2018 14:23:30 -0400
Subject: [PATCH 1/2] base/char.jl: tweak doc strings

---
 base/char.jl | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index bec67add95b71..749d561762b2c 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -2,7 +2,7 @@
 
 """
 The `AbstractChar` type is the supertype of all character implementations
-in Julia.   A character represents a Unicode code point, and can be converted
+in Julia. A character represents a Unicode code point, and can be converted
 to an integer via the [`codepoint`](@ref) function in order to obtain the
 numerical value of the code point, or constructed from the same integer.
 These numerical values determine how characters are compared with `<` and `==`,
@@ -11,7 +11,7 @@ method and a `T(::UInt32)` constructor, at minimum.
 
 A given `AbstractChar` subtype may be capable of representing only a subset
 of Unicode, in which case conversion from an unsupported `UInt32` value
-may throw an error.  Conversely, the built-in [`Char`](@ref) type represents
+may throw an error. Conversely, the built-in [`Char`](@ref) type represents
 a *superset* of Unicode (in order to losslessly encode invalid byte streams),
 in which case conversion of a non-Unicode value *to* `UInt32` throws an error.
 The [`isvalid`](@ref) function can be used to check which codepoints are
@@ -34,7 +34,7 @@ AbstractChar
     Char(c::Union{Number,AbstractChar})
 
 `Char` is a 32-bit [`AbstractChar`](@ref) type that is the default representation
-of characters in Julia.  `Char` is the type used for character literals like `'x'`
+of characters in Julia. `Char` is the type used for character literals like `'x'`
 and it is also the element type of [`String`](@ref).
 
 In order to losslessly represent arbitrary byte streams stored in a `String`,
@@ -50,18 +50,18 @@ Char
 (::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
 (::Type{T})(x::T) where {T<:AbstractChar} = x
 
-codepoint(c::Char) = UInt32(c)
-
 """
-    codepoint(c::AbstractChar)
+    codepoint(c::AbstractChar) -> Integer
 
 Return the Unicode codepoint (an unsigned integer) corresponding
 to the character `c` (or throw an exception if `c` does not represent
-a valid character).   For `Char`, this is a `UInt32` value, but
+a valid character). For `Char`, this is a `UInt32` value, but
 `AbstractChar` types that represent only a subset of Unicode may
 return a different-sized integer (e.g. `UInt8`).
 """
-codepoint # defined for Char in boot.jl
+function codepoint end
+
+codepoint(c::Char) = UInt32(c)
 
 struct InvalidCharError{T<:AbstractChar} <: Exception
     char::T
@@ -91,7 +91,7 @@ end
 #           not to support malformed or overlong encodings.
 
 """
-    ismalformed(c::AbstractChar)
+    ismalformed(c::AbstractChar) -> Bool
 
 Return `true` if `c` represents malformed (non-Unicode) data according to the
 encoding used by `c`.  Defaults to `false` for non-`Char` types.  See also
@@ -100,9 +100,9 @@ encoding used by `c`.  Defaults to `false` for non-`Char` types.  See also
 ismalformed(c::AbstractChar) = false
 
 """
-    isoverlong(c::AbstractChar)
+    isoverlong(c::AbstractChar) -> Bool
 
-Return `true` if `c` represents an overlong UTF-8 sequence.  Defaults
+Return `true` if `c` represents an overlong UTF-8 sequence. Defaults
 to `false` for non-`Char` types.  See also [`decode_overlong`](@ref)
 and [`show_invalid`](@ref).
 """
@@ -123,6 +123,15 @@ function UInt32(c::Char)
     ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
 end
 
+"""
+    decode_overlong(c::AbstractChar) -> Integer
+
+When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns
+the Unicode codepoint value of `c`. `AbstractChar` implementations
+that support overlong encodings should implement `Base.decode_overlong`.
+"""
+function decode_overlong end
+
 function decode_overlong(c::Char)
     u = reinterpret(UInt32, c)
     l1 = leading_ones(u)
@@ -133,15 +142,6 @@ function decode_overlong(c::Char)
     ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
 end
 
-"""
-    decode_overlong(c::AbstractChar)
-
-When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns
-the Unicode codepoint value of `c`.   `AbstractChar` implementations
-that support overlong encodings should implement `Base.decode_overlong`.
-"""
-decode_overlong
-
 function Char(u::UInt32)
     u < 0x80 && return reinterpret(Char, u << 24)
     u < 0x00200000 || code_point_err(u)::Union{}
@@ -270,7 +270,7 @@ function show(io::IO, c::AbstractChar)
         write(io, 0x27)
     else # unprintable, well-formed, non-overlong Unicode
         u = codepoint(c)
-        write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
+        write(io, 0x27, 0x5c, u <= 0x7f ? 0x78 : u <= 0xffff ? 0x75 : 0x55)
         d = max(2, 8 - (leading_zeros(u) >> 2))
         while 0 < d
             write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])

From d4d577e21f7c6c3c75fef75d3d0bdd1cea251330 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Wed, 12 Sep 2018 14:25:02 -0400
Subject: [PATCH 2/2] define ncodeunits(c::Char) as fast equivalent of
 ncodeunits(string(c))

There was a non-public `codelen(c::Char)` method which previously did
this. This commit removes `codelen` and replaces its internal uses with
`ncodeunits(c)`.
---
 base/char.jl              | 10 +++++++++-
 base/io.jl                |  2 +-
 base/strings/substring.jl |  4 ++--
 test/char.jl              | 23 +++++++++++++++++++++++
 test/iostream.jl          |  4 ++--
 5 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index 749d561762b2c..95c3d624b5c1a 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -50,6 +50,15 @@ Char
 (::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
 (::Type{T})(x::T) where {T<:AbstractChar} = x
 
+"""
+    ncodeunits(c::Char) -> Int
+
+Return the number of code units required to encode the character `c` as UTF-8.
+This is the number of bytes emitted when `c` is written to an output stream;
+it equals `ncodeunits(string(c))` but is computed more efficiently.
+"""
+ncodeunits(c::Char) = write(devnull, c) # this is surprisingly efficient
+
 """
     codepoint(c::AbstractChar) -> Integer
 
@@ -197,7 +206,6 @@ hash(x::Char, h::UInt) =
     hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
 
 first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8
-codelen(c::Char) = 4 - (trailing_zeros(0xff000000 | reinterpret(UInt32, c)) >> 3)
 
 # fallbacks:
 isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
diff --git a/base/io.jl b/base/io.jl
index 06bb38645ada4..187a8b6e0c42e 100644
--- a/base/io.jl
+++ b/base/io.jl
@@ -974,7 +974,7 @@ function skipchars(predicate, io::IO; linecomment=nothing)
         if c === linecomment
             readline(io)
         elseif !predicate(c)
-            skip(io, -codelen(c))
+            skip(io, -ncodeunits(c))
             break
         end
     end
diff --git a/base/strings/substring.jl b/base/strings/substring.jl
index 9b9e0d1e7424b..005e1de8eb2c2 100644
--- a/base/strings/substring.jl
+++ b/base/strings/substring.jl
@@ -149,7 +149,7 @@ function string(a::Union{Char, String, SubString{String}}...)
     n = 0
     for v in a
         if v isa Char
-            n += codelen(v)
+            n += ncodeunits(v)
         else
             n += sizeof(v)
         end
@@ -159,7 +159,7 @@ function string(a::Union{Char, String, SubString{String}}...)
     for v in a
         if v isa Char
            x = bswap(reinterpret(UInt32, v))
-           for j in 1:codelen(v)
+           for j in 1:ncodeunits(v)
                unsafe_store!(pointer(out, offs), x % UInt8)
                offs += 1
                x >>= 8
diff --git a/test/char.jl b/test/char.jl
index 0793ba583b94f..64f9a80a1f047 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -256,3 +256,26 @@ Base.codepoint(c::ASCIIChar) = reinterpret(UInt8, c)
     @test_throws MethodError write(IOBuffer(), ASCIIChar('x'))
     @test_throws MethodError read(IOBuffer('x'), ASCIIChar)
 end
+
+@testset "ncodeunits(::Char)" begin
+    # valid encodings
+    @test ncodeunits('\0')       == 1
+    @test ncodeunits('\x1')      == 1
+    @test ncodeunits('\x7f')     == 1
+    @test ncodeunits('\u80')     == 2
+    @test ncodeunits('\uff')     == 2
+    @test ncodeunits('\u7ff')    == 2
+    @test ncodeunits('\u800')    == 3
+    @test ncodeunits('\uffff')   == 3
+    @test ncodeunits('\U10000')  == 4
+    @test ncodeunits('\U10ffff') == 4
+    # invalid encodings
+    @test ncodeunits(reinterpret(Char, 0x80_00_00_00)) == 1
+    @test ncodeunits(reinterpret(Char, 0x81_00_00_00)) == 1
+    @test ncodeunits(reinterpret(Char, 0x80_80_00_00)) == 2
+    @test ncodeunits(reinterpret(Char, 0x80_01_00_00)) == 2
+    @test ncodeunits(reinterpret(Char, 0x80_00_80_00)) == 3
+    @test ncodeunits(reinterpret(Char, 0x80_00_01_00)) == 3
+    @test ncodeunits(reinterpret(Char, 0x80_00_00_80)) == 4
+    @test ncodeunits(reinterpret(Char, 0x80_00_00_01)) == 4
+end
diff --git a/test/iostream.jl b/test/iostream.jl
index 08f68af7bb253..66b19d63c0191 100644
--- a/test/iostream.jl
+++ b/test/iostream.jl
@@ -30,9 +30,9 @@
         @test read(file, Char) == 'n'
 
         # test it correctly handles unicode
-        for (byte,char) in zip(1:4, ('@','߷','࿊','𐋺'))
+        for (byte, char) in zip(1:4, ('@','߷','࿊','𐋺'))
             append_to_file("abcdef$char")
-            @test Base.codelen(char) == byte
+            @test ncodeunits(char) == byte
             @test !eof(skipchars(isletter, file))
             @test read(file, Char) == char
         end