diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
index 5d59b0fc3bff4..5dcc41b030473 100644
--- a/base/strings/unicode.jl
+++ b/base/strings/unicode.jl
@@ -6,7 +6,7 @@ module Unicode
 import Base: show, ==, hash, string, Symbol, isless, length, eltype,
              convert, isvalid, ismalformed, isoverlong, iterate,
              AnnotatedString, AnnotatedChar, annotated_chartransform,
-             @assume_effects, annotations
+             @assume_effects, annotations, is_overlong_enc
 
 # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
 
@@ -262,17 +262,19 @@ julia> textwidth('⛵')
 2
 ```
 """
 function textwidth(c::AbstractChar)
-    ismalformed(c) && return 1
-    i = codepoint(c)
-    i < 0x7f && return Int(i >= 0x20) # ASCII fast path
-    Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i))
+    # Answer for invalid encodings before converting: `Char(c)` may need a
+    # codepoint, which such characters do not have.
+    (ismalformed(c) || isoverlong(c)) && return 1
+    return textwidth(Char(c)::Char) # share the `Char` implementation below
 end
 
 function textwidth(c::Char)
-    b = bswap(reinterpret(UInt32, c)) # from isascii(c)
+    u = reinterpret(UInt32, c)
+    b = bswap(u) # from isascii(c)
     b < 0x7f && return Int(b >= 0x20) # ASCII fast path
-    ismalformed(c) && return 1
+    # We can't know a priori how terminals will render invalid UTF-8 chars,
+    # so we conservatively decide a width of 1.
+    (ismalformed(c) || is_overlong_enc(u)) && return 1
     Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
 end
 
diff --git a/test/strings/util.jl b/test/strings/util.jl
index bb87881bbaa1d..9ced27ee3f8d0 100644
--- a/test/strings/util.jl
+++ b/test/strings/util.jl
@@ -8,6 +8,8 @@ SubStr(s) = SubString("abc$(s)de", firstindex(s) + 3, lastindex(s) + 3)
         @test textwidth(c^3) == w*3
         @test w == @invoke textwidth(c::AbstractChar)
     end
+    @test textwidth('\xc0\xa0') == 1 # overlong
+    @test textwidth('\xf0\x80\x80') == 1 # malformed
     for i in 0x00:0x7f # test all ASCII chars (which have fast path)
         w = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i))
         c = Char(i)