From 2cd92fffb0a2c8f77648684a56cba0143042675e Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 1 Jun 2025 14:14:41 +0200 Subject: [PATCH 1/2] Define textwidth for overlong chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, this would error. There is no guarantee of how terminals render overlong encodings. Some terminals do not print them at all, and some print "�". Here, we set a textwidth of 1, conservatively. --- base/strings/unicode.jl | 14 ++++++-------- test/strings/util.jl | 2 ++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 5d59b0fc3bff4..438ad1e23fc0b 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -262,17 +262,15 @@ julia> textwidth('⛵') 2 ``` """ -function textwidth(c::AbstractChar) - ismalformed(c) && return 1 - i = codepoint(c) - i < 0x7f && return Int(i >= 0x20) # ASCII fast path - Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i)) -end +textwidth(c::AbstractChar) = textwidth(Char(c)::Char) function textwidth(c::Char) - b = bswap(reinterpret(UInt32, c)) # from isascii(c) + u = reinterpret(UInt32, c) + b = bswap(u) # from isascii(c) b < 0x7f && return Int(b >= 0x20) # ASCII fast path - ismalformed(c) && return 1 + # We can't know a priori how terminals will render invalid UTF8 chars, + # so we conservatively decide a width of 1. 
+ (ismalformed(c) || is_overlong_enc(u)) && return 1 Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) end diff --git a/test/strings/util.jl b/test/strings/util.jl index bb87881bbaa1d..9ced27ee3f8d0 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -8,6 +8,8 @@ SubStr(s) = SubString("abc$(s)de", firstindex(s) + 3, lastindex(s) + 3) @test textwidth(c^3) == w*3 @test w == @invoke textwidth(c::AbstractChar) end + @test textwidth('\xc0\xa0') == 1 # overlong + @test textwidth('\xf0\x80\x80') == 1 # malformed for i in 0x00:0x7f # test all ASCII chars (which have fast path) w = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i)) c = Char(i) From 6b921356be5764c2f249ecec1f883108d1589cfe Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 1 Jun 2025 17:13:01 +0200 Subject: [PATCH 2/2] Fixup: Import is_overlong_enc --- base/strings/unicode.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 438ad1e23fc0b..5dcc41b030473 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -6,7 +6,7 @@ module Unicode import Base: show, ==, hash, string, Symbol, isless, length, eltype, convert, isvalid, ismalformed, isoverlong, iterate, AnnotatedString, AnnotatedChar, annotated_chartransform, - @assume_effects, annotations + @assume_effects, annotations, is_overlong_enc # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff