Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support case-changes to Annotated{String,Char}s #54013

Merged
merged 1 commit into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions base/strings/annotated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,51 @@ Get all annotations of `chr`, in the form of a vector of annotation pairs.
"""
annotations(c::AnnotatedChar) = c.annotations

## Character transformation helper function, cf. `unicode.jl`.

"""
    annotated_chartransform(f::Function, str::AnnotatedString, state=nothing)

Transform every character in `str` with `f`, adjusting annotation regions as
appropriate. `f` must take one of two forms, either:
- `f(c::Char) -> Char`, or
- `f(c::Char, state) -> (Char, state)`.

The one-argument form is used when `state` is `nothing`; otherwise `f` is called
with the current state and must return the transformed character together with
the state to use for the next character.

This works by comparing the number of code units of each character before and
after transforming with `f`, recording and aggregating any differences, then
applying them to the annotation regions. The start of a region is shifted only
by the size changes of characters that end before it, so a region keeps
starting on the first code unit of its (transformed) first character.

Returns an `AnnotatedString{String}` (regardless of the original underlying
string type of `str`).
"""
function annotated_chartransform(f::Function, str::AnnotatedString, state=nothing)
    outstr = IOBuffer()
    annots = Tuple{UnitRange{Int}, Pair{Symbol, Any}}[]
    bytepos = firstindex(str) - 1
    # Each entry maps the index of a character's last code unit (in the original
    # string) to the cumulative change in code units up to and including that
    # character. The first entry is a zero-offset sentinel placed before the string.
    offsets = [bytepos => 0]
    for c in str.string
        oldnb = ncodeunits(c)
        bytepos += oldnb
        if isnothing(state)
            c = f(c)
        else
            c, state = f(c, state)
        end
        nb = write(outstr, c)
        if nb != oldnb
            push!(offsets, bytepos => last(last(offsets)) + nb - oldnb)
        end
    end
    for annot in str.annotations
        region, value = annot
        start, stop = first(region), last(region)
        # Only characters ending strictly before `start` may move the start: a
        # resized one-code-unit character *at* `start` is recorded at that same
        # index and must not push the start into its own interior. Fall back to
        # the sentinel entry when no recorded position precedes `start`.
        start_offset = last(offsets[something(findlast(<(start) ∘ first, offsets), 1)])
        # The stop absorbs every size change recorded at or before it.
        stop_offset = last(offsets[findlast(<=(stop) ∘ first, offsets)::Int])
        push!(annots, ((start + start_offset):(stop + stop_offset), value))
    end
    return AnnotatedString(String(take!(outstr)), annots)
end

## AnnotatedIOBuffer

struct AnnotatedIOBuffer <: AbstractPipe
Expand Down
52 changes: 51 additions & 1 deletion base/strings/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
module Unicode

import Base: show, ==, hash, string, Symbol, isless, length, eltype,
convert, isvalid, ismalformed, isoverlong, iterate
convert, isvalid, ismalformed, isoverlong, iterate,
AnnotatedString, AnnotatedChar, annotated_chartransform

# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff

Expand Down Expand Up @@ -271,6 +272,8 @@ julia> textwidth("March")
"""
textwidth(s::AbstractString) = mapreduce(textwidth, +, s; init=0)

# Width of an annotated string: measured on the wrapped string alone.
function textwidth(s::AnnotatedString)
    return textwidth(s.string)
end

"""
lowercase(c::AbstractChar)

Expand All @@ -290,6 +293,8 @@ julia> lowercase('Ö')
lowercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) :
T(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))

# Lowercase the wrapped character and carry its annotations over unchanged.
# The annotations are read from the field (like `c.char`) rather than through
# `annotations`, which is not in this module's `import Base:` list.
lowercase(c::AnnotatedChar) = AnnotatedChar(lowercase(c.char), c.annotations)

"""
uppercase(c::AbstractChar)

Expand All @@ -309,6 +314,8 @@ julia> uppercase('ê')
uppercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
T(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))

# Uppercase the wrapped character and carry its annotations over unchanged.
# The annotations are read from the field (like `c.char`) rather than through
# `annotations`, which is not in this module's `import Base:` list.
uppercase(c::AnnotatedChar) = AnnotatedChar(uppercase(c.char), c.annotations)

"""
titlecase(c::AbstractChar)

Expand All @@ -332,6 +339,8 @@ julia> uppercase('dž')
titlecase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
T(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))

# Titlecase the wrapped character and carry its annotations over unchanged.
# The annotations are read from the field (like `c.char`) rather than through
# `annotations`, which is not in this module's `import Base:` list.
titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), c.annotations)

############################################################################

# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
Expand Down Expand Up @@ -606,6 +615,7 @@ julia> uppercase("Julia")
```
"""
uppercase(s::AbstractString) = map(uppercase, s)
# Annotated strings are uppercased character by character through
# `annotated_chartransform`, which also returns the adjusted annotations.
function uppercase(s::AnnotatedString)
    return annotated_chartransform(uppercase, s)
end

"""
lowercase(s::AbstractString)
Expand All @@ -621,6 +631,7 @@ julia> lowercase("STRINGS AND THINGS")
```
"""
lowercase(s::AbstractString) = map(lowercase, s)
# Annotated strings are lowercased character by character through
# `annotated_chartransform`, which also returns the adjusted annotations.
function lowercase(s::AnnotatedString)
    return annotated_chartransform(lowercase, s)
end

"""
titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String
Expand Down Expand Up @@ -669,6 +680,23 @@ function titlecase(s::AbstractString; wordsep::Function = !isletter, strict::Boo
return String(take!(b))
end

# TODO: improve performance characteristics, room for a ~10x improvement.
# Titlecase an annotated string. The per-character decision is threaded through
# `annotated_chartransform` as an immutable named-tuple state:
# - `startword`: whether the next non-separator character begins a word,
# - `state`, `c0`: grapheme-break state and the previously seen character,
# - `wordsep`, `strict`: the keyword arguments, carried along for the callback.
function titlecase(s::AnnotatedString; wordsep::Function = !isletter, strict::Bool=true)
    # `c0` starts as a zero character of the *underlying* string's character type.
    # The callback later stores the characters it receives into `c0`, so seeding it
    # with that same type keeps the state's type constant across iterations.
    initial_state = (; startword = true, state = Ref{Int32}(0),
                     c0 = eltype(s.string)(zero(UInt32)), wordsep, strict)
    return annotated_chartransform(s, initial_state) do c, state
        if isgraphemebreak!(state.state, state.c0, c) && state.wordsep(c)
            # Separator at a grapheme boundary: keep it as-is and mark that the
            # next character starts a new word.
            state = Base.setindex(state, true, :startword)
            cnew = c
        else
            cnew = state.startword ? titlecase(c) : state.strict ? lowercase(c) : c
            state = Base.setindex(state, false, :startword)
        end
        state = Base.setindex(state, c, :c0)
        cnew, state
    end
end

"""
uppercasefirst(s::AbstractString) -> String

Expand All @@ -693,6 +721,17 @@ function uppercasefirst(s::AbstractString)
string(c′, SubString(s, nextind(s, 1)))
end

# TODO: improve performance characteristics, room for a ~5x improvement.
function uppercasefirst(s::AnnotatedString)
    # The Bool state is `true` only until the first character has been handled;
    # that character is titlecased, every later one passes through untouched.
    return annotated_chartransform(s, true) do c, pending
        pending ? (titlecase(c), false) : (c, pending)
    end
end

"""
lowercasefirst(s::AbstractString)

Expand All @@ -715,6 +754,17 @@ function lowercasefirst(s::AbstractString)
string(c′, SubString(s, nextind(s, 1)))
end

# TODO: improve performance characteristics, room for a ~5x improvement.
function lowercasefirst(s::AnnotatedString)
    # The Bool state is `true` only until the first character has been handled;
    # that character is lowercased, every later one passes through untouched.
    return annotated_chartransform(s, true) do c, pending
        pending ? (lowercase(c), false) : (c, pending)
    end
end

############################################################################
# iterators for grapheme segmentation

Expand Down
27 changes: 27 additions & 0 deletions test/strings/annotated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,33 @@ end
@test reverse(str2) == Base.AnnotatedString("esac", [(2:3, :label => "oomph")])
end

@testset "Unicode" begin
    # Annotate each word over its full code-unit span, tagged with its position
    # in the list, then join the words with '-' into one annotated string.
    annotate_and_join(ws) =
        join([Base.AnnotatedString(w, [(1:ncodeunits(w), :i => i)])
              for (i, w) in enumerate(ws)], '-')
    # Two orderings of the same words, several of which change their code-unit
    # count under case conversion.
    word_orders = (["ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE", "Сodeunıts"],
                   ["Сodeunıts", "ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE"])
    for words in word_orders
        ann_str = annotate_and_join(words)
        # Whole-string transforms: annotating the transformed words must agree
        # with transforming the annotated string.
        for transform in (lowercase, uppercase, titlecase)
            expected = annotate_and_join(map(transform, words))
            actual = transform(ann_str)
            @test String(expected) == String(actual)
            @test Base.annotations(expected) == Base.annotations(actual)
        end
        # First-character transforms only touch the first word.
        for transform in (uppercasefirst, lowercasefirst)
            expected = annotate_and_join(vcat(transform(first(words)), words[2:end]))
            actual = transform(ann_str)
            @test String(expected) == String(actual)
            @test Base.annotations(expected) == Base.annotations(actual)
        end
    end
end

@testset "AnnotatedIOBuffer" begin
aio = Base.AnnotatedIOBuffer()
# Append-only writing
Expand Down