Skip to content

Commit

Permalink
Support case-changes to Annotated{String,Char}s
Browse files Browse the repository at this point in the history
Previously, any case changes to Annotated{String,Char} types triggered
"fall back to non-annotated type" non-specialised methods. It would be
nice to keep the annotations though, and that can be done so long as we
keep track of any potential changes to the number of bytes taken by each
character on case changes. This is unusual, but can happen with some
letters (e.g. the upper case of 'ſ' is 'S').

To handle this, a helper function annotated_chartransform is introduced.
This allows for efficient uppercase/lowercase methods (about 50%
overhead in managing the annotation ranges, compared to just
transforming a String). The {upper,lower}casefirst and titlecase
transformations are much more inefficient with this style of
implementation, but not prohibitively so. If somebody has a bright idea,
or they emerge as an area deserving of more attention, the performance
characteristics can be improved.

As a bonus, a specialised textwidth method is implemented to avoid the
generic fallback, providing a ~12x performance improvement.

To check that annotated_chartransform is accurate, as are the
specialised case-transformations, a few million random collections of
strings were pre- and post-annotated and checked to be the same in a
fuzzing check performed with Supposition.jl.

    const short_str = Data.Text(Data.Characters(), max_len=20)
    const short_strs = Data.Vectors(short_str, max_size=10)
    const case_transform_fn = Data.SampledFrom((uppercase, lowercase))

    # Property under test: transforming the joined annotated strings gives the
    # same result as transforming each string first, then annotating and joining.
    function annot_caseinvariant(f::Function, strs::Vector{String})
        annot_strs =
            map(((i, s),) -> AnnotatedString(s, [(1:ncodeunits(s), :i => i)]),
                enumerate(strs))
        f_annot_strs =
            map(((i, s),) -> AnnotatedString(s, [(1:ncodeunits(s), :i => i)]),
                enumerate(map(f, strs)))
        # The transform function is the first argument of annotated_chartransform.
        pre_join = Base.annotated_chartransform(f, join(annot_strs))
        post_join = join(f_annot_strs)
        pre_join == post_join
    end

    @check max_examples=1_000_000 annot_caseinvariant(case_transform_fn, short_strs)

This helped me determine that in annotated_chartransform the "- 1" was
needed in the offset position calculation, and that in the "findlast"
calls less than *or equal* was the correct comparison.
  • Loading branch information
tecosaur committed Apr 10, 2024
1 parent d183ee1 commit cd05c11
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 1 deletion.
45 changes: 45 additions & 0 deletions base/strings/annotated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,51 @@ Get all annotations of `chr`, in the form of a vector of annotation pairs.
"""
annotations(c::AnnotatedChar) = c.annotations

## Character transformation helper function, cf. `unicode.jl`.

"""
    annotated_chartransform(f::Function, str::AnnotatedString, state=nothing)

Transform every character in `str` with `f`, adjusting annotation regions as
appropriate. `f` must take one of two forms, either:
- `f(c::Char) -> Char`, or
- `f(c::Char, state) -> (Char, state)`.

The two-argument form is used whenever `state` is not `nothing`.

This works by comparing the number of code units of each character before and
after transforming with `f`, recording and aggregating any differences, then
applying them to the annotation regions.

Returns an `AnnotatedString{String}` (regardless of the original underlying
string type of `str`).
"""
function annotated_chartransform(f::Function, str::AnnotatedString, state=nothing)
    outstr = IOBuffer()
    annots = Tuple{UnitRange{Int}, Pair{Symbol, Any}}[]
    bytepos = firstindex(str) - 1
    # `position => cumulative shift` pairs in increasing position order. The
    # seed entry sits just before the first index, so a region starting at or
    # after the first index always finds a match in the lookups below.
    offsets = [bytepos => 0]
    for c in str.string
        oldnb = ncodeunits(c)
        # After this increment `bytepos` is the last code unit of the
        # untransformed character.
        bytepos += oldnb
        if isnothing(state)
            c = f(c)
        else
            c, state = f(c, state)
        end
        nb = write(outstr, c)
        if nb != oldnb
            # Only record a new entry when the running shift actually changes.
            push!(offsets, bytepos => last(last(offsets)) + nb - oldnb)
        end
    end
    for annot in str.annotations
        region, value = annot
        start, stop = first(region), last(region)
        # Find the last recorded position at or before each endpoint (`first`
        # of the pair) and take the cumulative shift stored with it (`last`).
        start_offset = last(offsets[findlast(<=(start) ∘ first, offsets)::Int])
        stop_offset = last(offsets[findlast(<=(stop) ∘ first, offsets)::Int])
        push!(annots, ((start + start_offset):(stop + stop_offset), value))
    end
    return AnnotatedString(String(take!(outstr)), annots)
end

## AnnotatedIOBuffer

struct AnnotatedIOBuffer <: AbstractPipe
Expand Down
52 changes: 51 additions & 1 deletion base/strings/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
module Unicode

import Base: show, ==, hash, string, Symbol, isless, length, eltype,
convert, isvalid, ismalformed, isoverlong, iterate
convert, isvalid, ismalformed, isoverlong, iterate,
AnnotatedString, AnnotatedChar, annotated_chartransform

# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff

Expand Down Expand Up @@ -271,6 +272,8 @@ julia> textwidth("March")
"""
textwidth(s::AbstractString) = mapreduce(textwidth, +, s; init=0)

# Delegate to the wrapped string rather than going through the generic
# per-character `mapreduce` method for `AbstractString`.
function textwidth(s::AnnotatedString)
    return textwidth(s.string)
end

"""
lowercase(c::AbstractChar)
Expand All @@ -290,6 +293,8 @@ julia> lowercase('Ö')
lowercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) :
T(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))

# Lowercase the wrapped character and carry its annotations over unchanged.
# The `annotations` field is read directly (it is what `annotations(c)`
# returns), so this method does not rely on the `annotations` accessor being
# in scope in this module.
function lowercase(c::AnnotatedChar)
    return AnnotatedChar(lowercase(c.char), c.annotations)
end

"""
uppercase(c::AbstractChar)
Expand All @@ -309,6 +314,8 @@ julia> uppercase('ê')
uppercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
T(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))

# Uppercase the wrapped character and carry its annotations over unchanged.
# The `annotations` field is read directly (it is what `annotations(c)`
# returns), so this method does not rely on the `annotations` accessor being
# in scope in this module.
function uppercase(c::AnnotatedChar)
    return AnnotatedChar(uppercase(c.char), c.annotations)
end

"""
titlecase(c::AbstractChar)
Expand All @@ -332,6 +339,8 @@ julia> uppercase('dž')
titlecase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
T(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))

# Title-case the wrapped character and carry its annotations over unchanged.
# The `annotations` field is read directly (it is what `annotations(c)`
# returns), so this method does not rely on the `annotations` accessor being
# in scope in this module.
function titlecase(c::AnnotatedChar)
    return AnnotatedChar(titlecase(c.char), c.annotations)
end

############################################################################

# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
Expand Down Expand Up @@ -606,6 +615,7 @@ julia> uppercase("Julia")
```
"""
uppercase(s::AbstractString) = map(uppercase, s)
# Uppercase each character via `annotated_chartransform`, which returns an
# annotated string with adjusted annotation regions.
function uppercase(s::AnnotatedString)
    return annotated_chartransform(uppercase, s)
end

"""
lowercase(s::AbstractString)
Expand All @@ -621,6 +631,7 @@ julia> lowercase("STRINGS AND THINGS")
```
"""
lowercase(s::AbstractString) = map(lowercase, s)
# Lowercase each character via `annotated_chartransform`, which returns an
# annotated string with adjusted annotation regions.
function lowercase(s::AnnotatedString)
    return annotated_chartransform(lowercase, s)
end

"""
titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String
Expand Down Expand Up @@ -669,6 +680,23 @@ function titlecase(s::AbstractString; wordsep::Function = !isletter, strict::Boo
return String(take!(b))
end

# TODO: improve performance characteristics, room for a ~10x improvement.
# Title-case an annotated string through `annotated_chartransform`.
# `wordsep` decides which characters separate words; when `strict` is true,
# characters that do not start a word are lowercased, otherwise they are kept.
function titlecase(s::AnnotatedString; wordsep::Function = !isletter, strict::Bool=true)
    # State threaded through the transform: whether the next character starts
    # a word, the grapheme-break state, the previous character, and the options.
    initial_state = (; startword = true, state = Ref{Int32}(0),
                     c0 = eltype(s)(zero(UInt32)), wordsep, strict)
    # `do c, state` (no parentheses) defines a two-argument callback, matching
    # how `annotated_chartransform` invokes it: `f(c, state)`. The parenthesised
    # form `do (c, state)` would instead take a single tuple argument.
    return annotated_chartransform(s, initial_state) do c, state
        if isgraphemebreak!(state.state, state.c0, c) && state.wordsep(c)
            state = Base.setindex(state, true, :startword)
            cnew = c
        else
            cnew = state.startword ? titlecase(c) : state.strict ? lowercase(c) : c
            state = Base.setindex(state, false, :startword)
        end
        state = Base.setindex(state, c, :c0)
        return (cnew, state)
    end
end

"""
uppercasefirst(s::AbstractString) -> String
Expand All @@ -693,6 +721,17 @@ function uppercasefirst(s::AbstractString)
string(c′, SubString(s, nextind(s, 1)))
end

# TODO: improve performance characteristics, room for a ~5x improvement.
# Title-case only the first character of an annotated string. The boolean
# state is `true` until the first character has been handled; every later
# character is passed through unchanged.
function uppercasefirst(s::AnnotatedString)
    # `do c, isfirst` (no parentheses) defines a two-argument callback, matching
    # the `f(c, state)` call made by `annotated_chartransform`.
    return annotated_chartransform(s, true) do c, isfirst
        if isfirst
            return (titlecase(c), false)
        else
            return (c, isfirst)
        end
    end
end

"""
lowercasefirst(s::AbstractString)
Expand All @@ -715,6 +754,17 @@ function lowercasefirst(s::AbstractString)
string(c′, SubString(s, nextind(s, 1)))
end

# TODO: improve performance characteristics, room for a ~5x improvement.
# Lowercase only the first character of an annotated string. The boolean
# state is `true` until the first character has been handled; every later
# character is passed through unchanged.
function lowercasefirst(s::AnnotatedString)
    # `do c, isfirst` (no parentheses) defines a two-argument callback, matching
    # the `f(c, state)` call made by `annotated_chartransform`.
    return annotated_chartransform(s, true) do c, isfirst
        if isfirst
            return (lowercase(c), false)
        else
            return (c, isfirst)
        end
    end
end

############################################################################
# iterators for grapheme segmentation

Expand Down
27 changes: 27 additions & 0 deletions test/strings/annotated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,33 @@ end
@test reverse(str2) == Base.AnnotatedString("esac", [(2:3, :label => "oomph")])
end

@testset "Unicode" begin
    # Annotate each word over its full code-unit span with its list position.
    annotate_words(ws) =
        [Base.AnnotatedString(w, [(1:ncodeunits(w), :i => i)])
         for (i, w) in enumerate(ws)]
    # Compare transforming the joined annotated string against joining the
    # already-transformed, freshly annotated words: both the text and the
    # annotations must agree.
    function check_transform(transform, ann_str, expected_words)
        ann_t_str = join(annotate_words(expected_words), '-')
        t_ann_str = transform(ann_str)
        @test String(ann_t_str) == String(t_ann_str)
        @test Base.annotations(ann_t_str) == Base.annotations(t_ann_str)
    end
    word_lists = (["ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE", "Сodeunıts"],
                  ["Сodeunıts", "ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE"])
    for words in word_lists
        ann_str = join(annotate_words(words), '-')
        # Whole-string transforms: every word is transformed.
        for transform in (lowercase, uppercase, titlecase)
            check_transform(transform, ann_str, map(transform, words))
        end
        # First-character transforms: only the first word is transformed.
        for transform in (uppercasefirst, lowercasefirst)
            check_transform(transform, ann_str,
                            vcat(transform(first(words)), words[2:end]))
        end
    end
end

@testset "AnnotatedIOBuffer" begin
aio = Base.AnnotatedIOBuffer()
# Append-only writing
Expand Down

0 comments on commit cd05c11

Please sign in to comment.