Skip to content

Commit

Permalink
implement replace on String for multiple patterns
Browse files Browse the repository at this point in the history
This has been attempted before, sometimes fairly similar to this, but
the attempts seemed to be either too simple or too complicated. This
aims to be simple, and even beats one of the "handwritten" benchmark
cases.

Past issues (e.g. JuliaLang#25396) have proposed that using Regex may be faster,
but in my tests, this handily bests even simplified regexes. There can
be slow Regex patterns that cause this to exhibit O(n^2) behavior,
but only if one of the earlier patterns is a partial match for a
later Regex pattern and that Regex always matches O(n) of the input
stream. This is a case that is hopefully usually avoidable in practice.

fixes JuliaLang#35327
fixes JuliaLang#39061
fixes JuliaLang#35414
fixes JuliaLang#29849
fixes JuliaLang#30457
fixes JuliaLang#25396
  • Loading branch information
vtjnash committed Apr 14, 2021
1 parent 7112c89 commit 926439b
Show file tree
Hide file tree
Showing 4 changed files with 221 additions and 29 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ Standard library changes
@test isequal(complex(one(T)) / complex(T(Inf), T(-Inf)), complex(zero(T), zero(T))) broken=(T == Float64)
```
([#39322])
* `replace(::String)` now allows multiple patterns to be specified, and they
will be applied left-to-right simultaneously, so only one pattern will be
applied to any character, and the patterns will only be applied to the input
text, not the replacements. ([#TBD])


#### Package Manager

Expand Down
1 change: 0 additions & 1 deletion base/set.jl
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,6 @@ replace!(a::Callable, b::Pair; count::Integer=-1) = throw(MethodError(replace!,
replace!(a::Callable, b::Pair, c::Pair; count::Integer=-1) = throw(MethodError(replace!, (a, b, c)))
replace(a::Callable, b::Pair; count::Integer=-1) = throw(MethodError(replace, (a, b)))
replace(a::Callable, b::Pair, c::Pair; count::Integer=-1) = throw(MethodError(replace, (a, b, c)))
replace(a::AbstractString, b::Pair, c::Pair) = throw(MethodError(replace, (a, b, c)))

### replace! for AbstractDict/AbstractSet

Expand Down
82 changes: 54 additions & 28 deletions base/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -510,56 +510,72 @@ _replace(io, repl::Function, str, r, pattern) =
_replace(io, repl::Function, str, r, pattern::Function) =
print(io, repl(str[first(r)]))

replace(str::String, pat_repl::Pair{<:AbstractChar}; count::Integer=typemax(Int)) =
replace(str, isequal(first(pat_repl)) => last(pat_repl); count=count)

replace(str::String, pat_repl::Pair{<:Union{Tuple{Vararg{AbstractChar}},
AbstractVector{<:AbstractChar},Set{<:AbstractChar}}};
count::Integer=typemax(Int)) =
replace(str, in(first(pat_repl)) => last(pat_repl), count=count)

_pat_replacer(x) = x
_free_pat_replacer(x) = nothing

function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int))
pattern, repl = pat_repl
_pat_replacer(x::AbstractChar) = isequal(x)
_pat_replacer(x::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}}) = in(x)

function replace(str::String, pat_repl::Vararg{Pair,N}; count::Integer=typemax(Int)) where N
count == 0 && return str
count < 0 && throw(DomainError(count, "`count` must be non-negative."))
n = 1
e = lastindex(str)
e1 = nextind(str, lastindex(str)) # sizeof(str)
i = a = firstindex(str)
pattern = _pat_replacer(pattern)
r = something(findnext(pattern,str,i), 0)
j, k = first(r), last(r)
if j == 0
_free_pat_replacer(pattern)
patterns = map(p -> _pat_replacer(first(p)), pat_repl)
replaces = map(last, pat_repl)
rs = map(patterns) do p
r = findnext(p, str, a)
if r === nothing || first(r) == 0
return e1+1:0
end
return r
end
if all(>(e1), map(first, rs))
foreach(_free_pat_replacer, patterns)
return str
end
out = IOBuffer(sizehint=floor(Int, 1.2sizeof(str)))
while j != 0
while true
p = argmin(map(first, rs)) # TODO: or argmin(rs), to pick the shortest first match ?
r = rs[p]
j, k = first(r), last(r)
j > e1 && break
if i == a || i <= k
# copy out preserved portion
GC.@preserve str unsafe_write(out, pointer(str, i), UInt(j-i))
_replace(out, repl, str, r, pattern)
# copy out replacement string
_replace(out, replaces[p], str, r, patterns[p])
end
if k < j
i = j
j > e && break
j == e1 && break
k = nextind(str, j)
else
i = k = nextind(str, k)
end
r = something(findnext(pattern,str,k), 0)
r === 0:-1 || n == count && break
j, k = first(r), last(r)
n == count && break
let k = k
rs = map(patterns, rs) do p, r
if first(r) < k
r = findnext(p, str, k)
if r === nothing || first(r) == 0
return e1+1:0
end
end
return r
end
end
n += 1
end
_free_pat_replacer(pattern)
write(out, SubString(str,i))
String(take!(out))
foreach(_free_pat_replacer, patterns)
write(out, SubString(str, i))
return String(take!(out))
end


"""
replace(s::AbstractString, pat=>r; [count::Integer])
replace(s::AbstractString, pat=>r, [pat2=>r2, ...]; [count::Integer])
Search for the given pattern `pat` in `s`, and replace each occurrence with `r`.
If `count` is provided, replace at most `count` occurrences.
Expand All @@ -572,6 +588,13 @@ If `pat` is a regular expression and `r` is a [`SubstitutionString`](@ref), then
references in `r` are replaced with the corresponding matched text.
To remove instances of `pat` from `string`, set `r` to the empty `String` (`""`).
Multiple patterns can be specified, and they will be applied left-to-right
simultaneously, so only one pattern will be applied to any character, and the
patterns will only be applied to the input text, not the replacements.
!!! compat "Julia 1.7"
Support for multiple patterns requires version 1.7.
# Examples
```jldoctest
julia> replace("Python is a programming language.", "Python" => "Julia")
Expand All @@ -585,10 +608,13 @@ julia> replace("The quick foxes run quickly.", "quick" => "", count=1)
julia> replace("The quick foxes run quickly.", r"fox(es)?" => s"bus\\1")
"The quick buses run quickly."
julia> replace("abcabc", "a" => "b", "b" => "c", r".+" => "a")
"bca"
```
"""
replace(s::AbstractString, pat_f::Pair; count=typemax(Int)) =
replace(String(s), pat_f, count=count)
replace(s::AbstractString, pat_f::Pair...; count=typemax(Int)) =
replace(String(s), pat_f..., count=count)

# TODO: allow transform as the first argument to replace?

Expand Down
162 changes: 162 additions & 0 deletions test/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,168 @@ end

end

# Tests for `replace(::AbstractString, pairs...)` with several `pattern => replacement`
# pairs applied in a single left-to-right pass over the input.
#
# Rules the expectations below rely on:
#   * the match with the smallest start index is replaced first;
#   * when several patterns match at the same start index, the pair listed first wins;
#   * every replacement (including one made for an empty pattern `""`) counts toward
#     `count`, so `""=>""` can use up `count=1` before a later character is reached;
#   * a non-`String` input is converted to `String`, so results are always `String`s.
@testset "replace many" begin
    # PR 35414 Francesco Alemanno <francescoalemanno710@gmail.com>
    @test replace("foobarbaz", "oo"=>"zz", "ar"=>"zz", "z"=>"m") == "fzzbzzbam"
    substmp = ["z"=>"m", "oo"=>"zz", "ar"=>"zz"]
    # the three patterns never overlap, so the order of the pairs must not matter
    for perm in [[1, 2, 3], [2, 1, 3], [3, 2, 1], [2, 3, 1], [1, 3, 2], [3, 1, 2]]
        @test replace("foobarbaz", substmp[perm]...) == "fzzbzzbam"
        @test replace("foobarbaz", substmp[perm]..., count=2) == "fzzbzzbaz"
        @test replace("foobarbaz", substmp[perm]..., count=1) == "fzzbarbaz"
    end
    @test replace("foobarbaz", "z"=>"m", r"a.*a"=>uppercase) == "foobARBAm"
    @test replace("foobarbaz", 'o'=>'z', 'a'=>'q', 'z'=>'m') == "fzzbqrbqm"


    # PR #25732 Klaus Crusius <klaus.crusius@web.de>
    @test replace("\u2202", '*' => '\0', ""=>"") == "\u2202"

    # In the `count=1` cases below the empty pattern matches at index 1, before the
    # first 'o', and that (no-op) replacement uses up the single allowed replacement.
    @test replace("foobar", 'o' => '0', ""=>"") == "f00bar"
    @test replace("foobar", 'o' => '0', count=1, ""=>"") == "foobar"
    @test replace("foobar", 'o' => "", ""=>"") == "fbar"
    @test replace("foobar", 'o' => "", count=1, ""=>"") == "foobar"
    @test replace("foobar", 'f' => 'F', ""=>"") == "Foobar"
    @test replace("foobar", 'r' => 'R', ""=>"") == "foobaR"

    @test replace("foofoofoo", "foo" => "bar", ""=>"") == "barbarbar"
    @test replace("foobarfoo", "foo" => "baz", ""=>"") == "bazbarbaz"
    @test replace("barfoofoo", "foo" => "baz", ""=>"") == "barbazbaz"

    @test replace("", "" => "", ""=>"") == ""
    @test replace("", "" => "x", ""=>"") == "x"
    @test replace("", "x" => "y", ""=>"") == ""

    @test replace("abcd", "" => "^", ""=>"") == "^a^b^c^d^"
    @test replace("abcd", "b" => "^", ""=>"") == "a^cd"
    @test replace("abcd", r"b?" => "^", ""=>"") == "^a^c^d^"
    @test replace("abcd", r"b+" => "^", ""=>"") == "a^cd"
    @test replace("abcd", r"b?c?" => "^", ""=>"") == "^a^d^"
    @test replace("abcd", r"[bc]?" => "^", ""=>"") == "^a^^d^"

    @test replace("foobarfoo", r"(fo|ba)" => "xx", ""=>"") == "xxoxxrxxo"
    @test replace("foobarfoo", r"(foo|ba)" => "bar", ""=>"") == "barbarrbar"

    # multi-byte characters; `count=1` is again consumed by the empty match at index 1
    @test replace("foobar", 'o' => 'ø', ""=>"") == "føøbar"
    @test replace("foobar", 'o' => 'ø', count=1, ""=>"") == "foobar"
    @test replace("føøbar", 'ø' => 'o', ""=>"") == "foobar"
    @test replace("føøbar", 'ø' => 'o', count=1, ""=>"") == "føøbar"
    @test replace("føøbar", 'ø' => 'ö', ""=>"") == "fööbar"
    @test replace("føøbar", 'ø' => 'ö', count=1, ""=>"") == "føøbar"
    @test replace("føøbar", 'ø' => "", ""=>"") == "fbar"
    @test replace("føøbar", 'ø' => "", count=1, ""=>"") == "føøbar"
    @test replace("føøbar", 'f' => 'F', ""=>"") == "Føøbar"
    @test replace("ḟøøbar", 'ḟ' => 'F', ""=>"") == "Føøbar"
    @test replace("føøbar", 'f' => 'Ḟ', ""=>"") == "Ḟøøbar"
    @test replace("ḟøøbar", 'ḟ' => 'Ḟ', ""=>"") == "Ḟøøbar"
    @test replace("føøbar", 'r' => 'R', ""=>"") == "føøbaR"
    @test replace("føøbaṙ", 'ṙ' => 'R', ""=>"") == "føøbaR"
    @test replace("føøbar", 'r' => 'Ṙ', ""=>"") == "føøbaṘ"
    @test replace("føøbaṙ", 'ṙ' => 'Ṙ', ""=>"") == "føøbaṘ"

    @test replace("ḟøøḟøøḟøø", "ḟøø" => "bar", ""=>"") == "barbarbar"
    @test replace("ḟøøbarḟøø", "ḟøø" => "baz", ""=>"") == "bazbarbaz"
    @test replace("barḟøøḟøø", "ḟøø" => "baz", ""=>"") == "barbazbaz"

    @test replace("foofoofoo", "foo" => "ƀäṙ", ""=>"") == "ƀäṙƀäṙƀäṙ"
    @test replace("fooƀäṙfoo", "foo" => "baz", ""=>"") == "bazƀäṙbaz"
    @test replace("ƀäṙfoofoo", "foo" => "baz", ""=>"") == "ƀäṙbazbaz"

    @test replace("foofoofoo", "foo" => "bar", ""=>"") == "barbarbar"
    @test replace("foobarfoo", "foo" => "ƀäż", ""=>"") == "ƀäżbarƀäż"
    @test replace("barfoofoo", "foo" => "ƀäż", ""=>"") == "barƀäżƀäż"

    @test replace("ḟøøḟøøḟøø", "ḟøø" => "ƀäṙ", ""=>"") == "ƀäṙƀäṙƀäṙ"
    @test replace("ḟøøƀäṙḟøø", "ḟøø" => "baz", ""=>"") == "bazƀäṙbaz"
    @test replace("ƀäṙḟøøḟøø", "ḟøø" => "baz", ""=>"") == "ƀäṙbazbaz"

    @test replace("ḟøøḟøøḟøø", "ḟøø" => "bar", ""=>"") == "barbarbar"
    @test replace("ḟøøbarḟøø", "ḟøø" => "ƀäż", ""=>"") == "ƀäżbarƀäż"
    @test replace("barḟøøḟøø", "ḟøø" => "ƀäż", ""=>"") == "barƀäżƀäż"

    @test replace("ḟøøḟøøḟøø", "ḟøø" => "ƀäṙ", ""=>"") == "ƀäṙƀäṙƀäṙ"
    @test replace("ḟøøƀäṙḟøø", "ḟøø" => "ƀäż", ""=>"") == "ƀäżƀäṙƀäż"
    @test replace("ƀäṙḟøøḟøø", "ḟøø" => "ƀäż", ""=>"") == "ƀäṙƀäżƀäż"

    @test replace("", "" => "", ""=>"") == ""
    # same case as `"" => "x"` above, with a multi-byte replacement
    @test replace("", "" => "ÿ", ""=>"") == "ÿ"

    @test replace("äƀçđ", "" => "π", ""=>"") == "πäπƀπçπđπ"
    @test replace("äƀçđ", "ƀ" => "π", ""=>"") == "äπçđ"
    @test replace("äƀçđ", r"ƀ?" => "π", ""=>"") == "πäπçπđπ"
    @test replace("äƀçđ", r"ƀ+" => "π", ""=>"") == "äπçđ"
    @test replace("äƀçđ", r"ƀ?ç?" => "π", ""=>"") == "πäπđπ"
    @test replace("äƀçđ", r"[ƀç]?" => "π", ""=>"") == "πäππđπ"

    @test replace("foobarfoo", r"(fo|ba)" => "ẍẍ", ""=>"") == "ẍẍoẍẍrẍẍo"

    @test replace("ḟøøbarḟøø", r"(ḟø|ba)" => "xx", ""=>"") == "xxøxxrxxø"
    @test replace("ḟøøbarḟøø", r"(ḟøø|ba)" => "bar", ""=>"") == "barbarrbar"

    @test replace("fooƀäṙfoo", r"(fo|ƀä)" => "xx", ""=>"") == "xxoxxṙxxo"
    @test replace("fooƀäṙfoo", r"(foo|ƀä)" => "ƀäṙ", ""=>"") == "ƀäṙƀäṙṙƀäṙ"

    @test replace("ḟøøƀäṙḟøø", r"(ḟø|ƀä)" => "xx", ""=>"") == "xxøxxṙxxø"
    @test replace("ḟøøƀäṙḟøø", r"(ḟøø|ƀä)" => "ƀäṙ", ""=>"") == "ƀäṙƀäṙṙƀäṙ"

    @test replace("foo", "oo" => uppercase, ""=>"") == "fOO"

    # Issue 13332
    @test replace("abc", 'b' => 2.1, ""=>"") == "a2.1c"

    # test replace with a count for String and GenericString
    # check that replace is a no-op if count==0
    for s in ["aaa", Test.GenericString("aaa")]
        @test_throws DomainError replace(s, 'a' => "", count = -1, ""=>"")
        # the result is always a `String`, so compare by value rather than identity
        @test replace(s, 'a' => 'z', count=0, ""=>"")::String == s
        # 'a' and "" both match at each index; the pair listed first ('a') wins
        @test replace(s, 'a' => 'z', count=1, ""=>"") == "zaa"
        @test replace(s, 'a' => 'z', count=2, ""=>"") == "zza"
        @test replace(s, 'a' => 'z', count=3, ""=>"") == "zzz"
        @test replace(s, 'a' => 'z', count=4, ""=>"") == "zzz"
        @test replace(s, 'a' => 'z', count=typemax(Int), ""=>"") == "zzz"
        @test replace(s, 'a' => 'z', ""=>"") == "zzz"
    end

    for s in ["abc"]
        @test replace(s) === s
        @test replace(s, 'a' => 'z', ""=>"") === "zbc"
        @test replace(s, 'a' => 'z', 'b' => 'y') == "zyc"
        @test replace(s, 'a' => 'z', 'c' => 'x', "b" => 'y') == "zyx"
        @test replace(s, '1' => 'z', ""=>"") == s
        # the empty match at index 1 precedes 'b' and consumes the only replacement
        @test replace(s, 'b' => "BbB", ""=>"", count=1) == "abc"
    end

    for s in ["quick quicker quickest"]
        @test replace(s) === s
        # Patterns sharing a start index: the pair listed first wins, so the longest
        # alternative must be listed first to be preferred.
        @test replace(s, "quickest" => 'z', "quicker" => uppercase, "quick" => 'a') == "a QUICKER z"
        @test replace(s, "quick" => 'a', "quicker" => uppercase, "quickest" => 'z') == "a aer aest"
        @test replace(s, "quickest"=>"lame", "quicker"=>"is", "quick"=>"Duck", count=2) == "Duck is quickest"
        @test replace(s, "quick"=>"Duck", "quicker"=>"is", "quickest"=>"lame", count=2) == "Duck Ducker quickest"
        @test replace(s, "" => '1', ""=>"") == "1q1u1i1c1k1 1q1u1i1c1k1e1r1 1q1u1i1c1k1e1s1t1"
        @test replace(s, "qu" => "QU", "qu" => "never happens", "ick" => "") == "QU QUer QUest"
        # "r " starts one character before " ", so the earlier-starting match wins
        @test replace(s, " " => '_', "r " => "r-") == "quick_quicker-quickest"
        @test replace(s, r"[aeiou]" => "ä", "ui" => "ki", "i" => "I") == "qääck qääckär qääckäst"
        @test replace(s, "i" => "I", "ui" => "ki", r"[aeiou]" => "ä") == "qkick qkickär qkickäst"
        @test replace(s, r"[^ ]+" => "word", "quicker " => "X", count=big"99") == "word word word"
        @test replace(s, "quicker " => "X", r"[^ ]+" => "word", count=big"99") == "word Xword"

        @test replace(s, r"(quick)(e)"=>s"\2-\1", "x"=>"X") == "quick e-quickr e-quickst"

        @test replace(s, 'q'=>'Q', 'u'=>'U') == "QUick QUicker QUickest"
        @test replace(s, 'q'=>'Q', r"u"=>'U') == "QUick QUicker QUickest"
        @test replace(s, 'q'=>'Q', ==('u')=>uppercase) == "QUick QUicker QUickest"
        @test replace(s, 'q'=>'Q', islowercase=>'-') == "Q---- Q------ Q-------"
        @test replace(s, ['q', 'u']=>'K') == "KKick KKicker KKickest"
        @test replace(s, occursin("uq")=>'K') == "KKick KKicker KKickest"
        @test replace(s, ==('q')=>"B") == "Buick Buicker Buickest"

        @test replace(s, "qui"=>"A", 'r'=>'R') == "Ack AckeR Ackest"
        @test replace(s, 'r'=>'x', islowercase=>uppercase) == "QUICK QUICKEx QUICKEST"
        @test replace(s, islowercase=>uppercase, 'r'=>'x') == "QUICK QUICKER QUICKEST"
        @test replace(s, "q"=>"z", islowercase=>uppercase, 'r'=>'x') == "zUICK zUICKER zUICKEST"
        @test replace(s, "qui"=>"A", 'r'=>'x', islowercase=>uppercase) == "ACK ACKEx ACKEST"
        @test replace(s, "qui"=>"A", 'r'=>'x', islowercase=>uppercase) == "ACK ACKEx ACKEST"
        @test replace(s, r"q"=>"z", islowercase=>uppercase, 'r'=>'x') == "zUICK zUICKER zUICKEST"
        # a `SubstitutionString` with a capture reference needs a Regex pattern that
        # actually defines that group
        @test_throws ErrorException("type String has no field match_data") replace(s, "q"=>s"a\1b")
        @test_throws ErrorException("PCRE error: unknown substring") replace(s, r"q"=>s"a\1b")
    end
end

@testset "chomp/chop" begin
@test chomp("foo\n") == "foo"
@test chomp("fo∀\n") == "fo∀"
Expand Down

0 comments on commit 926439b

Please sign in to comment.