Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve regex search functions signatures #24116

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,8 @@ Deprecated or removed
predicates for some methods ([#24673]

* `ismatch(regex, str)` has been deprecated in favor of `contains(str, regex)` ([#24673]).
Also `ismatch(regex, str, offset)` is deprecated, use `SubString` to get
a desired offset of `str`.

* `linspace` and `logspace` now require an explicit number of elements to be
supplied rather than defaulting to `50`([#24794], [#24805]).
Expand Down
8 changes: 8 additions & 0 deletions base/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1240,6 +1240,14 @@ end

@deprecate ismatch(r::Regex, s::AbstractString) contains(s, r)

function ismatch(r::Regex, s::AbstractString, offset::Integer)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't @deprecate ismatch(r::Regex, s::AbstractString, offset::Integer) contains(SubString(s, offset+1), r) be enough?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would :). Fixing

depwarn("`ismatch(regex, str, offset)` is deprecated. " *
"You can use `contains(SubString(str, offset+1), regex)` instead.", :ismatch)
compile(r)
return PCRE.exec(r.regex, String(s), offset, r.match_options,
r.match_data)
end

@deprecate findin(a, b) findall(occursin(b), a)

@deprecate find findall
Expand Down
53 changes: 27 additions & 26 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -141,16 +141,14 @@ function getindex(m::RegexMatch, name::Symbol)
end
getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)]

function contains(s::AbstractString, r::Regex, offset::Integer=0)
function contains(s::AbstractString, r::Regex)
compile(r)
return PCRE.exec(r.regex, String(s), offset, r.match_options,
r.match_data)
return PCRE.exec(r.regex, String(s), 0, r.match_options, r.match_data)
end

function contains(s::SubString, r::Regex, offset::Integer=0)
function contains(s::SubString{String}, r::Regex)
compile(r)
return PCRE.exec(r.regex, s, offset, r.match_options,
r.match_data)
return PCRE.exec(r.regex, s, 0, r.match_options, r.match_data)
end

(r::Regex)(s) = contains(s, r)
Expand All @@ -161,7 +159,7 @@ end
Search for the first match of the regular expression `r` in `s` and return a `RegexMatch`
object containing the match, or nothing if the match failed. The matching substring can be
retrieved by accessing `m.match` and the captured sequences can be retrieved by accessing
`m.captures` The optional `idx` argument specifies an index at which to start the search.
`m.captures`. The optional `idx` argument specifies an index at which to start the search.

# Examples
```jldoctest
Expand All @@ -184,26 +182,27 @@ true
"""
function match end

function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer, add_opts::UInt32=UInt32(0))
function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer,
add_opts::UInt32=UInt32(0))
compile(re)
opts = re.match_options | add_opts
if !PCRE.exec(re.regex, str, idx-1, opts, re.match_data)
return nothing
end
ovec = re.ovec
n = div(length(ovec),2) - 1
mat = SubString(str, ovec[1]+1, prevind(str, ovec[2]+1))
# thisind is needed to correct for the fact that ovec is 0-based
# and that ovec[2] is pointing at the last byte of the match
mat = SubString(str, ovec[1]+1, thisind(str, ovec[2]))
cap = Union{Nothing,SubString{String}}[ovec[2i+1] == PCRE.UNSET ? nothing :
SubString(str, ovec[2i+1]+1,
prevind(str, ovec[2i+2]+1)) for i=1:n]
SubString(str, ovec[2i+1]+1,
thisind(str, ovec[2i+2])) for i=1:n]
off = Int[ ovec[2i+1]+1 for i=1:n ]
RegexMatch(mat, cap, ovec[1]+1, off, re)
end

match(r::Regex, s::AbstractString) = match(r, s, firstindex(s))
match(r::Regex, s::AbstractString, i::Integer) = throw(ArgumentError(
"regex matching is only available for the String type; use String(s) to convert"
))
match(r::Regex, s::AbstractString, i::Integer) = match(r, String(s), i)

"""
matchall(r::Regex, s::AbstractString; overlap::Bool = false]) -> Vector{AbstractString}
Expand All @@ -228,11 +227,11 @@ julia> matchall(rx, "a1a2a3a", overlap = true)
"a3a"
```
"""
function matchall(re::Regex, str::String; overlap::Bool = false)
function matchall(re::Regex, str::Union{SubString{String},String}; overlap::Bool = false)
regex = compile(re).regex
n = sizeof(str)
matches = SubString{String}[]
offset = UInt32(0)
offset = Csize_t(0)
opts = re.match_options
opts_nonempty = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART
prevempty = false
Expand All @@ -241,19 +240,21 @@ function matchall(re::Regex, str::String; overlap::Bool = false)
result = PCRE.exec(regex, str, offset, prevempty ? opts_nonempty : opts, re.match_data)
if !result
if prevempty && offset < n
offset = UInt32(nextind(str, offset + 1) - 1)
offset = Csize_t(nextind(str, offset + 1) - 1)
prevempty = false
continue
else
break
end
end

push!(matches, SubString(str, ovec[1]+1, ovec[2]))
# thisind is needed to correct for the fact that ovec is 0-based
# and that ovec[2] is pointing at the last byte of the match
push!(matches, SubString(str, ovec[1]+1, thisind(str, ovec[2])))
prevempty = offset == ovec[2]
if overlap
if !prevempty
offset = UInt32(ovec[1]+1)
offset = Csize_t(ovec[1]+1)
end
else
offset = ovec[2]
Expand All @@ -262,23 +263,23 @@ function matchall(re::Regex, str::String; overlap::Bool = false)
matches
end

matchall(re::Regex, str::SubString; overlap::Bool = false) =
matchall(re::Regex, str::AbstractString; overlap::Bool = false) =
matchall(re, String(str), overlap = overlap)

# TODO: return only start index and update deprecation
function findnext(re::Regex, str::Union{String,SubString}, idx::Integer)
function findnext(re::Regex, str::Union{String,SubString{String}}, idx::Integer)
if idx > nextind(str,lastindex(str))
throw(BoundsError())
end
opts = re.match_options
compile(re)
# thisind is needed to correct for the fact that ovec is 0-based
# and that ovec[2] is pointing at the last byte of the match
PCRE.exec(re.regex, str, idx-1, opts, re.match_data) ?
((Int(re.ovec[1])+1):prevind(str,Int(re.ovec[2])+1)) : (0:-1)
((Int(re.ovec[1])+1):thisind(str,Int(re.ovec[2]))) : (0:-1)
end
findnext(r::Regex, s::AbstractString, idx::Integer) = throw(ArgumentError(
"regex search is only available for the String type; use String(s) to convert"
))
findfirst(r::Regex, s::AbstractString) = findnext(r,s,firstindex(s))
findnext(r::Regex, s::AbstractString, idx::Integer) = findnext(r, String(s), idx)
findfirst(r::Regex, s::AbstractString) = findnext(r, s, firstindex(s))

struct SubstitutionString{T<:AbstractString} <: AbstractString
string::T
Expand Down
2 changes: 1 addition & 1 deletion base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ julia> findprev("Julia", "JuliaLang", 6)
findprev(t::AbstractString, s::AbstractString, i::Integer) = _rsearch(s, t, i)

"""
contains(haystack::AbstractString, needle::Union{AbstractString,Regex,Char})
contains(haystack::AbstractString, needle::Union{AbstractString,Char,Regex})

Determine whether the second argument is a substring of the first. If `needle`
is a regular expression, checks whether `haystack` contains a match.
Expand Down
13 changes: 10 additions & 3 deletions test/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ for f in [matchall, collect_eachmatch]
@test f(r"GCG","GCGCG",overlap=true) == ["GCG","GCG"]
end

# Issue #24157
@test matchall(r"fé", "café") == ["fé"]

# Issue 8278
target = """71.163.72.113 - - [30/Jul/2014:16:40:55 -0700] "GET emptymind.org/thevacantwall/wp-content/uploads/2013/02/DSC_006421.jpg HTTP/1.1" 200 492513 "http://images.search.yahoo.com/images/view;_ylt=AwrB8py9gdlTGEwADcSjzbkF;_ylu=X3oDMTI2cGZrZTA5BHNlYwNmcC1leHAEc2xrA2V4cARvaWQDNTA3NTRiMzYzY2E5OTEwNjBiMjc2YWJhMjkxMTEzY2MEZ3BvcwM0BGl0A2Jpbmc-?back=http%3A%2F%2Fus.yhs4.search.yahoo.com%2Fyhs%2Fsearch%3Fei%3DUTF-8%26p%3Dapartheid%2Bwall%2Bin%2Bpalestine%26type%3Dgrvydef%26param1%3D1%26param2%3Dsid%253Db01676f9c26355f014f8a9db87545d61%2526b%253DChrome%2526ip%253D71.163.72.113%2526p%253Dgroovorio%2526x%253DAC811262A746D3CD%2526dt%253DS940%2526f%253D7%2526a%253Dgrv_tuto1_14_30%26hsimp%3Dyhs-fullyhosted_003%26hspart%3Dironsource&w=588&h=387&imgurl=occupiedpalestine.files.wordpress.com%2F2012%2F08%2F5-peeking-through-the-wall.jpg%3Fw%3D588%26h%3D387&rurl=http%3A%2F%2Fwww.stopdebezetting.com%2Fwereldpers%2Fcompare-the-berlin-wall-vs-israel-s-apartheid-wall-in-palestine.html&size=49.0KB&name=...+%3Cb%3EApartheid+wall+in+Palestine%3C%2Fb%3E...+%7C+Or+you+go+peeking+through+the+%3Cb%3Ewall%3C%2Fb%3E&p=apartheid+wall+in+palestine&oid=50754b363ca991060b276aba291113cc&fr2=&fr=&tt=...+%3Cb%3EApartheid+wall+in+Palestine%3C%2Fb%3E...+%7C+Or+you+go+peeking+through+the+%3Cb%3Ewall%3C%2Fb%3E&b=0&ni=21&no=4&ts=&tab=organic&sigr=13evdtqdq&sigb=19k7nsjvb&sigi=12o2la1db&sigt=12lia2m0j&sign=12lia2m0j&.crumb=.yUtKgFI6DE&hsimp=yhs-fullyhosted_003&hspart=ironsource" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
pat = r"""([\d\.]+) ([\w.-]+) ([\w.-]+) (\[.+\]) "([^"\r\n]*|[^"\r\n\[]*\[.+\][^"]+|[^"\r\n]+.[^"]+)" (\d{3}) (\d+|-) ("(?:[^"]|\")+)"? ("(?:[^"]|\")+)"?"""
Expand All @@ -34,9 +37,13 @@ show(buf, r"")
# see #10994, #11447: PCRE2 allows NUL chars in the pattern
@test contains("a\0b", Regex("^a\0b\$"))

# regex match / search string must be a String
@test_throws ArgumentError match(r"test", GenericString("this is a test"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If these are now supposed to work, better keep them and check that they return the correct result.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

@test_throws ArgumentError findfirst(r"test", GenericString("this is a test"))
# regex match / findfirst string must be a String
@test match(r"test", GenericString("some test")).match == match(r"test", "some test").match
@test findfirst(r"test", GenericString("this is a test")) == findfirst(r"test", "this is a test")
@test findfirst(r"ABC", "") == 0:-1
@test findnext(r"ABC", "", 1) == 0:-1
@test findfirst(r"A.C", "_ABC_") == 2:4
@test findnext(r"A.C", "_ABC_", 3) == 0:-1

# Named subpatterns
let m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
Expand Down