diff --git a/NEWS.md b/NEWS.md index 887c77de12b41..3474b89241e9c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -984,6 +984,8 @@ Deprecated or removed predicates for some methods ([#24673] * `ismatch(regex, str)` has been deprecated in favor of `contains(str, regex)` ([#24673]). + Also `ismatch(regex, str, offset)` is deprecated, use `SubString` to get + a desired offset of `str`. * `linspace` and `logspace` now require an explicit number of elements to be supplied rather than defaulting to `50`([#24794], [#24805]). diff --git a/base/deprecated.jl b/base/deprecated.jl index d4f817492a544..0cb9c29f8be04 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1239,6 +1239,7 @@ end @deprecate rsearchindex(s::AbstractString, c::Char, i::Integer) findprev(equalto(c), s, i) @deprecate ismatch(r::Regex, s::AbstractString) contains(s, r) +@deprecate ismatch(r::Regex, s::AbstractString, offset::Integer) contains(SubString(s, offset+1), r) @deprecate findin(a, b) findall(occursin(b), a) diff --git a/base/regex.jl b/base/regex.jl index efc6f91b8d70e..a32688f0e840a 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -141,16 +141,14 @@ function getindex(m::RegexMatch, name::Symbol) end getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)] -function contains(s::AbstractString, r::Regex, offset::Integer=0) +function contains(s::AbstractString, r::Regex) compile(r) - return PCRE.exec(r.regex, String(s), offset, r.match_options, - r.match_data) + return PCRE.exec(r.regex, String(s), 0, r.match_options, r.match_data) end -function contains(s::SubString, r::Regex, offset::Integer=0) +function contains(s::SubString{String}, r::Regex) compile(r) - return PCRE.exec(r.regex, s, offset, r.match_options, - r.match_data) + return PCRE.exec(r.regex, s, 0, r.match_options, r.match_data) end (r::Regex)(s) = contains(s, r) @@ -161,7 +159,7 @@ end Search for the first match of the regular expression `r` in `s` and return a `RegexMatch` object containing the match, or nothing if the match failed. The matching substring can be retrieved by accessing `m.match` and the captured sequences can be retrieved by accessing -`m.captures` The optional `idx` argument specifies an index at which to start the search. +`m.captures`. The optional `idx` argument specifies an index at which to start the search. # Examples ```jldoctest @@ -184,7 +182,8 @@ true """ function match end -function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer, add_opts::UInt32=UInt32(0)) +function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer, + add_opts::UInt32=UInt32(0)) compile(re) opts = re.match_options | add_opts if !PCRE.exec(re.regex, str, idx-1, opts, re.match_data) @@ -192,18 +191,18 @@ function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer, a end ovec = re.ovec n = div(length(ovec),2) - 1 - mat = SubString(str, ovec[1]+1, prevind(str, ovec[2]+1)) + # thisind is needed to correct for the fact that ovec is 0-based + # and that ovec[2] is pointing at the last byte of the match + mat = SubString(str, ovec[1]+1, thisind(str, ovec[2])) cap = Union{Nothing,SubString{String}}[ovec[2i+1] == PCRE.UNSET ? nothing : - SubString(str, ovec[2i+1]+1, - prevind(str, ovec[2i+2]+1)) for i=1:n] + SubString(str, ovec[2i+1]+1, + thisind(str, ovec[2i+2])) for i=1:n] off = Int[ ovec[2i+1]+1 for i=1:n ] RegexMatch(mat, cap, ovec[1]+1, off, re) end match(r::Regex, s::AbstractString) = match(r, s, firstindex(s)) -match(r::Regex, s::AbstractString, i::Integer) = throw(ArgumentError( - "regex matching is only available for the String type; use String(s) to convert" -)) +match(r::Regex, s::AbstractString, i::Integer) = match(r, String(s), i) """ matchall(r::Regex, s::AbstractString; overlap::Bool = false]) -> Vector{AbstractString} @@ -228,11 +227,11 @@ julia> matchall(rx, "a1a2a3a", overlap = true) "a3a" ``` """ -function matchall(re::Regex, str::String; overlap::Bool = false) +function matchall(re::Regex, str::Union{SubString{String},String}; overlap::Bool = false) regex = compile(re).regex n = sizeof(str) matches = SubString{String}[] - offset = UInt32(0) + offset = Csize_t(0) opts = re.match_options opts_nonempty = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART prevempty = false @@ -241,7 +240,7 @@ function matchall(re::Regex, str::String; overlap::Bool = false) result = PCRE.exec(regex, str, offset, prevempty ? opts_nonempty : opts, re.match_data) if !result if prevempty && offset < n - offset = UInt32(nextind(str, offset + 1) - 1) + offset = Csize_t(nextind(str, offset + 1) - 1) prevempty = false continue else @@ -249,11 +248,13 @@ function matchall(re::Regex, str::String; overlap::Bool = false) end end - push!(matches, SubString(str, ovec[1]+1, ovec[2])) + # thisind is needed to correct for the fact that ovec is 0-based + # and that ovec[2] is pointing at the last byte of the match + push!(matches, SubString(str, ovec[1]+1, thisind(str, ovec[2]))) prevempty = offset == ovec[2] if overlap if !prevempty - offset = UInt32(ovec[1]+1) + offset = Csize_t(ovec[1]+1) end else offset = ovec[2] @@ -262,23 +263,23 @@ function matchall(re::Regex, str::String; overlap::Bool = false) matches end -matchall(re::Regex, str::SubString; overlap::Bool = false) = +matchall(re::Regex, str::AbstractString; overlap::Bool = false) = matchall(re, String(str), overlap = overlap) # TODO: return only start index and update deprecation -function findnext(re::Regex, str::Union{String,SubString}, idx::Integer) +function findnext(re::Regex, str::Union{String,SubString{String}}, idx::Integer) if idx > nextind(str,lastindex(str)) throw(BoundsError()) end opts = re.match_options compile(re) + # thisind is needed to correct for the fact that ovec is 0-based + # and that ovec[2] is pointing at the last byte of the match PCRE.exec(re.regex, str, idx-1, opts, re.match_data) ? - ((Int(re.ovec[1])+1):prevind(str,Int(re.ovec[2])+1)) : (0:-1) + ((Int(re.ovec[1])+1):thisind(str,Int(re.ovec[2]))) : (0:-1) end -findnext(r::Regex, s::AbstractString, idx::Integer) = throw(ArgumentError( - "regex search is only available for the String type; use String(s) to convert" -)) -findfirst(r::Regex, s::AbstractString) = findnext(r,s,firstindex(s)) +findnext(r::Regex, s::AbstractString, idx::Integer) = findnext(r, String(s), idx) +findfirst(r::Regex, s::AbstractString) = findnext(r, s, firstindex(s)) struct SubstitutionString{T<:AbstractString} <: AbstractString string::T diff --git a/base/strings/search.jl b/base/strings/search.jl index 2b3c0633e07f3..1ccd2f1c5af84 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -426,7 +426,7 @@ julia> findprev("Julia", "JuliaLang", 6) findprev(t::AbstractString, s::AbstractString, i::Integer) = _rsearch(s, t, i) """ - contains(haystack::AbstractString, needle::Union{AbstractString,Regex,Char}) + contains(haystack::AbstractString, needle::Union{AbstractString,Char,Regex}) Determine whether the second argument is a substring of the first. If `needle` is a regular expression, checks whether `haystack` contains a match. diff --git a/test/regex.jl b/test/regex.jl index 67bfd8616dcb8..67970a566b1e7 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -21,6 +21,9 @@ for f in [matchall, collect_eachmatch] @test f(r"GCG","GCGCG",overlap=true) == ["GCG","GCG"] end +# Issue #24157 +@test matchall(r"fé", "café") == ["fé"] + # Issue 8278 target = """71.163.72.113 - - [30/Jul/2014:16:40:55 -0700] "GET emptymind.org/thevacantwall/wp-content/uploads/2013/02/DSC_006421.jpg HTTP/1.1" 200 492513 "http://images.search.yahoo.com/images/view;_ylt=AwrB8py9gdlTGEwADcSjzbkF;_ylu=X3oDMTI2cGZrZTA5BHNlYwNmcC1leHAEc2xrA2V4cARvaWQDNTA3NTRiMzYzY2E5OTEwNjBiMjc2YWJhMjkxMTEzY2MEZ3BvcwM0BGl0A2Jpbmc-?back=http%3A%2F%2Fus.yhs4.search.yahoo.com%2Fyhs%2Fsearch%3Fei%3DUTF-8%26p%3Dapartheid%2Bwall%2Bin%2Bpalestine%26type%3Dgrvydef%26param1%3D1%26param2%3Dsid%253Db01676f9c26355f014f8a9db87545d61%2526b%253DChrome%2526ip%253D71.163.72.113%2526p%253Dgroovorio%2526x%253DAC811262A746D3CD%2526dt%253DS940%2526f%253D7%2526a%253Dgrv_tuto1_14_30%26hsimp%3Dyhs-fullyhosted_003%26hspart%3Dironsource&w=588&h=387&imgurl=occupiedpalestine.files.wordpress.com%2F2012%2F08%2F5-peeking-through-the-wall.jpg%3Fw%3D588%26h%3D387&rurl=http%3A%2F%2Fwww.stopdebezetting.com%2Fwereldpers%2Fcompare-the-berlin-wall-vs-israel-s-apartheid-wall-in-palestine.html&size=49.0KB&name=...+%3Cb%3EApartheid+wall+in+Palestine%3C%2Fb%3E...+%7C+Or+you+go+peeking+through+the+%3Cb%3Ewall%3C%2Fb%3E&p=apartheid+wall+in+palestine&oid=50754b363ca991060b276aba291113cc&fr2=&fr=&tt=...+%3Cb%3EApartheid+wall+in+Palestine%3C%2Fb%3E...+%7C+Or+you+go+peeking+through+the+%3Cb%3Ewall%3C%2Fb%3E&b=0&ni=21&no=4&ts=&tab=organic&sigr=13evdtqdq&sigb=19k7nsjvb&sigi=12o2la1db&sigt=12lia2m0j&sign=12lia2m0j&.crumb=.yUtKgFI6DE&hsimp=yhs-fullyhosted_003&hspart=ironsource" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""" pat = r"""([\d\.]+) ([\w.-]+) ([\w.-]+) (\[.+\]) "([^"\r\n]*|[^"\r\n\[]*\[.+\][^"]+|[^"\r\n]+.[^"]+)" (\d{3}) (\d+|-) ("(?:[^"]|\")+)"? ("(?:[^"]|\")+)"?""" @@ -34,9 +37,13 @@ show(buf, r"") # see #10994, #11447: PCRE2 allows NUL chars in the pattern @test contains("a\0b", Regex("^a\0b\$")) -# regex match / search string must be a String -@test_throws ArgumentError match(r"test", GenericString("this is a test")) -@test_throws ArgumentError findfirst(r"test", GenericString("this is a test")) +# regex match / findfirst string must be a String +@test match(r"test", GenericString("some test")).match == match(r"test", "some test").match +@test findfirst(r"test", GenericString("this is a test")) == findfirst(r"test", "this is a test") +@test findfirst(r"ABC", "") == 0:-1 +@test findnext(r"ABC", "", 1) == 0:-1 +@test findfirst(r"A.C", "_ABC_") == 2:4 +@test findnext(r"A.C", "_ABC_", 3) == 0:-1 # Named subpatterns let m = match(r"(?.)(.)(?.)", "xyz")