From b5b286c378b57b8a2eb2a9546c466954cdd20600 Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Thu, 20 Jun 2019 14:59:36 -0400 Subject: [PATCH] fix #31521, make regexes thread-safe --- base/client.jl | 4 ++ base/pcre.jl | 73 ++++++++++++++++++++++++----------- base/regex.jl | 91 ++++++++++++++++++++++++++------------------ base/strings/util.jl | 5 +++ 4 files changed, 114 insertions(+), 59 deletions(-) diff --git a/base/client.jl b/base/client.jl index a4a6dc4f34b31..878a79c3dc8de 100644 --- a/base/client.jl +++ b/base/client.jl @@ -460,6 +460,10 @@ MainInclude.include function _start() empty!(ARGS) append!(ARGS, Core.ARGS) + if ccall(:jl_generating_output, Cint, ()) != 0 && JLOptions().incremental == 0 + # clear old invalid pointers + PCRE.__init__() + end try exec_options(JLOptions()) catch diff --git a/base/pcre.jl b/base/pcre.jl index 7aa91076b84ad..d5c4d4fed66a6 100644 --- a/base/pcre.jl +++ b/base/pcre.jl @@ -10,24 +10,36 @@ include(string(length(Core.ARGS) >= 2 ? Core.ARGS[2] : "", "pcre_h.jl")) # incl const PCRE_LIB = "libpcre2-8" -const JIT_STACK = RefValue{Ptr{Cvoid}}(C_NULL) -const MATCH_CONTEXT = RefValue{Ptr{Cvoid}}(C_NULL) +function create_match_context() + JIT_STACK_START_SIZE = 32768 + JIT_STACK_MAX_SIZE = 1048576 + jit_stack = ccall((:pcre2_jit_stack_create_8, PCRE_LIB), Ptr{Cvoid}, + (Cint, Cint, Ptr{Cvoid}), + JIT_STACK_START_SIZE, JIT_STACK_MAX_SIZE, C_NULL) + ctx = ccall((:pcre2_match_context_create_8, PCRE_LIB), + Ptr{Cvoid}, (Ptr{Cvoid},), C_NULL) + ccall((:pcre2_jit_stack_assign_8, PCRE_LIB), Cvoid, + (Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}), ctx, C_NULL, jit_stack) + return ctx +end -function __init__() - try - JIT_STACK_START_SIZE = 32768 - JIT_STACK_MAX_SIZE = 1048576 - JIT_STACK[] = ccall((:pcre2_jit_stack_create_8, PCRE_LIB), Ptr{Cvoid}, - (Cint, Cint, Ptr{Cvoid}), - JIT_STACK_START_SIZE, JIT_STACK_MAX_SIZE, C_NULL) - MATCH_CONTEXT[] = ccall((:pcre2_match_context_create_8, PCRE_LIB), - Ptr{Cvoid}, (Ptr{Cvoid},), C_NULL) - ccall((:pcre2_jit_stack_assign_8, PCRE_LIB), Cvoid, - (Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}), MATCH_CONTEXT[], C_NULL, JIT_STACK[]) - catch ex - Base.showerror_nostdio(ex, - "WARNING: Error during initialization of module PCRE") +const THREAD_MATCH_CONTEXTS = Ptr{Cvoid}[C_NULL] + +_tid() = Int(ccall(:jl_threadid, Int16, ())+1) +_nth() = Int(unsafe_load(cglobal(:jl_n_threads, Cint))) + +function get_local_match_context() + tid = _tid() + ctx = @inbounds THREAD_MATCH_CONTEXTS[tid] + if ctx == C_NULL + @inbounds THREAD_MATCH_CONTEXTS[tid] = ctx = create_match_context() end + return ctx +end + +function __init__() + resize!(THREAD_MATCH_CONTEXTS, _nth()) + fill!(THREAD_MATCH_CONTEXTS, C_NULL) end # supported options for different use cases @@ -87,12 +99,16 @@ function info(regex::Ptr{Cvoid}, what::Integer, ::Type{T}) where T buf[] end -function get_ovec(match_data) - ptr = ccall((:pcre2_get_ovector_pointer_8, PCRE_LIB), Ptr{Csize_t}, - (Ptr{Cvoid},), match_data) +function ovec_length(match_data) n = ccall((:pcre2_get_ovector_count_8, PCRE_LIB), UInt32, (Ptr{Cvoid},), match_data) - unsafe_wrap(Array, ptr, 2n, own = false) + return 2n +end + +function ovec_ptr(match_data) + ptr = ccall((:pcre2_get_ovector_pointer_8, PCRE_LIB), Ptr{Csize_t}, + (Ptr{Cvoid},), match_data) + return ptr end function compile(pattern::AbstractString, options::Integer) @@ -132,15 +148,28 @@ function err_message(errno) GC.@preserve buffer unsafe_string(pointer(buffer)) end -function exec(re,subject,offset,options,match_data) +function exec(re, subject, offset, options, match_data) rc = ccall((:pcre2_match_8, PCRE_LIB), Cint, (Ptr{Cvoid}, Ptr{UInt8}, Csize_t, Csize_t, Cuint, Ptr{Cvoid}, Ptr{Cvoid}), - re, subject, sizeof(subject), offset, options, match_data, MATCH_CONTEXT[]) + re, subject, sizeof(subject), offset, options, match_data, get_local_match_context()) # rc == -1 means no match, -2 means partial match. rc < -2 && error("PCRE.exec error: $(err_message(rc))") rc >= 0 end +function exec_r(re, subject, offset, options) + match_data = create_match_data(re) + ans = exec(re, subject, offset, options, match_data) + free_match_data(match_data) + return ans +end + +function exec_r_data(re, subject, offset, options) + match_data = create_match_data(re) + ans = exec(re, subject, offset, options, match_data) + return ans, match_data +end + function create_match_data(re) ccall((:pcre2_match_data_create_from_pattern_8, PCRE_LIB), Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}), re, C_NULL) diff --git a/base/regex.jl b/base/regex.jl index 17eaa07e893e7..1e5cb3c869731 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -22,9 +22,6 @@ mutable struct Regex compile_options::UInt32 match_options::UInt32 regex::Ptr{Cvoid} - extra::Ptr{Cvoid} - ovec::Vector{Csize_t} - match_data::Ptr{Cvoid} function Regex(pattern::AbstractString, compile_options::Integer, match_options::Integer) @@ -37,11 +34,9 @@ mutable struct Regex if (match_options & ~PCRE.EXECUTE_MASK) !=0 throw(ArgumentError("invalid regex match options: $match_options")) end - re = compile(new(pattern, compile_options, match_options, C_NULL, - C_NULL, Csize_t[], C_NULL)) + re = compile(new(pattern, compile_options, match_options, C_NULL)) finalizer(re) do re re.regex == C_NULL || PCRE.free_re(re.regex) - re.match_data == C_NULL || PCRE.free_match_data(re.match_data) end re end @@ -68,8 +63,6 @@ function compile(regex::Regex) if regex.regex == C_NULL regex.regex = PCRE.compile(regex.pattern, regex.compile_options) PCRE.jit_compile(regex.regex) - regex.match_data = PCRE.create_match_data(regex.regex) - regex.ovec = PCRE.get_ovec(regex.match_data) end regex end @@ -164,14 +157,12 @@ getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)] function occursin(r::Regex, s::AbstractString; offset::Integer=0) compile(r) - return PCRE.exec(r.regex, String(s), offset, r.match_options, - r.match_data) + return PCRE.exec_r(r.regex, String(s), offset, r.match_options) end function occursin(r::Regex, s::SubString; offset::Integer=0) compile(r) - return PCRE.exec(r.regex, s, offset, r.match_options, - r.match_data) + return PCRE.exec_r(r.regex, s, offset, r.match_options) end """ @@ -198,14 +189,12 @@ true """ function startswith(s::AbstractString, r::Regex) compile(r) - return PCRE.exec(r.regex, String(s), 0, r.match_options | PCRE.ANCHORED, - r.match_data) + return PCRE.exec_r(r.regex, String(s), 0, r.match_options | PCRE.ANCHORED) end function startswith(s::SubString, r::Regex) compile(r) - return PCRE.exec(r.regex, s, 0, r.match_options | PCRE.ANCHORED, - r.match_data) + return PCRE.exec_r(r.regex, s, 0, r.match_options | PCRE.ANCHORED) end """ @@ -232,14 +221,12 @@ true """ function endswith(s::AbstractString, r::Regex) compile(r) - return PCRE.exec(r.regex, String(s), 0, r.match_options | PCRE.ENDANCHORED, - r.match_data) + return PCRE.exec_r(r.regex, String(s), 0, r.match_options | PCRE.ENDANCHORED) end function endswith(s::SubString, r::Regex) compile(r) - return PCRE.exec(r.regex, s, 0, r.match_options | PCRE.ENDANCHORED, - r.match_data) + return PCRE.exec_r(r.regex, s, 0, r.match_options | PCRE.ENDANCHORED) end """ @@ -274,17 +261,21 @@ function match end function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer, add_opts::UInt32=UInt32(0)) compile(re) opts = re.match_options | add_opts - if !PCRE.exec(re.regex, str, idx-1, opts, re.match_data) + matched, data = PCRE.exec_r_data(re.regex, str, idx-1, opts) + if !matched + PCRE.free_match_data(data) return nothing end - ovec = re.ovec - n = div(length(ovec),2) - 1 - mat = SubString(str, ovec[1]+1, prevind(str, ovec[2]+1)) - cap = Union{Nothing,SubString{String}}[ovec[2i+1] == PCRE.UNSET ? nothing : - SubString(str, ovec[2i+1]+1, - prevind(str, ovec[2i+2]+1)) for i=1:n] - off = Int[ ovec[2i+1]+1 for i=1:n ] - RegexMatch(mat, cap, ovec[1]+1, off, re) + n = div(PCRE.ovec_length(data), 2) - 1 + p = PCRE.ovec_ptr(data) + mat = SubString(str, unsafe_load(p, 1)+1, prevind(str, unsafe_load(p, 2)+1)) + cap = Union{Nothing,SubString{String}}[unsafe_load(p,2i+1) == PCRE.UNSET ? nothing : + SubString(str, unsafe_load(p,2i+1)+1, + prevind(str, unsafe_load(p,2i+2)+1)) for i=1:n] + off = Int[ unsafe_load(p,2i+1)+1 for i=1:n ] + result = RegexMatch(mat, cap, unsafe_load(p,1)+1, off, re) + PCRE.free_match_data(data) + return result end match(r::Regex, s::AbstractString) = match(r, s, firstindex(s)) @@ -292,18 +283,30 @@ match(r::Regex, s::AbstractString, i::Integer) = throw(ArgumentError( "regex matching is only available for the String type; use String(s) to convert" )) +findnext(re::Regex, str::Union{String,SubString}, idx::Integer) = _findnext_re(re, str, idx, C_NULL) + # TODO: return only start index and update deprecation -function findnext(re::Regex, str::Union{String,SubString}, idx::Integer) +function _findnext_re(re::Regex, str::Union{String,SubString}, idx::Integer, match_data::Ptr{Cvoid}) if idx > nextind(str,lastindex(str)) throw(BoundsError()) end opts = re.match_options compile(re) - if PCRE.exec(re.regex, str, idx-1, opts, re.match_data) - (Int(re.ovec[1])+1):prevind(str,Int(re.ovec[2])+1) + alloc = match_data == C_NULL + if alloc + matched, data = PCRE.exec_r_data(re.regex, str, idx-1, opts) + else + matched = PCRE.exec(re.regex, str, idx-1, opts, match_data) + data = match_data + end + if matched + p = PCRE.ovec_ptr(data) + ans = (Int(unsafe_load(p,1))+1):prevind(str,Int(unsafe_load(p,2))+1) else - nothing + ans = nothing end + alloc && PCRE.free_match_data(data) + return ans end findnext(r::Regex, s::AbstractString, idx::Integer) = throw(ArgumentError( "regex search is only available for the String type; use String(s) to convert" @@ -384,9 +387,23 @@ julia> replace(msg, r"#(.+)# from (?\\w+)" => s"FROM: \\g; MESSAGE: """ macro s_str(string) SubstitutionString(string) end +# replacement + +struct RegexAndMatchData + re::Regex + match_data::Ptr{Cvoid} + RegexAndMatchData(re::Regex) = (compile(re); new(re, PCRE.create_match_data(re.regex))) +end + +findnext(pat::RegexAndMatchData, str, i) = _findnext_re(pat.re, str, i, pat.match_data) + +_pat_replacer(r::Regex) = RegexAndMatchData(r) + +_free_pat_replacer(r::RegexAndMatchData) = PCRE.free_match_data(r.match_data) + replace_err(repl) = error("Bad replacement string: $repl") -function _write_capture(io, re, group) +function _write_capture(io, re::RegexAndMatchData, group) len = PCRE.substring_length_bynumber(re.match_data, group) ensureroom(io, len+1) PCRE.substring_copy_bynumber(re.match_data, group, @@ -395,7 +412,7 @@ function _write_capture(io, re, group) io.size = max(io.size, io.ptr - 1) end -function _replace(io, repl_s::SubstitutionString, str, r, re) +function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData) SUB_CHAR = '\\' GROUP_CHAR = 'g' LBRACKET = '<' @@ -439,8 +456,8 @@ function _replace(io, repl_s::SubstitutionString, str, r, re) if all(isdigit, groupname) _write_capture(io, re, parse(Int, groupname)) else - group = PCRE.substring_number_from_name(re.regex, groupname) - group < 0 && replace_err("Group $groupname not found in regex $re") + group = PCRE.substring_number_from_name(re.re.regex, groupname) + group < 0 && replace_err("Group $groupname not found in regex $(re.re)") _write_capture(io, re, group) end i = nextind(repl, i) diff --git a/base/strings/util.jl b/base/strings/util.jl index b97a76a24b3e8..e0537ba3d6b0e 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -426,6 +426,9 @@ replace(str::String, pat_repl::Pair{<:Union{Tuple{Vararg{<:AbstractChar}}, count::Integer=typemax(Int)) = replace(str, in(first(pat_repl)) => last(pat_repl), count=count) +_pat_replacer(x) = x +_free_pat_replacer(x) = nothing + function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int)) pattern, repl = pat_repl count == 0 && return str @@ -433,6 +436,7 @@ function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int)) n = 1 e = lastindex(str) i = a = firstindex(str) + pattern = _pat_replacer(pattern) r = something(findnext(pattern,str,i), 0) j, k = first(r), last(r) out = IOBuffer(sizehint=floor(Int, 1.2sizeof(str))) @@ -453,6 +457,7 @@ function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int)) j, k = first(r), last(r) n += 1 end + _free_pat_replacer(pattern) write(out, SubString(str,i)) String(take!(out)) end