diff --git a/base/strings/string.jl b/base/strings/string.jl index ac1403f01a4a1..9716d06deefdf 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -98,6 +98,7 @@ String(s::AbstractString) = print_to_string(s) @assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s)) unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) +unsafe_wrap(::Type{Vector{UInt8}}, s::FastContiguousSubArray{UInt8,1,Vector{UInt8}}) = unsafe_wrap(Vector{UInt8}, pointer(s), size(s)) Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s) Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s)) @@ -191,15 +192,201 @@ end end ## checking UTF-8 & ACSII validity ## +#= + The UTF-8 Validation is performed by a shift based DFA. + ┌───────────────────────────────────────────────────────────────────┐ + │ UTF-8 DFA State Diagram ┌──────────────2──────────────┐ │ + │ ├────────3────────┐ │ │ + │ ┌──────────┐ │ ┌─┐ ┌▼┐ │ │ + │ ASCII │ UTF-8 │ ├─5──►│9├───1────► │ │ │ + │ │ │ │ ├─┤ │ │ ┌▼┐ │ + │ │ ┌─0─┐ │ ├─6──►│8├─1,7,9──►4├──1,7,9──► │ │ + │ ┌─0─┐ │ │ │ │ │ ├─┤ │ │ │ │ │ + │ │ │ │ ┌▼───┴┐ │ ├─11─►│7├──7,9───► │ ┌───────►3├─┐ │ + │ ┌▼───┴┐ │ │ │ ▼ │ └─┘ └─┘ │ │ │ │ │ + │ │ 0 ├─────┘ │ 1 ├─► ──┤ │ ┌────► │ │ │ + │ └─────┘ │ │ │ ┌─┐ │ │ └─┘ │ │ + │ └──▲──┘ ├─10─►│5├─────7──────┘ │ │ │ + │ │ │ ├─┤ │ │ │ + │ │ └─4──►│6├─────1,9───────┘ │ │ + │ INVALID │ └─┘ │ │ + │ ┌─*─┐ └──────────────────1,7,9──────────────────┘ │ + │ ┌▼───┴┐ │ + │ │ 2 ◄─── All undefined transitions result in state 2 │ + │ └─────┘ │ + └───────────────────────────────────────────────────────────────────┘ + + Validation States + 0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters + If the DFA ends in this state the string is ASCII only + 1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character + 2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change + as seen by all 1s in that column of table below + 3 -> One valid continuation byte needed to return to state 0 + 4,5,6 -> Two valid continuation bytes needed to return to state 0 + 7,8,9 -> Three valids continuation bytes needed to return to state 0 + + Current State + 0̲ 1̲ 2̲ 3̲ 4̲ 5̲ 6̲ 7̲ 8̲ 9̲ + 0 | 0 1 2 2 2 2 2 2 2 2 + 1 | 2 2 2 1 3 2 3 2 4 4 + 2 | 3 3 2 2 2 2 2 2 2 2 + 3 | 4 4 2 2 2 2 2 2 2 2 + 4 | 6 6 2 2 2 2 2 2 2 2 + Character 5 | 9 9 2 2 2 2 2 2 2 2 <- Next State + Class 6 | 8 8 2 2 2 2 2 2 2 2 + 7 | 2 2 2 1 3 3 2 4 4 2 + 8 | 2 2 2 2 2 2 2 2 2 2 + 9 | 2 2 2 1 3 2 3 4 4 2 + 10 | 5 5 2 2 2 2 2 2 2 2 + 11 | 7 7 2 2 2 2 2 2 2 2 + + Shifts | 0 4 10 14 18 24 8 20 12 26 + + The shifts that represent each state were derived using teh SMT solver Z3, to ensure when encoded into + the rows the correct shift was a result. + + Each character class row is encoding 10 states with shifts as defined above. By shifting the bitsof a row by + the current state then masking the result with 0x11110 give the shift for the new state + + +=# + +#State type used by UTF-8 DFA +const _UTF8DFAState = UInt32 +# Fill the table with 256 UInt64 representing the DFA transitions for all bytes +const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base + num_classes=12 + num_states=10 + bit_per_state = 6 + + # These shifts were derived using a SMT solver + state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26] + + character_classes = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, + 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ] + + # These are the rows discussed in comments above + state_arrays = [ 0 1 2 2 2 2 2 2 2 2; + 2 2 2 1 3 2 3 2 4 4; + 3 3 2 2 2 2 2 2 2 2; + 4 4 2 2 2 2 2 2 2 2; + 6 6 2 2 2 2 2 2 2 2; + 9 9 2 2 2 2 2 2 2 2; + 8 8 2 2 2 2 2 2 2 2; + 2 2 2 1 3 3 2 4 4 2; + 2 2 2 2 2 2 2 2 2 2; + 2 2 2 1 3 2 3 4 4 2; + 5 5 2 2 2 2 2 2 2 2; + 7 7 2 2 2 2 2 2 2 2] + + #This converts the state_arrays into the shift encoded _UTF8DFAState + class_row = zeros(_UTF8DFAState, num_classes) + + for i = 1:num_classes + row = _UTF8DFAState(0) + for j in 1:num_states + #Calculate the shift required for the next state + to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) ) + #Shift the next state into the position of the current state + row = row | (_UTF8DFAState(to_shift) << state_shifts[j]) + end + class_row[i]=row + end + + map(c->class_row[c+1],character_classes) +end + + +const _UTF8_DFA_ASCII = _UTF8DFAState(0) #This state represents the start and end of any valid string +const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This state represents the start and end of any valid string +const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop + +# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above +@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E) + +@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes)) + for i = first:last + @inbounds state = _utf_dfa_step(state, bytes[i]) + end + return (state) +end + +@inline function _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU} + n=first + while n <= last - chunk_size + _isascii(cu,n,n+chunk_size-1) || return n + n += chunk_size + end + n= last-chunk_size+1 + _isascii(cu,n,last) || return n + return nothing +end + +## -byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = - ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) +# Classifcations of string # 0: neither valid ASCII nor UTF-8 # 1: valid ASCII # 2: valid UTF-8 + byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s)) + + +function byte_string_classify(bytes::AbstractVector{UInt8}) + chunk_size = 1024 + chunk_threshold = chunk_size + (chunk_size ÷ 2) + n = length(bytes) + if n > chunk_threshold + start = _find_nonascii_chunk(chunk_size,bytes,1,n) + isnothing(start) && return 1 + else + _isascii(bytes,1,n) && return 1 + start = 1 + end + return _byte_string_classify_nonascii(bytes,start,n) +end + +function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int) + chunk_size = 256 + + start = first + stop = min(last,first + chunk_size - 1) + state = _UTF8_DFA_ACCEPT + while start <= last + # try to process ascii chunks + while state == _UTF8_DFA_ACCEPT + _isascii(bytes,start,stop) || break + (start = start + chunk_size) <= last || break + stop = min(last,stop + chunk_size) + end + # Process non ascii chunk + state = _isvalid_utf8_dfa(state,bytes,start,stop) + state == _UTF8_DFA_INVALID && return 0 + + start = start + chunk_size + stop = min(last,stop + chunk_size) + end + return ifelse(state == _UTF8_DFA_ACCEPT,2,0) +end + +isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0 +isvalid(::Type{String}, s::AbstractString) = (@inline byte_string_classify(s)) ≠ 0 -isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0 -isvalid(s::String) = isvalid(String, s) +@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s)) is_valid_continuation(c) = c & 0xc0 == 0x80 diff --git a/base/strings/substring.jl b/base/strings/substring.jl index ea132402447be..5ba08ac2f7fff 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -100,12 +100,6 @@ function isvalid(s::SubString, i::Integer) @inbounds return ib && isvalid(s.string, s.offset + i)::Bool end -byte_string_classify(s::SubString{String}) = - ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) - -isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0 -isvalid(s::SubString{String}) = isvalid(String, s) - thisind(s::SubString{String}, i::Int) = _thisind_str(s, i) nextind(s::SubString{String}, i::Int) = _nextind_str(s, i) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index e1d6e9dd60491..602c38551f6d8 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -1234,3 +1234,155 @@ end end @test_throws ArgumentError Symbol("a\0a") end + +@testset "Ensure UTF-8 DFA can never leave invalid state" begin + for b = typemin(UInt8):typemax(UInt8) + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_INVALID,[b],1,1) == Base._UTF8_DFA_INVALID + end +end +@testset "Ensure UTF-8 DFA stays in ASCII State for all ASCII" begin + for b = 0x00:0x7F + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_ASCII + end +end + +@testset "Validate UTF-8 DFA" begin + # Unicode 15 + # Table 3-7. Well-Formed UTF-8 Byte Sequences + + table_rows = [ [0x00:0x7F], + [0xC2:0xDF,0x80:0xBF], + [0xE0:0xE0,0xA0:0xBF,0x80:0xBF], + [0xE1:0xEC,0x80:0xBF,0x80:0xBF], + [0xED:0xED,0x80:0x9F,0x80:0xBF], + [0xEE:0xEF,0x80:0xBF,0x80:0xBF], + [0xF0:0xF0,0x90:0xBF,0x80:0xBF,0x80:0xBF], + [0xF1:0xF3,0x80:0xBF,0x80:0xBF,0x80:0xBF], + [0xF4:0xF4,0x80:0x8F,0x80:0xBF,0x80:0xBF]] + invalid_first_bytes = union(0xC0:0xC1,0xF5:0xFF,0x80:0xBF) + + valid_first_bytes = union(collect(first(r) for r in table_rows)...) + + + + # Prove that the first byte sets in the table & invalid cover all bytes + @test length(union(valid_first_bytes,invalid_first_bytes)) == 256 + @test length(intersect(valid_first_bytes,invalid_first_bytes)) == 0 + + #Check the ASCII range + for b = 0x00:0x7F + #Test from both UTF-8 state and ascii state + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) == Base._UTF8_DFA_ACCEPT + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_ASCII + end + + #Check the remaining first bytes + for b = 0x80:0xFF + if b ∈ invalid_first_bytes + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) == Base._UTF8_DFA_INVALID + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_INVALID + else + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) != Base._UTF8_DFA_INVALID + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) != Base._UTF8_DFA_INVALID + end + end + + # Check two byte Sequences + for table_row in [table_rows[2]] + b1 = first(table_row[1]) + state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + @test state1 == state2 + #Prove that all the first bytes in a row give same state + for b1 in table_row[1] + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + end + b1 = first(table_row[1]) + #Prove that all valid second bytes return correct state + for b2 = table_row[2] + @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + for b2 = setdiff(0x00:0xFF,table_row[2]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + end + + # Check three byte Sequences + for table_row in table_rows[3:6] + b1 = first(table_row[1]) + state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + @test state1 == state2 + #Prove that all the first bytes in a row give same state + for b1 in table_row[1] + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + end + + b1 = first(table_row[1]) + b2 = first(table_row[2]) + #Prove that all valid second bytes return same state + state2 = Base._isvalid_utf8_dfa(state1,[b2],1,1) + for b2 = table_row[2] + @test state2 == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + for b2 = setdiff(0x00:0xFF,table_row[2]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + + b2 = first(table_row[2]) + #Prove that all valid third bytes return correct state + for b3 = table_row[3] + @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state2,[b3],1,1) + end + for b3 = setdiff(0x00:0xFF,table_row[3]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state2,[b3],1,1) + end + end + + # Check Four byte Sequences + for table_row in table_rows[7:9] + b1 = first(table_row[1]) + state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + @test state1 == state2 + #Prove that all the first bytes in a row give same state + for b1 in table_row[1] + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + end + + b1 = first(table_row[1]) + b2 = first(table_row[2]) + #Prove that all valid second bytes return same state + state2 = Base._isvalid_utf8_dfa(state1,[b2],1,1) + for b2 = table_row[2] + @test state2 == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + for b2 = setdiff(0x00:0xFF,table_row[2]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + + + b2 = first(table_row[2]) + b3 = first(table_row[3]) + state3 = Base._isvalid_utf8_dfa(state2,[b3],1,1) + #Prove that all valid third bytes return same state + for b3 = table_row[3] + @test state3 == Base._isvalid_utf8_dfa(state2,[b3],1,1) + end + for b3 = setdiff(0x00:0xFF,table_row[3]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state2,[b3],1,1) + end + + b3 = first(table_row[3]) + #Prove that all valid forth bytes return correct state + for b4 = table_row[4] + @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state3,[b4],1,1) + end + for b4 = setdiff(0x00:0xFF,table_row[4]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state3,[b4],1,1) + end + end +end