From a82388019c940668c33ba187d675f97ac29f1768 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Mon, 12 Dec 2022 16:52:20 -0500 Subject: [PATCH 01/34] Working Native UTF-8 Validation --- base/strings/string.jl | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index ac1403f01a4a1..cd1c11a3543c0 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -191,13 +191,47 @@ end end ## checking UTF-8 & ACSII validity ## +# This table is used by the shift base DFA validation for UTF-8 +const dfa_table = [ + fill(UInt64(384), 128); + fill(UInt64(6860978528477184), 16); + fill(UInt64(107228354863104), 16); + fill(UInt64(107202588205056), 32); + fill(UInt64(0), 2); + fill(UInt64(768), 30); + fill(UInt64(1152), 1); + fill(UInt64(1536), 12); + fill(UInt64(1920), 1); + fill(UInt64(1536), 2); + fill(UInt64(2304), 1); + fill(UInt64(2688), 3); + fill(UInt64(3072), 1); + fill(UInt64(0), 11) +]::Vector{UInt64} + +# This is a shift based utf-8 DFA +function _isvalid_utf8(bytes::Vector{UInt8}) + f(byte) = @inbounds dfa_table[byte] + op(state, byte_dfa) = byte_dfa >> (state & UInt64(63)) + return mapfoldl(f, op, bytes, init=UInt64(6)) == UInt64(6) +end + +_isvalid_utf8(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) -byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = - ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) +#Classifcations of string # 0: neither valid ASCII nor UTF-8 # 1: valid ASCII # 2: valid UTF-8 +function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; ascii_fasttrack = true ) + bytes = unsafe_wrap(Vector{UInt8}, s) + ascii_fasttrack && all(c -> iszero(c & 0x80), bytes) && return 1 + valid = _isvalid_utf8(bytes) + return ifelse(valid, 2, 0) +end +# The commented line 
below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that +# the benefit doesn't show up. It would also remove the ascii fast track that is faster for inputs that are all ascii +# isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0 isvalid(s::String) = isvalid(String, s) From 92a72d72ba5ade0494d7e48a46f37f994a671e3a Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Mon, 12 Dec 2022 17:28:45 -0500 Subject: [PATCH 02/34] Comment fix --- base/strings/string.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index cd1c11a3543c0..0c392dd739b4f 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -218,7 +218,7 @@ end _isvalid_utf8(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) -#Classifcations of string +# Classifcations of string # 0: neither valid ASCII nor UTF-8 # 1: valid ASCII # 2: valid UTF-8 From 34a937d8e6a1d98ce30463d4787bb3d0c84b65bf Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 13 Dec 2022 10:13:46 -0500 Subject: [PATCH 03/34] Appears working --- base/strings/string.jl | 48 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 0c392dd739b4f..4bfec6eed53c6 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -98,6 +98,7 @@ String(s::AbstractString) = print_to_string(s) @assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s)) unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) 
+unsafe_wrap(::Type{Vector{UInt8}}, s::FastContiguousSubArray{UInt8,1,Vector{UInt8}}) = unsafe_wrap(Vector{UInt8}, pointer(s), size(s)) Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s) Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s)) @@ -192,44 +193,49 @@ end ## checking UTF-8 & ACSII validity ## # This table is used by the shift base DFA validation for UTF-8 -const dfa_table = [ - fill(UInt64(384), 128); - fill(UInt64(6860978528477184), 16); - fill(UInt64(107228354863104), 16); - fill(UInt64(107202588205056), 32); - fill(UInt64(0), 2); - fill(UInt64(768), 30); - fill(UInt64(1152), 1); - fill(UInt64(1536), 12); - fill(UInt64(1920), 1); - fill(UInt64(1536), 2); - fill(UInt64(2304), 1); - fill(UInt64(2688), 3); - fill(UInt64(3072), 1); - fill(UInt64(0), 11) -]::Vector{UInt64} +const dfa_table = [ + fill(UInt64(109802048057794944), 128); + fill(UInt64(113232530780455302), 16); + fill(UInt64(109855655693648262), 16); + fill(UInt64(109855649351860614), 32); + fill(UInt64(109802048057794950), 2); + fill(UInt64(109802048057794956), 30); + fill(UInt64(109802048057794968), 1); + fill(UInt64(109802048057794962), 12); + fill(UInt64(109802048057794974), 1); + fill(UInt64(109802048057794962), 2); + fill(UInt64(109802048057794980), 1); + fill(UInt64(109802048057794986), 3); + fill(UInt64(109802048057794992), 1); + fill(UInt64(109802048057794950), 11) + ]::Vector{UInt64} # This is a shift based utf-8 DFA function _isvalid_utf8(bytes::Vector{UInt8}) - f(byte) = @inbounds dfa_table[byte] + f(byte) = @inbounds dfa_table[byte+1] op(state, byte_dfa) = byte_dfa >> (state & UInt64(63)) - return mapfoldl(f, op, bytes, init=UInt64(6)) == UInt64(6) + final_state = mapfoldl(f, op, bytes, init = UInt64(0)) + return (final_state & UInt64(63)) == UInt64(0) end -_isvalid_utf8(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) 
+_isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) # Classifcations of string # 0: neither valid ASCII nor UTF-8 # 1: valid ASCII # 2: valid UTF-8 -function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; ascii_fasttrack = true ) +function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; kwargs...) bytes = unsafe_wrap(Vector{UInt8}, s) + byte_string_classify(bytes, kwargs...) +end + +function byte_string_classify(bytes::Vector{UInt8}; ascii_fasttrack = true ) ascii_fasttrack && all(c -> iszero(c & 0x80), bytes) && return 1 valid = _isvalid_utf8(bytes) return ifelse(valid, 2, 0) end -# The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that +# The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that # the benefit doesn't show up. 
It would also remove the ascii fast track that is faster for inputs that are all ascii # isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0 From 2affea139f26fb25db899ed512096b4f93a905fa Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 13 Dec 2022 11:45:08 -0500 Subject: [PATCH 04/34] Slight Fix and push for buildkite --- base/strings/string.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 4bfec6eed53c6..23e3b680100bb 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -224,7 +224,7 @@ _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = # 0: neither valid ASCII nor UTF-8 # 1: valid ASCII # 2: valid UTF-8 -function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; kwargs...) +function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; kwargs...) bytes = unsafe_wrap(Vector{UInt8}, s) byte_string_classify(bytes, kwargs...) 
end From 21b52aacce6a47e842c8d383f197df45c664bccd Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 13 Dec 2022 12:39:55 -0500 Subject: [PATCH 05/34] Spit out statemachine and added comments --- base/strings/string.jl | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 23e3b680100bb..845dc32045f89 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -193,7 +193,7 @@ end ## checking UTF-8 & ACSII validity ## # This table is used by the shift base DFA validation for UTF-8 -const dfa_table = [ +const _UTF8_DFA_TABLE = [ fill(UInt64(109802048057794944), 128); fill(UInt64(113232530780455302), 16); fill(UInt64(109855655693648262), 16); @@ -210,12 +210,25 @@ const dfa_table = [ fill(UInt64(109802048057794950), 11) ]::Vector{UInt64} -# This is a shift based utf-8 DFA +const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string +const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop + +# This function is designed so that you could use it on strings with discontinous memmory layouts +# by only feeding it contiguous block and keeping track of the state inbetween. +# Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was. 
+# For a contiguous bytestream other states are valid other than _UTF8_DFA_ACCEPT aslong as you aren't +# at the begining or end +function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT) + f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1] + op(s, byte_dfa) = byte_dfa >> (s & UInt64(63)) + final_state = mapfoldl(f, op, bytes, init = state) + return (final_state & UInt64(63)) +end + +# This is a shift based utf-8 DFA that works on string that are a contiguous block function _isvalid_utf8(bytes::Vector{UInt8}) - f(byte) = @inbounds dfa_table[byte+1] - op(state, byte_dfa) = byte_dfa >> (state & UInt64(63)) - final_state = mapfoldl(f, op, bytes, init = UInt64(0)) - return (final_state & UInt64(63)) == UInt64(0) + final_state = _isvalid_utf8_dfa(bytes, _UTF8_DFA_ACCEPT) + return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT end _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) @@ -237,8 +250,8 @@ end # The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that # the benefit doesn't show up. 
It would also remove the ascii fast track that is faster for inputs that are all ascii -# isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) -isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0 +# isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = _isvalid_utf8(s) + isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0 isvalid(s::String) = isvalid(String, s) is_valid_continuation(c) = c & 0xc0 == 0x80 From 57a4d2ab5ab4a58136e58b82e330cd885a565327 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 13 Dec 2022 13:23:51 -0500 Subject: [PATCH 06/34] Remove Fastpath f& simplify isvalid --- base/strings/string.jl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 845dc32045f89..6f6b59616cee5 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -237,21 +237,26 @@ _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = # 0: neither valid ASCII nor UTF-8 # 1: valid ASCII # 2: valid UTF-8 -function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; kwargs...) +function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) bytes = unsafe_wrap(Vector{UInt8}, s) byte_string_classify(bytes, kwargs...) 
end -function byte_string_classify(bytes::Vector{UInt8}; ascii_fasttrack = true ) - ascii_fasttrack && all(c -> iszero(c & 0x80), bytes) && return 1 +function byte_string_classify(bytes::Vector{UInt8}) + all(c -> iszero(c & 0x80), bytes) && return 1 valid = _isvalid_utf8(bytes) return ifelse(valid, 2, 0) end -# The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that -# the benefit doesn't show up. It would also remove the ascii fast track that is faster for inputs that are all ascii -# isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = _isvalid_utf8(s) - isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0 +function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) + bytes = unsafe_wrap(Vector{UInt8}, s) + isvalid(String,bytes) +end +function isvalid(::Type{String}, bytes::Vector{UInt8}) + valid = _isvalid_utf8(bytes) + return ifelse(valid, true, false) +end + isvalid(s::String) = isvalid(String, s) is_valid_continuation(c) = c & 0xc0 == 0x80 From 877ba93953bcf66f73b471f811118a3507d448a7 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 13 Dec 2022 18:21:46 -0500 Subject: [PATCH 07/34] Minor fixes and mega comment on methodolgy --- base/strings/string.jl | 94 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 6 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 6f6b59616cee5..f3b62d82825cd 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -192,7 +192,92 @@ end end ## checking UTF-8 & ACSII validity ## -# This table is used by the shift base DFA validation for UTF-8 +#= + The UTF-8 Validation is performed by a shift based DFA. 
+ Using the state machine diagram found @ https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + + Important States + 0 -> UTF8_ACCEPT is the start state and represents a complete UTF-8 String as well + ASCII only strings will never leave this state + 1 -> UTF8_INVALID is only reached by invalid bytes and once in this state will not + 2 -> This is the state before the last byte of a multibyte character is read + 9 -> Not important and not used which is why it is all ones + Current State + 0̲ 1̲ 2̲ 3̲ 4̲ 5̲ 6̲ 7̲ 8̲ 9̲ + 0 | 0 1 1 1 1 1 1 1 1 1 + 1 | 1 1 0 2 1 2 1 3 3 1 + 2 | 2 1 1 1 1 1 1 1 1 1 + 3 | 3 1 1 1 1 1 1 1 1 1 + 4 | 5 1 1 1 1 1 1 1 1 1 + Character 5 | 8 1 1 1 1 1 1 1 1 1 + Class 6 | 7 1 1 1 1 1 1 1 1 1 + 7 | 1 1 0 2 2 1 3 3 1 1 + 8 | 1 1 1 1 1 1 1 1 1 1 + 9 | 1 1 0 2 1 2 3 3 1 1 + 10 | 1 1 1 1 1 1 1 1 1 1 + 11 | 6 1 1 1 1 1 1 1 1 1 + + Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that + it contains the number of bit needed to shift the state it is transitioning to shifted into + the position of the current state. 
+ + Example: character class 1 is encoded in below + Current State | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | + Next State | 1 | 3 | 3 | 1 | 2 | 1 | 2 | 0 | 1 | 1 | + Shift required | 6*1 | 6*3 | 6*3 | 6*1 | 6*2 | 6*1 | 6*2 | 6*0 | 6*1 | 6*1 | + | 6 | 18 | 18 | 6 | 12 | 6 | 12 | 0 | 6 | 6 | + UInt64(113232530780455302) = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 + + Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30 + so when the next character class is 7 row is in row::UInt64: + The reduction operation: + state = byte_dfa >> (state & UInt64(63)) + | Shift to get the next state shift | Mask first 6 bits of starting state to get the current shift ie 30 + Would result in the state being 2 which is a shift of 12: + state = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 >> 30 + state = 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100| + + The code below will create the _UTF8_DFA_TABLE to be pasted in source. + It is included here in an effort to document a contrived process. 
+ Do Not Uncomment the code below in this file it should be pasted into REPL + + function build_utf8_validation_statemachine_table(; num_classes=12, num_states=10, bit_per_state = 6) + + # class_repeats represents the 256 byte's classes by storing the (class, #of repeats) + class_repeats = [ (0, 128), (1, 16), (9, 16), (7, 32), (8, 2), (2, 30), (10, 1), + (3, 12), (4, 1), (3, 2), (11, 1), (6, 3), (5, 1), (8, 11)] + + # See discription above + state_arrays = [[ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 1, 1, 0, 2, 1, 2, 1, 3, 3, 1], + [ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 7, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1], + [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1], + [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]] + #This converts the state_arrays into the shift encoded UInt64 + class_row = zeros(UInt64, num_classes) + for i = 1:num_classes + row = UInt64(0) + for j in 1:num_states + to_shift = UInt8((state_arrays[i][j]) * bit_per_state) + row = row | (UInt64(to_shift) << ((j - 1) * bit_per_state)) + end + class_row[i]=row + end + print("\nconst _UTF8_DFA_TABLE = [\n") + for (class, repeats) in class_repeats + print(" fill(UInt64($(class_row[class+1])), $repeats);\n") + end + print(" ]\n") + end +=# +# This table will be filled with 256 UInt64 representing the DFA transitions for all bytes const _UTF8_DFA_TABLE = [ fill(UInt64(109802048057794944), 128); fill(UInt64(113232530780455302), 16); @@ -208,7 +293,7 @@ const _UTF8_DFA_TABLE = [ fill(UInt64(109802048057794986), 3); fill(UInt64(109802048057794992), 1); fill(UInt64(109802048057794950), 11) - ]::Vector{UInt64} + ] const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop @@ -252,10 +337,7 @@ function isvalid(::Type{String}, 
s::Union{FastContiguousSubArray{UInt8,1,Vector{ bytes = unsafe_wrap(Vector{UInt8}, s) isvalid(String,bytes) end -function isvalid(::Type{String}, bytes::Vector{UInt8}) - valid = _isvalid_utf8(bytes) - return ifelse(valid, true, false) -end +isvalid(::Type{String}, bytes::Vector{UInt8}) = @inline _isvalid_utf8(bytes) isvalid(s::String) = isvalid(String, s) From 0c4b348234fe9b4f8da693a4ed5af65f8d9fbff1 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 13 Dec 2022 18:24:59 -0500 Subject: [PATCH 08/34] Comment --- base/strings/string.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index f3b62d82825cd..ba16561415da7 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -299,10 +299,8 @@ const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of a const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop # This function is designed so that you could use it on strings with discontinous memmory layouts -# by only feeding it contiguous block and keeping track of the state inbetween. +# by only feeding it contiguous block and keeping track of the state inbetween. # Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was. 
-# For a contiguous bytestream other states are valid other than _UTF8_DFA_ACCEPT aslong as you aren't -# at the begining or end function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT) f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1] op(s, byte_dfa) = byte_dfa >> (s & UInt64(63)) From b6b25c7c6248860902608110ca38997d13cfa7d7 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 13 Dec 2022 18:56:51 -0500 Subject: [PATCH 09/34] whitespaces --- base/strings/string.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index ba16561415da7..a1743929b6f80 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -216,7 +216,7 @@ end 9 | 1 1 0 2 1 2 3 3 1 1 10 | 1 1 1 1 1 1 1 1 1 1 11 | 6 1 1 1 1 1 1 1 1 1 - + Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that it contains the number of bit needed to shift the state it is transitioning to shifted into the position of the current state. @@ -229,7 +229,7 @@ end UInt64(113232530780455302) = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30 - so when the next character class is 7 row is in row::UInt64: + so when the next character class is 7 row is in row::UInt64: The reduction operation: state = byte_dfa >> (state & UInt64(63)) | Shift to get the next state shift | Mask first 6 bits of starting state to get the current shift ie 30 @@ -299,13 +299,13 @@ const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of a const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop # This function is designed so that you could use it on strings with discontinous memmory layouts -# by only feeding it contiguous block and keeping track of the state inbetween. 
+# by only feeding it contiguous block and keeping track of the state inbetween. # Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was. function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT) f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1] op(s, byte_dfa) = byte_dfa >> (s & UInt64(63)) final_state = mapfoldl(f, op, bytes, init = state) - return (final_state & UInt64(63)) + return (final_state & UInt64(63)) end # This is a shift based utf-8 DFA that works on string that are a contiguous block @@ -331,7 +331,7 @@ function byte_string_classify(bytes::Vector{UInt8}) return ifelse(valid, 2, 0) end -function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) +function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) bytes = unsafe_wrap(Vector{UInt8}, s) isvalid(String,bytes) end From a019b68bd9c053271c4fd45ef8094bbe2cc78815 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Wed, 14 Dec 2022 09:40:22 -0500 Subject: [PATCH 10/34] Additional state comments --- base/strings/string.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index a1743929b6f80..958d2eca13167 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -196,11 +196,13 @@ end The UTF-8 Validation is performed by a shift based DFA. 
Using the state machine diagram found @ https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ - Important States + Validation States 0 -> UTF8_ACCEPT is the start state and represents a complete UTF-8 String as well ASCII only strings will never leave this state 1 -> UTF8_INVALID is only reached by invalid bytes and once in this state will not - 2 -> This is the state before the last byte of a multibyte character is read + 2 -> One valid continuation byte needed to return to state 0 + 3,4,5 -> Two valid continuation bytes needed to return to state 0 + 6,7,8 -> Three valids continuation bytes needed to return to state 0 9 -> Not important and not used which is why it is all ones Current State 0̲ 1̲ 2̲ 3̲ 4̲ 5̲ 6̲ 7̲ 8̲ 9̲ From 5d808260a8e500ba5dba52c7029ed75da28bd15c Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Sun, 18 Dec 2022 17:11:54 -0500 Subject: [PATCH 11/34] Fix Comment --- base/strings/string.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 958d2eca13167..8429642e17dca 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -211,12 +211,12 @@ end 2 | 2 1 1 1 1 1 1 1 1 1 3 | 3 1 1 1 1 1 1 1 1 1 4 | 5 1 1 1 1 1 1 1 1 1 - Character 5 | 8 1 1 1 1 1 1 1 1 1 + Character 5 | 8 1 1 1 1 1 1 1 1 1 <- Next State Class 6 | 7 1 1 1 1 1 1 1 1 1 7 | 1 1 0 2 2 1 3 3 1 1 8 | 1 1 1 1 1 1 1 1 1 1 9 | 1 1 0 2 1 2 3 3 1 1 - 10 | 1 1 1 1 1 1 1 1 1 1 + 10 | 4 1 1 1 1 1 1 1 1 1 11 | 6 1 1 1 1 1 1 1 1 1 Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that @@ -260,7 +260,7 @@ end [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1], [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1], - [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 4, 1, 1, 1, 1, 1, 1, 1, 1, 1], [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]] #This converts the state_arrays into the shift encoded UInt64 class_row = zeros(UInt64, num_classes) From ffd8a1a224930ec6bdeb313e21b6abbfaccc8f10 Mon Sep 17 00:00:00 2001 
From: Nicholas R Dinsmore Date: Sun, 18 Dec 2022 18:07:11 -0500 Subject: [PATCH 12/34] Change table definition to let block --- base/strings/string.jl | 68 ++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 8429642e17dca..a951f53e7f611 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -280,22 +280,58 @@ end end =# # This table will be filled with 256 UInt64 representing the DFA transitions for all bytes -const _UTF8_DFA_TABLE = [ - fill(UInt64(109802048057794944), 128); - fill(UInt64(113232530780455302), 16); - fill(UInt64(109855655693648262), 16); - fill(UInt64(109855649351860614), 32); - fill(UInt64(109802048057794950), 2); - fill(UInt64(109802048057794956), 30); - fill(UInt64(109802048057794968), 1); - fill(UInt64(109802048057794962), 12); - fill(UInt64(109802048057794974), 1); - fill(UInt64(109802048057794962), 2); - fill(UInt64(109802048057794980), 1); - fill(UInt64(109802048057794986), 3); - fill(UInt64(109802048057794992), 1); - fill(UInt64(109802048057794950), 11) - ] +const _UTF8_DFA_TABLE = let + num_classes=12 + num_states=10 + bit_per_state = 6 + # class_repeats represents the 256 byte's classes by storing the (class, #of repeats) + class_repeats = [ (0, 128), (1, 16), (9, 16), (7, 32), (8, 2), (2, 30), (10, 1), + (3, 12), (4, 1), (3, 2), (11, 1), (6, 3), (5, 1), (8, 11)] + + # These are the rows discussed in comments above + state_arrays = [[ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 1, 1, 0, 2, 1, 2, 1, 3, 3, 1], + [ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 7, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1], + [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1], + [ 4, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]] + + #This converts the state_arrays into the shift encoded UInt64 + class_row 
= zeros(UInt64, num_classes) + for i = 1:num_classes + row = UInt64(0) + for j in 1:num_states + #Calculate the shift required for the next state + to_shift = UInt8((state_arrays[i][j]) * bit_per_state) + #Shift the next state into the position of the current state + row = row | (UInt64(to_shift) << ((j - 1) * bit_per_state)) + end + class_row[i]=row + end + mapreduce(t->fill(class_row[t[1]+1],t[2]),vcat,class_repeats) +end +# const _UTF8_DFA_TABLE = [ +# fill(UInt64(109802048057794944), 128); +# fill(UInt64(113232530780455302), 16); +# fill(UInt64(109855655693648262), 16); +# fill(UInt64(109855649351860614), 32); +# fill(UInt64(109802048057794950), 2); +# fill(UInt64(109802048057794956), 30); +# fill(UInt64(109802048057794968), 1); +# fill(UInt64(109802048057794962), 12); +# fill(UInt64(109802048057794974), 1); +# fill(UInt64(109802048057794962), 2); +# fill(UInt64(109802048057794980), 1); +# fill(UInt64(109802048057794986), 3); +# fill(UInt64(109802048057794992), 1); +# fill(UInt64(109802048057794950), 11) +# ] const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop From 4530d895759773d8600072b7b987c5e1eb326bf0 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Sun, 18 Dec 2022 22:26:00 -0500 Subject: [PATCH 13/34] Build table with let block --- base/strings/string.jl | 62 +++--------------------------------------- 1 file changed, 4 insertions(+), 58 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index a951f53e7f611..edb30d458e9ba 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -238,49 +238,10 @@ end Would result in the state being 2 which is a shift of 12: state = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 >> 30 state = 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100| - - The code below will create the _UTF8_DFA_TABLE 
to be pasted in source. - It is included here in an effort to document a contrived process. - Do Not Uncomment the code below in this file it should be pasted into REPL - - function build_utf8_validation_statemachine_table(; num_classes=12, num_states=10, bit_per_state = 6) - - # class_repeats represents the 256 byte's classes by storing the (class, #of repeats) - class_repeats = [ (0, 128), (1, 16), (9, 16), (7, 32), (8, 2), (2, 30), (10, 1), - (3, 12), (4, 1), (3, 2), (11, 1), (6, 3), (5, 1), (8, 11)] - - # See discription above - state_arrays = [[ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 1, 1, 0, 2, 1, 2, 1, 3, 3, 1], - [ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 7, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1], - [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1], - [ 4, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]] - #This converts the state_arrays into the shift encoded UInt64 - class_row = zeros(UInt64, num_classes) - for i = 1:num_classes - row = UInt64(0) - for j in 1:num_states - to_shift = UInt8((state_arrays[i][j]) * bit_per_state) - row = row | (UInt64(to_shift) << ((j - 1) * bit_per_state)) - end - class_row[i]=row - end - print("\nconst _UTF8_DFA_TABLE = [\n") - for (class, repeats) in class_repeats - print(" fill(UInt64($(class_row[class+1])), $repeats);\n") - end - print(" ]\n") - end =# -# This table will be filled with 256 UInt64 representing the DFA transitions for all bytes -const _UTF8_DFA_TABLE = let + +# Fill the table with 256 UInt64 representing the DFA transitions for all bytes +const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base num_classes=12 num_states=10 bit_per_state = 6 @@ -316,22 +277,7 @@ const _UTF8_DFA_TABLE = let end mapreduce(t->fill(class_row[t[1]+1],t[2]),vcat,class_repeats) end -# const _UTF8_DFA_TABLE = [ -# fill(UInt64(109802048057794944), 128); -# 
fill(UInt64(113232530780455302), 16); -# fill(UInt64(109855655693648262), 16); -# fill(UInt64(109855649351860614), 32); -# fill(UInt64(109802048057794950), 2); -# fill(UInt64(109802048057794956), 30); -# fill(UInt64(109802048057794968), 1); -# fill(UInt64(109802048057794962), 12); -# fill(UInt64(109802048057794974), 1); -# fill(UInt64(109802048057794962), 2); -# fill(UInt64(109802048057794980), 1); -# fill(UInt64(109802048057794986), 3); -# fill(UInt64(109802048057794992), 1); -# fill(UInt64(109802048057794950), 11) -# ] + const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop From a0f6a1c36e31ca557f3a3632dfd0ae30fa208f16 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Fri, 6 Jan 2023 17:04:14 -0500 Subject: [PATCH 14/34] @stevenjg recommendations --- base/strings/string.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index edb30d458e9ba..69f93edacd7f8 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -298,7 +298,8 @@ function _isvalid_utf8(bytes::Vector{UInt8}) return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT end -_isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) +_isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = + GC.@preserve s _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) # Classifcations of string # 0: neither valid ASCII nor UTF-8 @@ -306,7 +307,7 @@ _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = # 2: valid UTF-8 function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) bytes = unsafe_wrap(Vector{UInt8}, s) - byte_string_classify(bytes, kwargs...) 
+ GC.@preserve s byte_string_classify(unsafe_wrap(Vector{UInt8}, s)) end function byte_string_classify(bytes::Vector{UInt8}) @@ -316,8 +317,7 @@ function byte_string_classify(bytes::Vector{UInt8}) end function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) - bytes = unsafe_wrap(Vector{UInt8}, s) - isvalid(String,bytes) + GC.@preserve s isvalid(String,unsafe_wrap(Vector{UInt8}, s)) end isvalid(::Type{String}, bytes::Vector{UInt8}) = @inline _isvalid_utf8(bytes) From 24d45d40271c6aab1693a361ab12c2c98462516e Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Fri, 6 Jan 2023 17:38:15 -0500 Subject: [PATCH 15/34] fix --- base/strings/string.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 69f93edacd7f8..9d98c5ed8632f 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -306,7 +306,6 @@ _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = # 1: valid ASCII # 2: valid UTF-8 function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) - bytes = unsafe_wrap(Vector{UInt8}, s) GC.@preserve s byte_string_classify(unsafe_wrap(Vector{UInt8}, s)) end From b11c1ef41cfb4c838a7e3661d04186f25cde7ca7 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Mon, 9 Jan 2023 16:35:48 -0500 Subject: [PATCH 16/34] Switch to AbstractVector --- base/strings/string.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 9d98c5ed8632f..92436ee62d04f 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -285,7 +285,7 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state # This function is designed so that you could use it on strings with discontinous memmory layouts # by only feeding it contiguous block and keeping track of the state inbetween. 
# Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was. -function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT) +@propagate_inbounds function _isvalid_utf8_dfa(bytes::AbstractVector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT) f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1] op(s, byte_dfa) = byte_dfa >> (s & UInt64(63)) final_state = mapfoldl(f, op, bytes, init = state) @@ -293,7 +293,7 @@ function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT end # This is a shift based utf-8 DFA that works on string that are a contiguous block -function _isvalid_utf8(bytes::Vector{UInt8}) +function _isvalid_utf8(bytes::AbstractVector{UInt8}) final_state = _isvalid_utf8_dfa(bytes, _UTF8_DFA_ACCEPT) return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT end From 54d49fb26c83948122a0315d2cb2b572f662cd83 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Mon, 9 Jan 2023 18:58:47 -0500 Subject: [PATCH 17/34] Switch to codeunits --- base/strings/string.jl | 15 +++++---------- base/strings/substring.jl | 8 ++++---- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 92436ee62d04f..6aa47f189e788 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -298,16 +298,14 @@ function _isvalid_utf8(bytes::AbstractVector{UInt8}) return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT end -_isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = - GC.@preserve s _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s)) +_isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s)) # Classifcations of string # 0: neither valid ASCII nor UTF-8 # 1: valid ASCII # 2: valid UTF-8 -function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) - GC.@preserve s byte_string_classify(unsafe_wrap(Vector{UInt8}, s)) -end + byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s)) + 
function byte_string_classify(bytes::Vector{UInt8}) all(c -> iszero(c & 0x80), bytes) && return 1 @@ -315,12 +313,9 @@ function byte_string_classify(bytes::Vector{UInt8}) return ifelse(valid, 2, 0) end -function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) - GC.@preserve s isvalid(String,unsafe_wrap(Vector{UInt8}, s)) -end -isvalid(::Type{String}, bytes::Vector{UInt8}) = @inline _isvalid_utf8(bytes) +isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes) -isvalid(s::String) = isvalid(String, s) +isvalid(s::AbstractString) = isvalid(String, codeunits(s)) is_valid_continuation(c) = c & 0xc0 == 0x80 diff --git a/base/strings/substring.jl b/base/strings/substring.jl index ea132402447be..68a0e7c7c4165 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -100,11 +100,11 @@ function isvalid(s::SubString, i::Integer) @inbounds return ib && isvalid(s.string, s.offset + i)::Bool end -byte_string_classify(s::SubString{String}) = - ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) +# byte_string_classify(s::SubString{String}) = +# ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) -isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0 -isvalid(s::SubString{String}) = isvalid(String, s) +# isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0 +# isvalid(s::SubString{String}) = isvalid(String, s) thisind(s::SubString{String}, i::Int) = _thisind_str(s, i) nextind(s::SubString{String}, i::Int) = _nextind_str(s, i) From f853b463771e7de09f47e9d26983631ea0e094a9 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Mon, 9 Jan 2023 19:29:03 -0500 Subject: [PATCH 18/34] Fix --- base/strings/string.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/base/strings/string.jl b/base/strings/string.jl index 6aa47f189e788..7f8a15cba5622 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -314,6 +314,7 @@ function 
byte_string_classify(bytes::Vector{UInt8}) end isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes) +isvalid(::Type{String}, s::AbstractString) = @inline _isvalid_utf8(codeunits(s)) isvalid(s::AbstractString) = isvalid(String, codeunits(s)) From be9802209e1765728bdd2508d2b7293af4dbd70f Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 10 Jan 2023 16:25:32 -0500 Subject: [PATCH 19/34] Change order of operations --- base/strings/string.jl | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 7f8a15cba5622..dd26dcb504383 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -282,21 +282,18 @@ end const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop -# This function is designed so that you could use it on strings with discontinous memmory layouts -# by only feeding it contiguous block and keeping track of the state inbetween. -# Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was. 
-@propagate_inbounds function _isvalid_utf8_dfa(bytes::AbstractVector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT) - f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1] - op(s, byte_dfa) = byte_dfa >> (s & UInt64(63)) - final_state = mapfoldl(f, op, bytes, init = state) - return (final_state & UInt64(63)) +# The dfa step is broken out so that it may be used in other functions +@inline _utf_dfa_step(state::UInt64, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & UInt64(63) + +@inline function _isvalid_utf8_dfa(state::UInt64, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes)) + for i = first:last + @inbounds state = _utf_dfa_step(state, bytes[i]) + end + return (state) end # This is a shift based utf-8 DFA that works on string that are a contiguous block -function _isvalid_utf8(bytes::AbstractVector{UInt8}) - final_state = _isvalid_utf8_dfa(bytes, _UTF8_DFA_ACCEPT) - return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT -end +_isvalid_utf8(bytes::AbstractVector{UInt8}) = isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s)) From d54656dc76c656111c2a2b96973863d2044fa671 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 10 Jan 2023 17:00:11 -0500 Subject: [PATCH 20/34] fix --- base/strings/string.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index dd26dcb504383..ff2784e62db17 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -293,7 +293,7 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state end # This is a shift based utf-8 DFA that works on string that are a contiguous block -_isvalid_utf8(bytes::AbstractVector{UInt8}) = isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT +_isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT _isvalid_utf8(s::AbstractString) = 
_isvalid_utf8(codeunits(s)) From c40daff66743855dd30bbc86dc3569daa3a87f48 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 10 Jan 2023 18:15:37 -0500 Subject: [PATCH 21/34] Add inlining & fix comments --- base/strings/string.jl | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index ff2784e62db17..3f2696ee25a5e 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -223,7 +223,7 @@ end it contains the number of bit needed to shift the state it is transitioning to shifted into the position of the current state. - Example: character class 1 is encoded in below + Example: character class 1 is encoded as below Current State | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | Next State | 1 | 3 | 3 | 1 | 2 | 1 | 2 | 0 | 1 | 1 | Shift required | 6*1 | 6*3 | 6*3 | 6*1 | 6*2 | 6*1 | 6*2 | 6*0 | 6*1 | 6*1 | @@ -233,11 +233,12 @@ end Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30 so when the next character class is 7 row is in row::UInt64: The reduction operation: - state = byte_dfa >> (state & UInt64(63)) - | Shift to get the next state shift | Mask first 6 bits of starting state to get the current shift ie 30 + state = ( byte_dfa >> state ) & UInt64(63) + | Shift to get the next state shift | Mask the first six bits so that the new state is represended by the shift Would result in the state being 2 which is a shift of 12: - state = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 >> 30 - state = 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100| + (byte_dfa = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 + >> 30 ) => 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100 + & UInt64(63) => 0b0000|000000|000000|000000|000000|000000|000000|000000|000000|000000|001100 =# # Fill the table with 256 UInt64 representing the DFA 
transitions for all bytes @@ -293,9 +294,9 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state end # This is a shift based utf-8 DFA that works on string that are a contiguous block -_isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT +@inline _isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT -_isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s)) +@inline _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s)) # Classifcations of string # 0: neither valid ASCII nor UTF-8 From 394c4fa96410438ceee669f4e248ebf330f6db7b Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 10 Jan 2023 18:37:23 -0500 Subject: [PATCH 22/34] Agressive inlining --- base/strings/string.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 3f2696ee25a5e..5595420ed8373 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -314,7 +314,7 @@ end isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes) isvalid(::Type{String}, s::AbstractString) = @inline _isvalid_utf8(codeunits(s)) -isvalid(s::AbstractString) = isvalid(String, codeunits(s)) +@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s)) is_valid_continuation(c) = c & 0xc0 == 0x80 From a0cdd13df79a7488cda57b3c777b7e7f7915ced4 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Wed, 11 Jan 2023 07:49:37 -0500 Subject: [PATCH 23/34] Whitespace --- base/strings/string.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 5595420ed8373..9f67616f243a5 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -236,7 +236,7 @@ end state = ( byte_dfa >> state ) & UInt64(63) | Shift to get the next state shift | Mask the first six bits so that the new state is 
represended by the shift Would result in the state being 2 which is a shift of 12: - (byte_dfa = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 + (byte_dfa = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 >> 30 ) => 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100 & UInt64(63) => 0b0000|000000|000000|000000|000000|000000|000000|000000|000000|000000|001100 =# From a2691af06ad22651baea7f9e98616c44db0ec837 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Wed, 11 Jan 2023 14:04:52 -0500 Subject: [PATCH 24/34] Remove Commented Code --- base/strings/substring.jl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 68a0e7c7c4165..5ba08ac2f7fff 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -100,12 +100,6 @@ function isvalid(s::SubString, i::Integer) @inbounds return ib && isvalid(s.string, s.offset + i)::Bool end -# byte_string_classify(s::SubString{String}) = -# ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) - -# isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0 -# isvalid(s::SubString{String}) = isvalid(String, s) - thisind(s::SubString{String}, i::Int) = _thisind_str(s, i) nextind(s::SubString{String}, i::Int) = _nextind_str(s, i) From 7dc6a685611b372c339f8dc16ab709340433c24f Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Thu, 12 Jan 2023 12:57:44 -0500 Subject: [PATCH 25/34] Fix Comments --- base/strings/string.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 9f67616f243a5..d5fde47f4c046 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -199,11 +199,12 @@ end Validation States 0 -> UTF8_ACCEPT is the start state and represents a complete UTF-8 String as well ASCII only strings will never leave this state - 1 -> UTF8_INVALID is only reached 
by invalid bytes and once in this state will not + 1 -> UTF8_INVALID is only reached by invalid bytes and once in this state it will not change + as seen by all 1s in that column of table below 2 -> One valid continuation byte needed to return to state 0 3,4,5 -> Two valid continuation bytes needed to return to state 0 6,7,8 -> Three valids continuation bytes needed to return to state 0 - 9 -> Not important and not used which is why it is all ones + 9 -> Not used which is why it always transitions to state 1 Current State 0̲ 1̲ 2̲ 3̲ 4̲ 5̲ 6̲ 7̲ 8̲ 9̲ 0 | 0 1 1 1 1 1 1 1 1 1 @@ -289,8 +290,8 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state @inline function _isvalid_utf8_dfa(state::UInt64, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes)) for i = first:last @inbounds state = _utf_dfa_step(state, bytes[i]) - end - return (state) + end + return (state) end # This is a shift based utf-8 DFA that works on string that are a contiguous block From b4465972e687d6f960cfbda1cf5dceb36b3c3b3c Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Sat, 4 Feb 2023 13:39:29 -0500 Subject: [PATCH 26/34] Changed DFA to track isascii & added state diagram --- base/strings/string.jl | 134 ++++++++++++++++++++++++++--------------- 1 file changed, 86 insertions(+), 48 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index d5fde47f4c046..b1496628b597a 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -194,31 +194,53 @@ end ## checking UTF-8 & ACSII validity ## #= The UTF-8 Validation is performed by a shift based DFA. 
- Using the state machine diagram found @ https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + ┌───────────────────────────────────────────────────────────────────┐ + │ UTF-8 DFA State Diagram ┌──────────────2──────────────┐ │ + │ ├────────3────────┐ │ │ + │ ┌──────────┐ │ ┌─┐ ┌▼┐ │ │ + │ ASCII │ UTF-8 │ ├─5──►│9├───1────► │ │ │ + │ │ │ │ ├─┤ │ │ ┌▼┐ │ + │ │ ┌─0─┐ │ ├─6──►│8├─1,7,9──►4├──1,7,9──► │ │ + │ ┌─0─┐ │ │ │ │ │ ├─┤ │ │ │ │ │ + │ │ │ │ ┌▼───┴┐ │ ├─11─►│7├──7,9───► │ ┌───────►3├─┐ │ + │ ┌▼───┴┐ │ │ │ ▼ │ └─┘ └─┘ │ │ │ │ │ + │ │ 0 ├─────┘ │ 1 ├─► ──┤ │ ┌────► │ │ │ + │ └─────┘ │ │ │ ┌─┐ │ │ └─┘ │ │ + │ └──▲──┘ ├─10─►│5├─────7──────┘ │ │ │ + │ │ │ ├─┤ │ │ │ + │ │ └─4──►│6├─────1,9───────┘ │ │ + │ INVALID │ └─┘ │ │ + │ ┌─*─┐ └──────────────────1,7,9──────────────────┘ │ + │ ┌▼───┴┐ │ + │ │ 2 ◄─── All undefined transitions result in state 2 │ + │ └─────┘ │ + └───────────────────────────────────────────────────────────────────┘ Validation States - 0 -> UTF8_ACCEPT is the start state and represents a complete UTF-8 String as well + 1 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters + If the DFA ends in this state the string is ASCII only + 1 -> _UTF8_DFA_ACCEPT is the start state and represents a complete UTF-8 String as well ASCII only strings will never leave this state - 1 -> UTF8_INVALID is only reached by invalid bytes and once in this state it will not change + 2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change as seen by all 1s in that column of table below - 2 -> One valid continuation byte needed to return to state 0 - 3,4,5 -> Two valid continuation bytes needed to return to state 0 - 6,7,8 -> Three valids continuation bytes needed to return to state 0 - 9 -> Not used which is why it always transitions to state 1 + 3 -> One valid continuation byte needed to return to state 0 + 4,5,6 -> Two valid continuation bytes needed to return to state 0 + 7,8,9 -> 
Three valids continuation bytes needed to return to state 0 + Current State 0̲ 1̲ 2̲ 3̲ 4̲ 5̲ 6̲ 7̲ 8̲ 9̲ - 0 | 0 1 1 1 1 1 1 1 1 1 - 1 | 1 1 0 2 1 2 1 3 3 1 - 2 | 2 1 1 1 1 1 1 1 1 1 - 3 | 3 1 1 1 1 1 1 1 1 1 - 4 | 5 1 1 1 1 1 1 1 1 1 - Character 5 | 8 1 1 1 1 1 1 1 1 1 <- Next State - Class 6 | 7 1 1 1 1 1 1 1 1 1 - 7 | 1 1 0 2 2 1 3 3 1 1 - 8 | 1 1 1 1 1 1 1 1 1 1 - 9 | 1 1 0 2 1 2 3 3 1 1 - 10 | 4 1 1 1 1 1 1 1 1 1 - 11 | 6 1 1 1 1 1 1 1 1 1 + 0 | 0 1 2 2 2 2 2 2 2 2 + 1 | 2 2 2 1 3 2 3 2 4 4 + 2 | 3 3 2 2 2 2 2 2 2 2 + 3 | 4 4 2 2 2 2 2 2 2 2 + 4 | 6 6 2 2 2 2 2 2 2 2 + Character 5 | 9 9 2 2 2 2 2 2 2 2 <- Next State + Class 6 | 8 8 2 2 2 2 2 2 2 2 + 7 | 2 2 2 1 3 3 2 4 4 2 + 8 | 2 2 2 2 2 2 2 2 2 2 + 9 | 2 2 2 1 3 2 3 4 4 2 + 10 | 5 5 2 2 2 2 2 2 2 2 + 11 | 7 7 2 2 2 2 2 2 2 2 Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that it contains the number of bit needed to shift the state it is transitioning to shifted into @@ -226,19 +248,19 @@ end Example: character class 1 is encoded as below Current State | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - Next State | 1 | 3 | 3 | 1 | 2 | 1 | 2 | 0 | 1 | 1 | - Shift required | 6*1 | 6*3 | 6*3 | 6*1 | 6*2 | 6*1 | 6*2 | 6*0 | 6*1 | 6*1 | - | 6 | 18 | 18 | 6 | 12 | 6 | 12 | 0 | 6 | 6 | - UInt64(113232530780455302) = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 + Next State | 4 | 4 | 2 | 3 | 2 | 3 | 1 | 2 | 2 | 2 | + Shift required | 6*4 | 6*4 | 6*2 | 6*3 | 6*2 | 6*3 | 6*1 | 6*2 | 6*2 | 6*2 | + | 24 | 24 | 12 | 18 | 12 | 18 | 6 | 12 | 12 | 12 | + UInt64(0x061831231218c30c) = 0b0000|011000|011000|001100|010010|001100|010010|000110|001100|001100|001100 Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30 - so when the next character class is 7 row is in row::UInt64: + so when the next character class is 1 the new state is obtained by the following operations: The reduction operation: state = ( byte_dfa >> 
state ) & UInt64(63) | Shift to get the next state shift | Mask the first six bits so that the new state is represended by the shift Would result in the state being 2 which is a shift of 12: - (byte_dfa = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 - >> 30 ) => 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100 + (byte_dfa = 0b0000|011000|011000|001100|010010|001100|010010|000110|001100|001100|001100 + >> 30 ) => 0b0000|000000|000000|000000|000000|000000|011000|011000|001100|010010|001100 & UInt64(63) => 0b0000|000000|000000|000000|000000|000000|000000|000000|000000|000000|001100 =# @@ -247,23 +269,37 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas num_classes=12 num_states=10 bit_per_state = 6 - # class_repeats represents the 256 byte's classes by storing the (class, #of repeats) - class_repeats = [ (0, 128), (1, 16), (9, 16), (7, 32), (8, 2), (2, 30), (10, 1), - (3, 12), (4, 1), (3, 2), (11, 1), (6, 3), (5, 1), (8, 11)] + + character_classes = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, + 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ] # These are the rows discussed in comments above - state_arrays = [[ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 1, 1, 0, 2, 1, 2, 1, 3, 3, 1], - [ 2, 
1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 7, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1], - [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1], - [ 4, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]] + state_arrays = [[ 0 1 2 2 2 2 2 2 2 2], + [ 2 2 2 1 3 2 3 2 4 4], + [ 3 3 2 2 2 2 2 2 2 2], + [ 4 4 2 2 2 2 2 2 2 2], + [ 6 6 2 2 2 2 2 2 2 2], + [ 9 9 2 2 2 2 2 2 2 2], + [ 8 8 2 2 2 2 2 2 2 2], + [ 2 2 2 1 3 3 2 4 4 2], + [ 2 2 2 2 2 2 2 2 2 2], + [ 2 2 2 1 3 2 3 4 4 2], + [ 5 5 2 2 2 2 2 2 2 2], + [ 7 7 2 2 2 2 2 2 2 2]] #This converts the state_arrays into the shift encoded UInt64 class_row = zeros(UInt64, num_classes) @@ -277,12 +313,13 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas end class_row[i]=row end - mapreduce(t->fill(class_row[t[1]+1],t[2]),vcat,class_repeats) + map(c->class_row[c+1],character_classes) end -const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string -const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop +const _UTF8_DFA_ASCII = UInt64(0) #This state represents the start and end of any valid string +const _UTF8_DFA_ACCEPT = UInt64(6) #This state represents the start and end of any valid string +const _UTF8_DFA_INVALID = UInt64(12) # If the state machine is ever in this state just stop # The dfa step is broken out so that it may be used in other functions @inline _utf_dfa_step(state::UInt64, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & UInt64(63) @@ -295,7 +332,7 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state end # This is a shift based utf-8 DFA that works on string that are a contiguous block -@inline _isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT +@inline 
_isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ASCII, bytes) <= _UTF8_DFA_ACCEPT # <= covers _UTF8_DFA_ASCII as well @inline _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s)) @@ -307,9 +344,10 @@ end function byte_string_classify(bytes::Vector{UInt8}) - all(c -> iszero(c & 0x80), bytes) && return 1 - valid = _isvalid_utf8(bytes) - return ifelse(valid, 2, 0) + state = _isvalid_utf8_dfa(_UTF8_DFA_ASCII, bytes) + state == _UTF8_DFA_ASCII && return 1 + state == _UTF8_DFA_ACCEPT && return 2 + return 0 end isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes) From b989d0d5683ad540b84efff80b2acd6394e5995a Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Sat, 4 Feb 2023 14:25:37 -0500 Subject: [PATCH 27/34] Fix states discription --- base/strings/string.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index b1496628b597a..c08f94efdee1e 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -217,10 +217,9 @@ end └───────────────────────────────────────────────────────────────────┘ Validation States - 1 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters + 0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters If the DFA ends in this state the string is ASCII only - 1 -> _UTF8_DFA_ACCEPT is the start state and represents a complete UTF-8 String as well - ASCII only strings will never leave this state + 1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character 2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change as seen by all 1s in that column of table below 3 -> One valid continuation byte needed to return to state 0 From 27952508fce1a39096db08990032f505ac7c540f Mon Sep 17 00:00:00 2001 From: 
Nicholas R Dinsmore Date: Mon, 6 Feb 2023 10:57:12 -0500 Subject: [PATCH 28/34] Add tests to validate DFA --- test/strings/basic.jl | 152 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index e1d6e9dd60491..7ce2b5fdc3b39 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -1234,3 +1234,155 @@ end end @test_throws ArgumentError Symbol("a\0a") end + +@testset "Ensure UTF-8 DFA can never leave invalid state" begin + for b = typemin(UInt8):typemax(UInt8) + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_INVALID,[b],1,1) == Base._UTF8_DFA_INVALID + end +end +@testset "Ensure UTF-8 DFA stays in ASCII State for all ASCII" begin + for b = 0x00:0x7F + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_ASCII + end +end + +@testset "Validate UTF-8 DFA" begin + # Unicode 15 + # Table 3-7. Well-Formed UTF-8 Byte Sequences + + table_rows = [ [0x00:0x7F], + [0xC2:0xDF,0x80:0xBF], + [0xE0:0xE0,0xA0:0xBF,0x80:0xBF], + [0xE1:0xEC,0x80:0xBF,0x80:0xBF], + [0xED:0xED,0x80:0x9F,0x80:0xBF], + [0xEE:0xEF,0x80:0xBF,0x80:0xBF], + [0xF0:0xF0,0x90:0xBF,0x80:0xBF,0x80:0xBF], + [0xF1:0xF3,0x80:0xBF,0x80:0xBF,0x80:0xBF], + [0xF4:0xF4,0x80:0x8F,0x80:0xBF,0x80:0xBF]] + invalid_first_bytes = union(0xC0:0xC1,0xF5:0xFF,0x80:0xBF) + + valid_first_bytes = union(collect(first(r) for r in table_rows)...) 
+ + + + # Prove that the first byte sets in the table & invalid cover all bytes + @test length(union(valid_first_bytes,invalid_first_bytes)) == 256 + @test length(intersect(valid_first_bytes,invalid_first_bytes)) == 0 + + #Check the ASCII range + for b = 0x00:0x7F + #Test from both UTF-8 state and ascii state + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) == Base._UTF8_DFA_ACCEPT + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_ASCII + end + + #Check the remaining first bytes + for b = 0x80:0xFF + if b ∈ invalid_first_bytes + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) == Base._UTF8_DFA_INVALID + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_INVALID + else + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) != Base._UTF8_DFA_INVALID + @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) != Base._UTF8_DFA_INVALID + end + end + + # Check two byte Sequences + for table_row in [table_rows[2]] + b1 = first(table_row[1]) + state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + @test state1 == state2 + #Prove that all the first bytes in a row give same state + for b1 in table_row[1] + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + end + b1 = first(table_row[1]) + #Prove that all valid second bytes return correct state + for b2 = table_row[2] + @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + for b2 = setdiff(0x00:0xFF,table_row[2]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + end + + # Check three byte Sequences + for table_row in table_rows[3:6] + b1 = first(table_row[1]) + state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + @test state1 == state2 + 
#Prove that all the first bytes in a row give same state + for b1 in table_row[1] + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + end + + b1 = first(table_row[1]) + b2 = first(table_row[2]) + #Prove that all valid second bytes return same state + state2 = Base._isvalid_utf8_dfa(state1,[b2],1,1) + for b2 = table_row[2] + @test state2 == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + for b2 = setdiff(0x00:0xFF,table_row[2]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + + b2 = first(table_row[2]) + #Prove that all valid third bytes return correct state + for b3 = table_row[3] + @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state2,[b3],1,1) + end + for b3 = setdiff(0x00:0xFF,table_row[3]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state2,[b3],1,1) + end + end + + # Check Four byte Sequences + for table_row in table_rows[7:9] + b1 = first(table_row[1]) + state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + @test state1 == state2 + #Prove that all the first bytes in a row give same state + for b1 in table_row[1] + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1) + @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1) + end + + b1 = first(table_row[1]) + b2 = first(table_row[2]) + #Prove that all valid second bytes return same state + state2 = Base._isvalid_utf8_dfa(state1,[b2],1,1) + for b2 = table_row[2] + @test state2 == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + for b2 = setdiff(0x00:0xFF,table_row[2]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1) + end + + + b2 = first(table_row[2]) + b3 = first(table_row[3]) + state3 = Base._isvalid_utf8_dfa(state2,[b3],1,1) + #Prove that all valid third bytes return same state + for b3 = table_row[3] + @test state3 == 
Base._isvalid_utf8_dfa(state2,[b3],1,1) + end + for b3 = setdiff(0x00:0xFF,table_row[3]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state2,[b3],1,1) + end + + b3 = first(table_row[3]) + #Prove that all valid forth bytes return correct state + for b4 = table_row[4] + @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state3,[b4],1,1) + end + for b4 = setdiff(0x00:0xFF,table_row[4]) + @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state3,[b4],1,1) + end + end +end \ No newline at end of file From bc0e662519f31eb5f121eed258b11b9d10d61b95 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Mon, 6 Feb 2023 14:39:27 -0500 Subject: [PATCH 29/34] Trailing newline --- test/strings/basic.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 7ce2b5fdc3b39..602c38551f6d8 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -1240,7 +1240,7 @@ end @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_INVALID,[b],1,1) == Base._UTF8_DFA_INVALID end end -@testset "Ensure UTF-8 DFA stays in ASCII State for all ASCII" begin +@testset "Ensure UTF-8 DFA stays in ASCII State for all ASCII" begin for b = 0x00:0x7F @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_ASCII end @@ -1385,4 +1385,4 @@ end @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state3,[b4],1,1) end end -end \ No newline at end of file +end From 557bda642b8db91f63305c9759470009c11e4b11 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 14 Feb 2023 11:27:20 -0500 Subject: [PATCH 30/34] State to UInt32 & use SMTsolver derived shifts --- base/strings/string.jl | 63 ++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index c08f94efdee1e..dd845e5277866 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -241,34 +241,29 @@ end 10 | 5 5 2 2 2 2 2 2 2 2 11 | 7 7 2 2 
2 2 2 2 2 2 - Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that - it contains the number of bit needed to shift the state it is transitioning to shifted into - the position of the current state. - - Example: character class 1 is encoded as below - Current State | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - Next State | 4 | 4 | 2 | 3 | 2 | 3 | 1 | 2 | 2 | 2 | - Shift required | 6*4 | 6*4 | 6*2 | 6*3 | 6*2 | 6*3 | 6*1 | 6*2 | 6*2 | 6*2 | - | 24 | 24 | 12 | 18 | 12 | 18 | 6 | 12 | 12 | 12 | - UInt64(0x061831231218c30c) = 0b0000|011000|011000|001100|010010|001100|010010|000110|001100|001100|001100 - - Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30 - so when the next character class is 1 the new state is obtained by the following operations: - The reduction operation: - state = ( byte_dfa >> state ) & UInt64(63) - | Shift to get the next state shift | Mask the first six bits so that the new state is represended by the shift - Would result in the state being 2 which is a shift of 12: - (byte_dfa = 0b0000|011000|011000|001100|010010|001100|010010|000110|001100|001100|001100 - >> 30 ) => 0b0000|000000|000000|000000|000000|000000|011000|011000|001100|010010|001100 - & UInt64(63) => 0b0000|000000|000000|000000|000000|000000|000000|000000|000000|000000|001100 + Shifts | 0 4 10 14 18 24 8 20 12 26 + + The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into + the rows the correct shift was a result. + + Each character class row is encoding 10 states with shifts as defined above.
Shifting the bits of a row by + the current state and then masking the result with 0b11110 gives the shift for the new state + + =# +#State type used by UTF-8 DFA +const _UTF8DFAState = UInt32 # Fill the table with 256 UInt64 representing the DFA transitions for all bytes const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base num_classes=12 num_states=10 bit_per_state = 6 + + # These shifts were derived using a SMT solver + state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26] + character_classes = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -283,8 +278,8 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, - 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ] + 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, + 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ] # These are the rows discussed in comments above state_arrays = [[ 0 1 2 2 2 2 2 2 2 2], @@ -300,30 +295,32 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas [ 5 5 2 2 2 2 2 2 2 2], [ 7 7 2 2 2 2 2 2 2 2]] - #This converts the state_arrays into the shift encoded UInt64 - class_row = zeros(UInt64, num_classes) + #This converts the state_arrays into the shift encoded _UTF8DFAState + class_row = zeros(_UTF8DFAState, num_classes) + for i = 1:num_classes - row = UInt64(0) + row = _UTF8DFAState(0) for j in 1:num_states #Calculate the shift required for the next state - to_shift = UInt8((state_arrays[i][j]) * bit_per_state) + to_shift = UInt8((state_shifts[state_arrays[i][j]+1]) ) #Shift the next state into the position of the current state - row = row |
(_UTF8DFAState(to_shift) << state_shifts[j]) end class_row[i]=row end + map(c->class_row[c+1],character_classes) end -const _UTF8_DFA_ASCII = UInt64(0) #This state represents the start and end of any valid string -const _UTF8_DFA_ACCEPT = UInt64(6) #This state represents the start and end of any valid string -const _UTF8_DFA_INVALID = UInt64(12) # If the state machine is ever in this state just stop +const _UTF8_DFA_ASCII = _UTF8DFAState(0) #This state represents the start and end of any valid string +const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This state represents the start and end of any valid string +const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop -# The dfa step is broken out so that it may be used in other functions -@inline _utf_dfa_step(state::UInt64, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & UInt64(63) +# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above +@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E) -@inline function _isvalid_utf8_dfa(state::UInt64, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes)) +@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes)) for i = first:last @inbounds state = _utf_dfa_step(state, bytes[i]) end From aaf886371f02fc0cdb77c1f0b1020470ce6b54ef Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Tue, 14 Mar 2023 18:21:57 -0400 Subject: [PATCH 31/34] Add Chunk based byte_string_classify --- base/strings/string.jl | 57 +++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index dd845e5277866..607ef8f1dfdb9 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -260,7 +260,6 @@ const 
_UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas num_states=10 bit_per_state = 6 - # These shifts were derived using a SMT solver state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26] @@ -327,10 +326,20 @@ const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in th return (state) end -# This is a shift based utf-8 DFA that works on string that are a contiguous block -@inline _isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ASCII, bytes) <= _UTF8_DFA_ACCEPT # <= covers _UTF8_DFA_ASCII as well - -@inline _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s)) +@inline function _find_nonascii_chunk(cu::AbstractVector{UInt8}, first::Int, last::Int) + chunk_size = 256 + epilog_bytes = rem(last - first + 1, chunk_size) + start = first + chunk_last = last - epilog_bytes + start > last && return nothing + for start = start:chunk_size:chunk_last + _isascii(cu, start, start + chunk_size - 1) || return start + end + start = chunk_last + 1 + ((start <= last) && _isascii(cu, start, last)) || return start + return nothing +end +## # Classifcations of string # 0: neither valid ASCII nor UTF-8 @@ -339,15 +348,39 @@ end byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s)) -function byte_string_classify(bytes::Vector{UInt8}) - state = _isvalid_utf8_dfa(_UTF8_DFA_ASCII, bytes) - state == _UTF8_DFA_ASCII && return 1 - state == _UTF8_DFA_ACCEPT && return 2 - return 0 +function byte_string_classify(bytes::AbstractVector{UInt8}) + n = length(bytes) + start = _find_nonascii_chunk(bytes,1,n) + isnothing(start) && return 1 + + return _byte_string_classify_nonascii(bytes,start,n) +end + +function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int) + chunk_size = 256 + + start = first + stop = min(last,first + chunk_size - 1) + state = _UTF8_DFA_ACCEPT + while start <= last + # Process non ascii chunk + state = _isvalid_utf8_dfa(state,bytes,start,stop) + state 
== _UTF8_DFA_INVALID && return 0 + + start = start + chunk_size + stop = min(last,stop + chunk_size) + # try to process ascii chunks + while state == _UTF8_DFA_ACCEPT + _isascii(bytes,start,stop) || break + (start = start + chunk_size) <= last || break + stop = min(last,stop + chunk_size) + end + end + return ifelse(state == _UTF8_DFA_ACCEPT,2,0) end -isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes) -isvalid(::Type{String}, s::AbstractString) = @inline _isvalid_utf8(codeunits(s)) +isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0 +isvalid(::Type{String}, s::AbstractString) = (@inline byte_string_classify(s)) ≠ 0 @inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s)) From 8006d60d896b374825887e1c026df51162bfb836 Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Thu, 23 Mar 2023 12:39:14 -0400 Subject: [PATCH 32/34] Chunk based processing --- base/strings/string.jl | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 607ef8f1dfdb9..e75d984739571 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -326,19 +326,17 @@ const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in th return (state) end -@inline function _find_nonascii_chunk(cu::AbstractVector{UInt8}, first::Int, last::Int) - chunk_size = 256 - epilog_bytes = rem(last - first + 1, chunk_size) - start = first - chunk_last = last - epilog_bytes - start > last && return nothing - for start = start:chunk_size:chunk_last - _isascii(cu, start, start + chunk_size - 1) || return start +@inline function _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU} + n=first + while n <= last - chunk_size + _isascii(cu,n,n+chunk_size-1) || return n + n += chunk_size end - start = chunk_last + 1 - ((start <= last) && _isascii(cu, start, last)) || 
return start + n= last-chunk_size+1 + _isascii(cu,n,last) || return n return nothing end + ## # Classifcations of string @@ -349,10 +347,16 @@ end function byte_string_classify(bytes::AbstractVector{UInt8}) + chunk_size = 1024 + chunk_threshold = chunk_size + (chunk_size ÷ 2) n = length(bytes) - start = _find_nonascii_chunk(bytes,1,n) - isnothing(start) && return 1 - + if n > chunk_threshold + start = _find_nonascii_chunk(chunk_size,bytes,1,n) + isnothing(start) && return 1 + else + _isascii(bytes,1,n) && return 1 + start = 1 + end return _byte_string_classify_nonascii(bytes,start,n) end @@ -363,18 +367,18 @@ function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int stop = min(last,first + chunk_size - 1) state = _UTF8_DFA_ACCEPT while start <= last - # Process non ascii chunk - state = _isvalid_utf8_dfa(state,bytes,start,stop) - state == _UTF8_DFA_INVALID && return 0 - - start = start + chunk_size - stop = min(last,stop + chunk_size) # try to process ascii chunks while state == _UTF8_DFA_ACCEPT _isascii(bytes,start,stop) || break (start = start + chunk_size) <= last || break stop = min(last,stop + chunk_size) end + # Process non ascii chunk + state = _isvalid_utf8_dfa(state,bytes,start,stop) + state == _UTF8_DFA_INVALID && return 0 + + start = start + chunk_size + stop = min(last,stop + chunk_size) end return ifelse(state == _UTF8_DFA_ACCEPT,2,0) end From a6e338384757d13d42ef108b7a342566be7e8fa9 Mon Sep 17 00:00:00 2001 From: ndinsmore <45537276+ndinsmore@users.noreply.github.com> Date: Fri, 7 Apr 2023 11:34:04 -0400 Subject: [PATCH 33/34] Update base/strings/string.jl Co-authored-by: Steven G. 
Johnson --- base/strings/string.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index e75d984739571..a8d6907e9a78f 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -319,7 +319,7 @@ const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in th # The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above @inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E) -@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes)) +@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes)) for i = first:last @inbounds state = _utf_dfa_step(state, bytes[i]) end From d1a129c5e59cd581f0078e4bfb17557fb6fe197b Mon Sep 17 00:00:00 2001 From: Nicholas R Dinsmore Date: Mon, 10 Apr 2023 11:49:26 -0400 Subject: [PATCH 34/34] Change State Arrays to a matrix --- base/strings/string.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index e75d984739571..6040c8254b33a 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -281,18 +281,18 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ] # These are the rows discussed in comments above - state_arrays = [[ 0 1 2 2 2 2 2 2 2 2], - [ 2 2 2 1 3 2 3 2 4 4], - [ 3 3 2 2 2 2 2 2 2 2], - [ 4 4 2 2 2 2 2 2 2 2], - [ 6 6 2 2 2 2 2 2 2 2], - [ 9 9 2 2 2 2 2 2 2 2], - [ 8 8 2 2 2 2 2 2 2 2], - [ 2 2 2 1 3 3 2 4 4 2], - [ 2 2 2 2 2 2 2 2 2 2], - [ 2 2 2 1 3 2 3 4 4 2], - [ 5 5 2 2 2 2 2 2 2 2], - [ 7 7 2 2 2 2 2 2 2 2]] + state_arrays = [ 0 1 2 2 2 2 2 2 2 2; + 2 2 
2 1 3 2 3 2 4 4; + 3 3 2 2 2 2 2 2 2 2; + 4 4 2 2 2 2 2 2 2 2; + 6 6 2 2 2 2 2 2 2 2; + 9 9 2 2 2 2 2 2 2 2; + 8 8 2 2 2 2 2 2 2 2; + 2 2 2 1 3 3 2 4 4 2; + 2 2 2 2 2 2 2 2 2 2; + 2 2 2 1 3 2 3 4 4 2; + 5 5 2 2 2 2 2 2 2 2; + 7 7 2 2 2 2 2 2 2 2] #This converts the state_arrays into the shift encoded _UTF8DFAState class_row = zeros(_UTF8DFAState, num_classes) @@ -301,7 +301,7 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas row = _UTF8DFAState(0) for j in 1:num_states #Calculate the shift required for the next state - to_shift = UInt8((state_shifts[state_arrays[i][j]+1]) ) + to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) ) #Shift the next state into the position of the current state row = row | (_UTF8DFAState(to_shift) << state_shifts[j]) end