JuliaLang · oscardssmith · Apr 12, 2023 · Dec 12, 2022 · Dec 12, 2022 · Dec 13, 2022
diff --git a/base/strings/string.jl b/base/strings/string.jl
@@ -98,6 +98,7 @@ String(s::AbstractString) = print_to_string(s)
 @assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
 
 unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
+# Wrap a contiguous view of a byte vector as a `Vector{UInt8}` built from the view's
+# pointer and size.
+function unsafe_wrap(::Type{Vector{UInt8}}, s::FastContiguousSubArray{UInt8,1,Vector{UInt8}})
+    return unsafe_wrap(Vector{UInt8}, pointer(s), size(s))
+end
 
 Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
 Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
@@ -191,15 +192,201 @@ end
 end
 
 ## checking UTF-8 & ACSII validity ##
+#=
+    The UTF-8 Validation is performed by a shift based DFA.
+    ┌───────────────────────────────────────────────────────────────────┐
+    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
+    │                               ├────────3────────┐           │     │
+    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
+    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
+    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
+    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
+    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
+    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
+    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
+    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
+    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
+    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
+    │                      │        │     ├─┤               │        │  │
+    │                      │        └─4──►│6├─────1,9───────┘        │  │
+    │          INVALID     │              └─┘                        │  │
+    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
+    │          ┌▼───┴┐                                                  │
+    │          │  2  ◄─── All undefined transitions result in state 2   │
+    │          └─────┘                                                  │
+    └───────────────────────────────────────────────────────────────────┘
+
+        Validation States
+            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
+                        If the DFA ends in this state the string is ASCII only
+            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
+            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
+                    as seen by all 2s in that column of the table below
+            3 -> One valid continuation byte needed to return to state 1
+        4,5,6 -> Two valid continuation bytes needed to return to state 1
+        7,8,9 -> Three valid continuation bytes needed to return to state 1
+
+                        Current State
+                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
+                0 | 0  1  2  2  2  2  2  2  2  2
+                1 | 2  2  2  1  3  2  3  2  4  4
+                2 | 3  3  2  2  2  2  2  2  2  2
+                3 | 4  4  2  2  2  2  2  2  2  2
+                4 | 6  6  2  2  2  2  2  2  2  2
+    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
+    Class       6 | 8  8  2  2  2  2  2  2  2  2
+                7 | 2  2  2  1  3  3  2  4  4  2
+                8 | 2  2  2  2  2  2  2  2  2  2
+                9 | 2  2  2  1  3  2  3  4  4  2
+               10 | 5  5  2  2  2  2  2  2  2  2
+               11 | 7  7  2  2  2  2  2  2  2  2
+
+           Shifts | 0  4 10 14 18 24  8 20 12 26
+
+    The shifts that represent each state were derived using the SMT solver Z3, to ensure that once the
+    next-state shifts are encoded into the rows, extracting them yields the correct shift.
+
+    Each character class row encodes 10 states with the shifts defined above. Shifting the bits of a row by
+    the current state and then masking the result with 0b11110 (0x1E) gives the shift for the new state.
+
+
+=#
+
+# State type used by the UTF-8 DFA. A state is represented by its shift amount, and each
+# table entry packs one full transition row into a value of this type.
+const _UTF8DFAState = UInt32
+# Build the 256-entry lookup table (one `_UTF8DFAState` per byte value) that encodes the
+# DFA transitions for every possible byte.
+const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
+    # These shifts were derived using a SMT solver; entry `s + 1` is the shift that
+    # represents state `s`.
+    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
+
+    # Character class of every byte value 0x00:0xFF (16 byte values per line).
+    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
+                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
+
+    # These are the rows discussed in comments above: row `c + 1` holds, for character
+    # class `c`, the next state for each current state (column `s + 1` is state `s`).
+    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
+                     2  2  2  1  3  2  3  2  4  4;
+                     3  3  2  2  2  2  2  2  2  2;
+                     4  4  2  2  2  2  2  2  2  2;
+                     6  6  2  2  2  2  2  2  2  2;
+                     9  9  2  2  2  2  2  2  2  2;
+                     8  8  2  2  2  2  2  2  2  2;
+                     2  2  2  1  3  3  2  4  4  2;
+                     2  2  2  2  2  2  2  2  2  2;
+                     2  2  2  1  3  2  3  4  4  2;
+                     5  5  2  2  2  2  2  2  2  2;
+                     7  7  2  2  2  2  2  2  2  2]
+
+    # Take the dimensions from the matrix itself so they cannot drift out of sync with it.
+    num_classes, num_states = size(state_arrays)
+
+    # This converts the state_arrays into the shift encoded _UTF8DFAState
+    class_row = zeros(_UTF8DFAState, num_classes)
+
+    for i in 1:num_classes
+        row = _UTF8DFAState(0)
+        for j in 1:num_states
+            # Shift that represents the next state for (class i-1, current state j-1)
+            to_shift = _UTF8DFAState(state_shifts[state_arrays[i, j] + 1])
+            # Store it at the bit position selected by the current state's own shift
+            row |= to_shift << state_shifts[j]
+        end
+        class_row[i] = row
+    end
+
+    # Expand the per-class rows to one row per byte value.
+    map(c -> class_row[c + 1], character_classes)
+end
+
+
+const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Shift of state 0: the start state; the DFA only remains here while every byte seen so far is ASCII
+const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # Shift of state 1: a complete valid character has just ended after non-ASCII input was encountered
+const _UTF8_DFA_INVALID = _UTF8DFAState(10) # Shift of state 2: if the state machine is ever in this state just stop
+
+# Advance the DFA by a single byte: fetch the transition row for `byte`, shift it by the
+# current state and mask out the shift that represents the next state. It lives in its own
+# function so that other routines can reuse it; the mask was calculated to work with the
+# state shifts used to build `_UTF8_DFA_TABLE`.
+@inline function _utf_dfa_step(state::_UTF8DFAState, byte::UInt8)
+    transitions = @inbounds _UTF8_DFA_TABLE[byte + 1]
+    return (transitions >> state) & _UTF8DFAState(0x0000001E)
+end
+
+# Run the DFA over `bytes[first:last]`, starting from `state`, and return the state it ends
+# in. The range defaults to the whole vector. Indexing is done `@inbounds`, so the caller is
+# responsible for passing a range that lies inside `bytes`.
+@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
+    for idx in first:last
+        byte = @inbounds bytes[idx]
+        state = _utf_dfa_step(state, byte)
+    end
+    return state
+end
+
+# Scan `cu[first:last]` in windows of `chunk_size` code units and return the starting index
+# of the first window that `_isascii` rejects, or `nothing` when every window passes.
+# The last window is anchored at `last`, so it may overlap the window before it; its start
+# is clamped to `first` so that a range shorter than `chunk_size` is never read before `first`.
+@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
+    n = first
+    # Full windows that fit strictly before the tail window
+    while n <= last - chunk_size
+        _isascii(cu, n, n + chunk_size - 1) || return n
+        n += chunk_size
+    end
+    # Tail window: one more window ending exactly at `last`
+    n = max(first, last - chunk_size + 1)
+    _isascii(cu, n, last) || return n
+    return nothing
+end
+
+##
 
-byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
-    ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
+# Classifications of a string:
     # 0: neither valid ASCII nor UTF-8
     # 1: valid ASCII
     # 2: valid UTF-8
+# Any string is classified through its code units.
+function byte_string_classify(s::AbstractString)
+    return byte_string_classify(codeunits(s))
+end
+
+
+# Classify a byte vector: returns 1 when `_isascii` accepts all of it, otherwise the result
+# of `_byte_string_classify_nonascii` (0 or 2) for the part that still has to be validated.
+# Long inputs are first scanned in large chunks so that validation can start at the first
+# chunk containing non-ASCII data instead of at the beginning.
+# Index bounds are taken from the vector itself so that vectors whose first index is not 1
+# are handled correctly.
+function byte_string_classify(bytes::AbstractVector{UInt8})
+    chunk_size = 1024
+    chunk_threshold = chunk_size + (chunk_size ÷ 2)
+    lo = firstindex(bytes)
+    hi = lastindex(bytes)
+    if length(bytes) > chunk_threshold
+        # Long input: locate the first chunk that is not pure ASCII
+        start = _find_nonascii_chunk(chunk_size, bytes, lo, hi)
+        isnothing(start) && return 1
+    else
+        # Short input: a single ASCII check over everything
+        _isascii(bytes, lo, hi) && return 1
+        start = lo
+    end
+    return _byte_string_classify_nonascii(bytes, start, hi)
+end
+
+function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
+    chunk_size = 256 # number of bytes examined per step
+    # Classify bytes[first:last] chunk by chunk: returns 0 as soon as the DFA reaches the invalid state, otherwise 2 if the DFA ends in the accept state and 0 if it does not
+    start = first
+    stop = min(last,first + chunk_size - 1)
+    state = _UTF8_DFA_ACCEPT
+    while start <= last
+        # Fast path: while the DFA is in the accept state, skip whole chunks that `_isascii` accepts
+        while state == _UTF8_DFA_ACCEPT
+            _isascii(bytes,start,stop) || break # chunk is not pure ASCII: hand it to the DFA below
+            (start = start + chunk_size) <= last || break # advance; leave once the input is exhausted
+            stop = min(last,stop + chunk_size)
+        end
+        # Run the DFA over the current chunk (start:stop is an empty range when the fast path consumed all input)
+        state = _isvalid_utf8_dfa(state,bytes,start,stop)
+        state == _UTF8_DFA_INVALID && return 0
+        # Move on to the next chunk
+        start = start + chunk_size
+        stop = min(last,stop + chunk_size)
+    end
+    return ifelse(state == _UTF8_DFA_ACCEPT,2,0) # any end state other than accept is classified as 0
+end
+
+# A byte vector is a valid `String` when its classification is non-zero.
+function isvalid(::Type{String}, bytes::AbstractVector{UInt8})
+    classification = @inline byte_string_classify(bytes)
+    return classification != 0
+end
+# Strings are judged the same way, through the string method of `byte_string_classify`.
+function isvalid(::Type{String}, s::AbstractString)
+    classification = @inline byte_string_classify(s)
+    return classification != 0
+end
 
-isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0
-isvalid(s::String) = isvalid(String, s)
+# Validate any string by checking its code units with the `String` method of `isvalid`.
+@inline function isvalid(s::AbstractString)
+    return @inline isvalid(String, codeunits(s))
+end
 
 is_valid_continuation(c) = c & 0xc0 == 0x80
 

diff --git a/base/strings/substring.jl b/base/strings/substring.jl
@@ -100,12 +100,6 @@ function isvalid(s::SubString, i::Integer)
     @inbounds return ib && isvalid(s.string, s.offset + i)::Bool
 end
 
-byte_string_classify(s::SubString{String}) =
-    ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
-
-isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0
-isvalid(s::SubString{String}) = isvalid(String, s)
-
 thisind(s::SubString{String}, i::Int) = _thisind_str(s, i)
 nextind(s::SubString{String}, i::Int) = _nextind_str(s, i)
 

diff --git a/test/strings/basic.jl b/test/strings/basic.jl
@@ -1234,3 +1234,155 @@ end
     end
     @test_throws ArgumentError Symbol("a\0a")
 end
+
+@testset "Ensure UTF-8 DFA can never leave invalid state" begin
+    # Feed every possible byte to a DFA that is already in the invalid state.
+    invalid = Base._UTF8_DFA_INVALID
+    for byte in 0x00:0xFF
+        @test Base._isvalid_utf8_dfa(invalid, [byte], 1, 1) == invalid
+    end
+end
+@testset "Ensure  UTF-8 DFA stays in ASCII State for all ASCII" begin
+    # Every ASCII byte must keep a DFA that starts in the ASCII state in that state.
+    ascii_state = Base._UTF8_DFA_ASCII
+    for byte in 0x00:0x7F
+        @test Base._isvalid_utf8_dfa(ascii_state, [byte], 1, 1) == ascii_state
+    end
+end
+
+@testset "Validate UTF-8 DFA" begin
+    # Unicode 15
+    # Table 3-7. Well-Formed UTF-8 Byte Sequences
+    # Each row lists, position by position, the byte ranges of one family of
+    # well-formed sequences.
+    table_rows = [  [0x00:0x7F],
+                    [0xC2:0xDF,0x80:0xBF],
+                    [0xE0:0xE0,0xA0:0xBF,0x80:0xBF],
+                    [0xE1:0xEC,0x80:0xBF,0x80:0xBF],
+                    [0xED:0xED,0x80:0x9F,0x80:0xBF],
+                    [0xEE:0xEF,0x80:0xBF,0x80:0xBF],
+                    [0xF0:0xF0,0x90:0xBF,0x80:0xBF,0x80:0xBF],
+                    [0xF1:0xF3,0x80:0xBF,0x80:0xBF,0x80:0xBF],
+                    [0xF4:0xF4,0x80:0x8F,0x80:0xBF,0x80:0xBF]]
+    invalid_first_bytes = union(0xC0:0xC1,0xF5:0xFF,0x80:0xBF)
+
+    valid_first_bytes = union(collect(first(r) for r in table_rows)...)
+
+    # Prove that the first byte sets in the table & invalid cover all bytes
+    @test length(union(valid_first_bytes,invalid_first_bytes)) == 256
+    @test length(intersect(valid_first_bytes,invalid_first_bytes)) == 0
+
+    # Run a single byte through the DFA, starting from `state`
+    dfa_step(state, byte) = Base._isvalid_utf8_dfa(state, [byte], 1, 1)
+
+    # Check the ASCII range
+    for b = 0x00:0x7F
+        # Test from both UTF-8 state and ascii state
+        @test dfa_step(Base._UTF8_DFA_ACCEPT, b) == Base._UTF8_DFA_ACCEPT
+        @test dfa_step(Base._UTF8_DFA_ASCII, b) == Base._UTF8_DFA_ASCII
+    end
+
+    # Check the remaining first bytes
+    for b = 0x80:0xFF
+        if b ∈ invalid_first_bytes
+            @test dfa_step(Base._UTF8_DFA_ACCEPT, b) == Base._UTF8_DFA_INVALID
+            @test dfa_step(Base._UTF8_DFA_ASCII, b) == Base._UTF8_DFA_INVALID
+        else
+            @test dfa_step(Base._UTF8_DFA_ACCEPT, b) != Base._UTF8_DFA_INVALID
+            @test dfa_step(Base._UTF8_DFA_ASCII, b) != Base._UTF8_DFA_INVALID
+        end
+    end
+
+    # Check the two, three and four byte sequences. The same checks apply to every row,
+    # whatever its length, so all multi-byte rows are handled by one loop.
+    for table_row in table_rows[2:end]
+        # State reached after the first byte; it must not depend on the start state
+        state = dfa_step(Base._UTF8_DFA_ACCEPT, first(table_row[1]))
+        @test state == dfa_step(Base._UTF8_DFA_ASCII, first(table_row[1]))
+        # Prove that all the first bytes in a row give same state
+        for b in table_row[1]
+            @test state == dfa_step(Base._UTF8_DFA_ACCEPT, b)
+            @test state == dfa_step(Base._UTF8_DFA_ASCII, b)
+        end
+
+        # Walk through the continuation positions of the row
+        for position in 2:length(table_row)
+            valid_bytes = table_row[position]
+            # The last byte must complete the character; earlier positions only need
+            # every valid byte to lead to one common state.
+            expected = if position == length(table_row)
+                Base._UTF8_DFA_ACCEPT
+            else
+                dfa_step(state, first(valid_bytes))
+            end
+            # Prove that all valid bytes at this position return the expected state
+            for b in valid_bytes
+                @test expected == dfa_step(state, b)
+            end
+            # Prove that every other byte at this position is rejected
+            for b in setdiff(0x00:0xFF, valid_bytes)
+                @test Base._UTF8_DFA_INVALID == dfa_step(state, b)
+            end
+            state = expected
+        end
+    end
+end