From a82388019c940668c33ba187d675f97ac29f1768 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Mon, 12 Dec 2022 16:52:20 -0500
Subject: [PATCH 01/34] Working Native UTF-8 Validation

---
 base/strings/string.jl | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index ac1403f01a4a1..cd1c11a3543c0 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -191,13 +191,47 @@ end
 end
 
 ## checking UTF-8 & ACSII validity ##
+# This table is used by the shift base DFA validation for UTF-8
+const dfa_table = [      
+       fill(UInt64(384), 128);
+       fill(UInt64(6860978528477184), 16);
+       fill(UInt64(107228354863104), 16);
+       fill(UInt64(107202588205056), 32);
+       fill(UInt64(0), 2);
+       fill(UInt64(768), 30);
+       fill(UInt64(1152), 1);
+       fill(UInt64(1536), 12);
+       fill(UInt64(1920), 1);
+       fill(UInt64(1536), 2);
+       fill(UInt64(2304), 1);
+       fill(UInt64(2688), 3);
+       fill(UInt64(3072), 1);
+       fill(UInt64(0), 11)
+]::Vector{UInt64}
+
+# This is a shift based utf-8 DFA
+function _isvalid_utf8(bytes::Vector{UInt8})
+    f(byte) = @inbounds dfa_table[byte]
+    op(state, byte_dfa) = byte_dfa >> (state & UInt64(63))
+    return mapfoldl(f, op, bytes, init=UInt64(6)) == UInt64(6)
+end
+
+_isvalid_utf8(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
 
-byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
-    ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
+#Classifcations of string
     # 0: neither valid ASCII nor UTF-8
     # 1: valid ASCII
     # 2: valid UTF-8
+function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; ascii_fasttrack = true )
+    bytes = unsafe_wrap(Vector{UInt8}, s)
+    ascii_fasttrack && all(c -> iszero(c & 0x80), bytes) && return 1
+    valid = _isvalid_utf8(bytes)
+    return ifelse(valid, 2, 0)
+end
 
+# The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that 
+# the benefit doesn't show up.   It would also remove the ascii fast track that is faster for inputs that are all ascii
+# isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) =  _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
 isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0
 isvalid(s::String) = isvalid(String, s)
 

From 92a72d72ba5ade0494d7e48a46f37f994a671e3a Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Mon, 12 Dec 2022 17:28:45 -0500
Subject: [PATCH 02/34] Comment fix

---
 base/strings/string.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index cd1c11a3543c0..0c392dd739b4f 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -218,7 +218,7 @@ end
 
 _isvalid_utf8(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
 
-#Classifcations of string
+# Classifcations of string
     # 0: neither valid ASCII nor UTF-8
     # 1: valid ASCII
     # 2: valid UTF-8

From 34a937d8e6a1d98ce30463d4787bb3d0c84b65bf Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 13 Dec 2022 10:13:46 -0500
Subject: [PATCH 03/34] Appears working

---
 base/strings/string.jl | 48 ++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 0c392dd739b4f..4bfec6eed53c6 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -98,6 +98,7 @@ String(s::AbstractString) = print_to_string(s)
 @assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
 
 unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
+unsafe_wrap(::Type{Vector{UInt8}}, s::FastContiguousSubArray{UInt8,1,Vector{UInt8}}) = unsafe_wrap(Vector{UInt8}, pointer(s), size(s))
 
 Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
 Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
@@ -192,44 +193,49 @@ end
 
 ## checking UTF-8 & ACSII validity ##
 # This table is used by the shift base DFA validation for UTF-8
-const dfa_table = [      
-       fill(UInt64(384), 128);
-       fill(UInt64(6860978528477184), 16);
-       fill(UInt64(107228354863104), 16);
-       fill(UInt64(107202588205056), 32);
-       fill(UInt64(0), 2);
-       fill(UInt64(768), 30);
-       fill(UInt64(1152), 1);
-       fill(UInt64(1536), 12);
-       fill(UInt64(1920), 1);
-       fill(UInt64(1536), 2);
-       fill(UInt64(2304), 1);
-       fill(UInt64(2688), 3);
-       fill(UInt64(3072), 1);
-       fill(UInt64(0), 11)
-]::Vector{UInt64}
+const dfa_table = [
+    fill(UInt64(109802048057794944), 128);
+    fill(UInt64(113232530780455302), 16);
+    fill(UInt64(109855655693648262), 16);
+    fill(UInt64(109855649351860614), 32);
+    fill(UInt64(109802048057794950), 2);
+    fill(UInt64(109802048057794956), 30);
+    fill(UInt64(109802048057794968), 1);
+    fill(UInt64(109802048057794962), 12);
+    fill(UInt64(109802048057794974), 1);
+    fill(UInt64(109802048057794962), 2);
+    fill(UInt64(109802048057794980), 1);
+    fill(UInt64(109802048057794986), 3);
+    fill(UInt64(109802048057794992), 1);
+    fill(UInt64(109802048057794950), 11)
+    ]::Vector{UInt64}
 
 # This is a shift based utf-8 DFA
 function _isvalid_utf8(bytes::Vector{UInt8})
-    f(byte) = @inbounds dfa_table[byte]
+    f(byte) = @inbounds dfa_table[byte+1]
     op(state, byte_dfa) = byte_dfa >> (state & UInt64(63))
-    return mapfoldl(f, op, bytes, init=UInt64(6)) == UInt64(6)
+    final_state = mapfoldl(f, op, bytes, init = UInt64(0))
+    return (final_state & UInt64(63)) == UInt64(0)
 end
 
-_isvalid_utf8(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
+_isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
 
 # Classifcations of string
     # 0: neither valid ASCII nor UTF-8
     # 1: valid ASCII
     # 2: valid UTF-8
-function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; ascii_fasttrack = true )
+function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; kwargs...)
     bytes = unsafe_wrap(Vector{UInt8}, s)
+    byte_string_classify(bytes, kwargs...)
+end
+
+function byte_string_classify(bytes::Vector{UInt8}; ascii_fasttrack = true )
     ascii_fasttrack && all(c -> iszero(c & 0x80), bytes) && return 1
     valid = _isvalid_utf8(bytes)
     return ifelse(valid, 2, 0)
 end
 
-# The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that 
+# The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that
 # the benefit doesn't show up.   It would also remove the ascii fast track that is faster for inputs that are all ascii
 # isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) =  _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
 isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0

From 2affea139f26fb25db899ed512096b4f93a905fa Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 13 Dec 2022 11:45:08 -0500
Subject: [PATCH 04/34] Slight Fix and push for buildkite

---
 base/strings/string.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 4bfec6eed53c6..23e3b680100bb 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -224,7 +224,7 @@ _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
     # 0: neither valid ASCII nor UTF-8
     # 1: valid ASCII
     # 2: valid UTF-8
-function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; kwargs...)
+function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; kwargs...)
     bytes = unsafe_wrap(Vector{UInt8}, s)
     byte_string_classify(bytes, kwargs...)
 end

From 21b52aacce6a47e842c8d383f197df45c664bccd Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 13 Dec 2022 12:39:55 -0500
Subject: [PATCH 05/34] Split out state machine and add comments

---
 base/strings/string.jl | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 23e3b680100bb..845dc32045f89 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -193,7 +193,7 @@ end
 
 ## checking UTF-8 & ACSII validity ##
 # This table is used by the shift base DFA validation for UTF-8
-const dfa_table = [
+const _UTF8_DFA_TABLE = [
     fill(UInt64(109802048057794944), 128);
     fill(UInt64(113232530780455302), 16);
     fill(UInt64(109855655693648262), 16);
@@ -210,12 +210,25 @@ const dfa_table = [
     fill(UInt64(109802048057794950), 11)
     ]::Vector{UInt64}
 
-# This is a shift based utf-8 DFA
+const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string
+const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop
+
+# This function is designed so that you could use it on strings with discontinous memmory layouts
+#   by only feeding it contiguous block and keeping track of the state inbetween. 
+# Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was.
+# For a contiguous bytestream other states are valid other than _UTF8_DFA_ACCEPT aslong as you aren't
+#  at the begining or end
+function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT)
+    f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1]
+    op(s, byte_dfa) = byte_dfa >> (s & UInt64(63))
+    final_state = mapfoldl(f, op, bytes, init = state)
+    return (final_state & UInt64(63)) 
+end
+
+# This is a shift based utf-8 DFA that works on string that are a contiguous block
 function _isvalid_utf8(bytes::Vector{UInt8})
-    f(byte) = @inbounds dfa_table[byte+1]
-    op(state, byte_dfa) = byte_dfa >> (state & UInt64(63))
-    final_state = mapfoldl(f, op, bytes, init = UInt64(0))
-    return (final_state & UInt64(63)) == UInt64(0)
+    final_state = _isvalid_utf8_dfa(bytes, _UTF8_DFA_ACCEPT)
+    return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT
 end
 
 _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
@@ -237,8 +250,8 @@ end
 
 # The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that
 # the benefit doesn't show up.   It would also remove the ascii fast track that is faster for inputs that are all ascii
-# isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) =  _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
-isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0
+# isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) =  _isvalid_utf8(s)
+ isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0
 isvalid(s::String) = isvalid(String, s)
 
 is_valid_continuation(c) = c & 0xc0 == 0x80

From 57a4d2ab5ab4a58136e58b82e330cd885a565327 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 13 Dec 2022 13:23:51 -0500
Subject: [PATCH 06/34] Remove Fastpath & simplify isvalid

---
 base/strings/string.jl | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 845dc32045f89..6f6b59616cee5 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -237,21 +237,26 @@ _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
     # 0: neither valid ASCII nor UTF-8
     # 1: valid ASCII
     # 2: valid UTF-8
-function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}; kwargs...)
+function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}})
     bytes = unsafe_wrap(Vector{UInt8}, s)
     byte_string_classify(bytes, kwargs...)
 end
 
-function byte_string_classify(bytes::Vector{UInt8}; ascii_fasttrack = true )
-    ascii_fasttrack && all(c -> iszero(c & 0x80), bytes) && return 1
+function byte_string_classify(bytes::Vector{UInt8})
+    all(c -> iszero(c & 0x80), bytes) && return 1
     valid = _isvalid_utf8(bytes)
     return ifelse(valid, 2, 0)
 end
 
-# The commented line below should be faster than an impimentation using byte_string_classify but compiler optimizations make it so that
-# the benefit doesn't show up.   It would also remove the ascii fast track that is faster for inputs that are all ascii
-# isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) =  _isvalid_utf8(s)
- isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0
+function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) 
+    bytes = unsafe_wrap(Vector{UInt8}, s)
+    isvalid(String,bytes)
+end
+function isvalid(::Type{String}, bytes::Vector{UInt8}) 
+    valid = _isvalid_utf8(bytes)
+    return ifelse(valid, true, false)
+end
+
 isvalid(s::String) = isvalid(String, s)
 
 is_valid_continuation(c) = c & 0xc0 == 0x80

From 877ba93953bcf66f73b471f811118a3507d448a7 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 13 Dec 2022 18:21:46 -0500
Subject: [PATCH 07/34] Minor fixes and mega comment on methodology

---
 base/strings/string.jl | 94 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 88 insertions(+), 6 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 6f6b59616cee5..f3b62d82825cd 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -192,7 +192,92 @@ end
 end
 
 ## checking UTF-8 & ACSII validity ##
-# This table is used by the shift base DFA validation for UTF-8
+#=
+    The UTF-8 Validation is performed by a shift based DFA.
+    Using the state machine diagram found @ https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+
+        Important States
+            0 -> UTF8_ACCEPT is the start state and represents a complete UTF-8 String as well
+                        ASCII only strings will never leave this state
+            1 -> UTF8_INVALID is only reached by invalid bytes and once in this state will not
+            2 -> This is the state before the last byte of a multibyte character is read
+            9 -> Not important and not used which is why it is all ones
+                        Current State
+                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
+                0 | 0  1  1  1  1  1  1  1  1  1
+                1 | 1  1  0  2  1  2  1  3  3  1
+                2 | 2  1  1  1  1  1  1  1  1  1
+                3 | 3  1  1  1  1  1  1  1  1  1
+                4 | 5  1  1  1  1  1  1  1  1  1
+    Character   5 | 8  1  1  1  1  1  1  1  1  1
+    Class       6 | 7  1  1  1  1  1  1  1  1  1
+                7 | 1  1  0  2  2  1  3  3  1  1
+                8 | 1  1  1  1  1  1  1  1  1  1
+                9 | 1  1  0  2  1  2  3  3  1  1
+               10 | 1  1  1  1  1  1  1  1  1  1
+               11 | 6  1  1  1  1  1  1  1  1  1
+    
+    Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that
+    it contains the number of bit needed to shift the state it is transitioning to shifted into
+    the position of the current state.
+
+    Example: character class 1 is encoded in below
+                    Current State        |    9 |    8 |    7 |    6 |    5 |    4 |    3 |    2 |    1 |    0 |
+                    Next State           |    1 |    3 |    3 |    1 |    2 |    1 |    2 |    0 |    1 |    1 |
+                    Shift required       |  6*1 |  6*3 |  6*3 |  6*1 |  6*2 |  6*1 |  6*2 |  6*0 |  6*1 |  6*1 |
+                                         |    6 |   18 |   18 |    6 |   12 |    6 |   12 |    0 |    6 |    6 |
+    UInt64(113232530780455302) =   0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110
+
+    Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30
+    so when the next character class is 7 row is in row::UInt64: 
+            The reduction operation:
+                state =  byte_dfa         >>                        (state & UInt64(63))
+                        | Shift to get the next state shift  | Mask first 6 bits of starting state to get the current shift ie 30
+            Would result in the state being 2 which is a shift of 12:
+                state = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 >> 30
+                state = 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100|
+
+    The code below will create the _UTF8_DFA_TABLE to be pasted in source.
+    It is included here in an effort to document a contrived process.
+    Do Not Uncomment the code below in this file it should be pasted into REPL
+
+            function build_utf8_validation_statemachine_table(; num_classes=12, num_states=10, bit_per_state = 6)
+
+                # class_repeats represents the 256 byte's classes by storing the (class, #of repeats)
+                class_repeats = [ (0, 128), (1, 16), (9, 16), (7, 32), (8, 2), (2, 30), (10, 1),
+                                (3,  12), (4,  1), (3,  2), (11, 1), (6, 3), (5,  1), (8, 11)]
+
+                # See discription above
+                state_arrays = [[ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 1, 1, 0, 2, 1, 2, 1, 3, 3, 1],
+                                [ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 7, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1],
+                                [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1],
+                                [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+                #This converts the state_arrays into the shift encoded UInt64
+                class_row = zeros(UInt64, num_classes)
+                for i = 1:num_classes
+                    row = UInt64(0)
+                    for j in 1:num_states
+                        to_shift = UInt8((state_arrays[i][j]) * bit_per_state)
+                        row = row | (UInt64(to_shift) << ((j - 1) * bit_per_state))
+                    end
+                    class_row[i]=row
+                end
+                print("\nconst _UTF8_DFA_TABLE = [\n")
+                for (class, repeats) in class_repeats
+                    print("    fill(UInt64($(class_row[class+1])), $repeats);\n")
+                end
+                print("    ]\n")
+            end
+=#
+# This table will be filled with 256 UInt64 representing the DFA transitions for all bytes
 const _UTF8_DFA_TABLE = [
     fill(UInt64(109802048057794944), 128);
     fill(UInt64(113232530780455302), 16);
@@ -208,7 +293,7 @@ const _UTF8_DFA_TABLE = [
     fill(UInt64(109802048057794986), 3);
     fill(UInt64(109802048057794992), 1);
     fill(UInt64(109802048057794950), 11)
-    ]::Vector{UInt64}
+    ]
 
 const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string
 const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop
@@ -252,10 +337,7 @@ function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{
     bytes = unsafe_wrap(Vector{UInt8}, s)
     isvalid(String,bytes)
 end
-function isvalid(::Type{String}, bytes::Vector{UInt8}) 
-    valid = _isvalid_utf8(bytes)
-    return ifelse(valid, true, false)
-end
+isvalid(::Type{String}, bytes::Vector{UInt8}) = @inline _isvalid_utf8(bytes)
 
 isvalid(s::String) = isvalid(String, s)
 

From 0c4b348234fe9b4f8da693a4ed5af65f8d9fbff1 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 13 Dec 2022 18:24:59 -0500
Subject: [PATCH 08/34] Comment

---
 base/strings/string.jl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index f3b62d82825cd..ba16561415da7 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -299,10 +299,8 @@ const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of a
 const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop
 
 # This function is designed so that you could use it on strings with discontinous memmory layouts
-#   by only feeding it contiguous block and keeping track of the state inbetween. 
+# by only feeding it contiguous block and keeping track of the state inbetween. 
 # Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was.
-# For a contiguous bytestream other states are valid other than _UTF8_DFA_ACCEPT aslong as you aren't
-#  at the begining or end
 function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT)
     f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1]
     op(s, byte_dfa) = byte_dfa >> (s & UInt64(63))

From b6b25c7c6248860902608110ca38997d13cfa7d7 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 13 Dec 2022 18:56:51 -0500
Subject: [PATCH 09/34] whitespaces

---
 base/strings/string.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index ba16561415da7..a1743929b6f80 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -216,7 +216,7 @@ end
                 9 | 1  1  0  2  1  2  3  3  1  1
                10 | 1  1  1  1  1  1  1  1  1  1
                11 | 6  1  1  1  1  1  1  1  1  1
-    
+
     Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that
     it contains the number of bit needed to shift the state it is transitioning to shifted into
     the position of the current state.
@@ -229,7 +229,7 @@ end
     UInt64(113232530780455302) =   0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110
 
     Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30
-    so when the next character class is 7 row is in row::UInt64: 
+    so when the next character class is 7 row is in row::UInt64:
             The reduction operation:
                 state =  byte_dfa         >>                        (state & UInt64(63))
                         | Shift to get the next state shift  | Mask first 6 bits of starting state to get the current shift ie 30
@@ -299,13 +299,13 @@ const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of a
 const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop
 
 # This function is designed so that you could use it on strings with discontinous memmory layouts
-# by only feeding it contiguous block and keeping track of the state inbetween. 
+# by only feeding it contiguous block and keeping track of the state inbetween.
 # Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was.
 function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT)
     f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1]
     op(s, byte_dfa) = byte_dfa >> (s & UInt64(63))
     final_state = mapfoldl(f, op, bytes, init = state)
-    return (final_state & UInt64(63)) 
+    return (final_state & UInt64(63))
 end
 
 # This is a shift based utf-8 DFA that works on string that are a contiguous block
@@ -331,7 +331,7 @@ function byte_string_classify(bytes::Vector{UInt8})
     return ifelse(valid, 2, 0)
 end
 
-function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) 
+function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String})
     bytes = unsafe_wrap(Vector{UInt8}, s)
     isvalid(String,bytes)
 end

From a019b68bd9c053271c4fd45ef8094bbe2cc78815 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Wed, 14 Dec 2022 09:40:22 -0500
Subject: [PATCH 10/34] Additional state comments

---
 base/strings/string.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index a1743929b6f80..958d2eca13167 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -196,11 +196,13 @@ end
     The UTF-8 Validation is performed by a shift based DFA.
     Using the state machine diagram found @ https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 
-        Important States
+        Validation States
             0 -> UTF8_ACCEPT is the start state and represents a complete UTF-8 String as well
                         ASCII only strings will never leave this state
             1 -> UTF8_INVALID is only reached by invalid bytes and once in this state will not
-            2 -> This is the state before the last byte of a multibyte character is read
+            2 -> One valid continuation byte needed to return to state 0
+        3,4,5 -> Two valid continuation bytes needed to return to state 0
+        6,7,8 -> Three valids continuation bytes needed to return to state 0
             9 -> Not important and not used which is why it is all ones
                         Current State
                     0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲

From 5d808260a8e500ba5dba52c7029ed75da28bd15c Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Sun, 18 Dec 2022 17:11:54 -0500
Subject: [PATCH 11/34] Fix Comment

---
 base/strings/string.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 958d2eca13167..8429642e17dca 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -211,12 +211,12 @@ end
                 2 | 2  1  1  1  1  1  1  1  1  1
                 3 | 3  1  1  1  1  1  1  1  1  1
                 4 | 5  1  1  1  1  1  1  1  1  1
-    Character   5 | 8  1  1  1  1  1  1  1  1  1
+    Character   5 | 8  1  1  1  1  1  1  1  1  1     <- Next State
     Class       6 | 7  1  1  1  1  1  1  1  1  1
                 7 | 1  1  0  2  2  1  3  3  1  1
                 8 | 1  1  1  1  1  1  1  1  1  1
                 9 | 1  1  0  2  1  2  3  3  1  1
-               10 | 1  1  1  1  1  1  1  1  1  1
+               10 | 4  1  1  1  1  1  1  1  1  1
                11 | 6  1  1  1  1  1  1  1  1  1
 
     Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that
@@ -260,7 +260,7 @@ end
                                 [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1],
                                 [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                                 [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1],
-                                [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                                [ 4, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                                 [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
                 #This converts the state_arrays into the shift encoded UInt64
                 class_row = zeros(UInt64, num_classes)

From ffd8a1a224930ec6bdeb313e21b6abbfaccc8f10 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Sun, 18 Dec 2022 18:07:11 -0500
Subject: [PATCH 12/34] Change table definition to let block

---
 base/strings/string.jl | 68 ++++++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 8429642e17dca..a951f53e7f611 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -280,22 +280,58 @@ end
             end
 =#
 # This table will be filled with 256 UInt64 representing the DFA transitions for all bytes
-const _UTF8_DFA_TABLE = [
-    fill(UInt64(109802048057794944), 128);
-    fill(UInt64(113232530780455302), 16);
-    fill(UInt64(109855655693648262), 16);
-    fill(UInt64(109855649351860614), 32);
-    fill(UInt64(109802048057794950), 2);
-    fill(UInt64(109802048057794956), 30);
-    fill(UInt64(109802048057794968), 1);
-    fill(UInt64(109802048057794962), 12);
-    fill(UInt64(109802048057794974), 1);
-    fill(UInt64(109802048057794962), 2);
-    fill(UInt64(109802048057794980), 1);
-    fill(UInt64(109802048057794986), 3);
-    fill(UInt64(109802048057794992), 1);
-    fill(UInt64(109802048057794950), 11)
-    ]
+const _UTF8_DFA_TABLE = let
+    num_classes=12
+    num_states=10
+    bit_per_state = 6
+    # class_repeats represents the 256 byte's classes by storing the (class, #of repeats)
+    class_repeats = [ (0, 128), (1, 16), (9, 16), (7, 32), (8, 2), (2, 30), (10, 1),
+                    (3,  12), (4,  1), (3,  2), (11, 1), (6, 3), (5,  1), (8, 11)]
+
+    # These are the rows discussed in comments above
+    state_arrays = [[ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [ 1, 1, 0, 2, 1, 2, 1, 3, 3, 1],
+                    [ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [ 7, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1],
+                    [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1],
+                    [ 4, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+
+    #This converts the state_arrays into the shift encoded UInt64
+    class_row = zeros(UInt64, num_classes)
+    for i = 1:num_classes
+        row = UInt64(0)
+        for j in 1:num_states
+            #Calculate the shift required for the next state
+            to_shift = UInt8((state_arrays[i][j]) * bit_per_state)
+            #Shift the next state into the position of the current state
+            row = row | (UInt64(to_shift) << ((j - 1) * bit_per_state))
+        end
+        class_row[i]=row
+    end
+    mapreduce(t->fill(class_row[t[1]+1],t[2]),vcat,class_repeats)
+end
+# const _UTF8_DFA_TABLE = [
+#     fill(UInt64(109802048057794944), 128);
+#     fill(UInt64(113232530780455302), 16);
+#     fill(UInt64(109855655693648262), 16);
+#     fill(UInt64(109855649351860614), 32);
+#     fill(UInt64(109802048057794950), 2);
+#     fill(UInt64(109802048057794956), 30);
+#     fill(UInt64(109802048057794968), 1);
+#     fill(UInt64(109802048057794962), 12);
+#     fill(UInt64(109802048057794974), 1);
+#     fill(UInt64(109802048057794962), 2);
+#     fill(UInt64(109802048057794980), 1);
+#     fill(UInt64(109802048057794986), 3);
+#     fill(UInt64(109802048057794992), 1);
+#     fill(UInt64(109802048057794950), 11)
+#     ]
 
 const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string
 const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop

From 4530d895759773d8600072b7b987c5e1eb326bf0 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Sun, 18 Dec 2022 22:26:00 -0500
Subject: [PATCH 13/34] Build table with let block

---
 base/strings/string.jl | 62 +++---------------------------------------
 1 file changed, 4 insertions(+), 58 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index a951f53e7f611..edb30d458e9ba 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -238,49 +238,10 @@ end
             Would result in the state being 2 which is a shift of 12:
                 state = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 >> 30
                 state = 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100|
-
-    The code below will create the _UTF8_DFA_TABLE to be pasted in source.
-    It is included here in an effort to document a contrived process.
-    Do Not Uncomment the code below in this file it should be pasted into REPL
-
-            function build_utf8_validation_statemachine_table(; num_classes=12, num_states=10, bit_per_state = 6)
-
-                # class_repeats represents the 256 byte's classes by storing the (class, #of repeats)
-                class_repeats = [ (0, 128), (1, 16), (9, 16), (7, 32), (8, 2), (2, 30), (10, 1),
-                                (3,  12), (4,  1), (3,  2), (11, 1), (6, 3), (5,  1), (8, 11)]
-
-                # See discription above
-                state_arrays = [[ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                                [ 1, 1, 0, 2, 1, 2, 1, 3, 3, 1],
-                                [ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                                [ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                                [ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                                [ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                                [ 7, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                                [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1],
-                                [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                                [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1],
-                                [ 4, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                                [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
-                #This converts the state_arrays into the shift encoded UInt64
-                class_row = zeros(UInt64, num_classes)
-                for i = 1:num_classes
-                    row = UInt64(0)
-                    for j in 1:num_states
-                        to_shift = UInt8((state_arrays[i][j]) * bit_per_state)
-                        row = row | (UInt64(to_shift) << ((j - 1) * bit_per_state))
-                    end
-                    class_row[i]=row
-                end
-                print("\nconst _UTF8_DFA_TABLE = [\n")
-                for (class, repeats) in class_repeats
-                    print("    fill(UInt64($(class_row[class+1])), $repeats);\n")
-                end
-                print("    ]\n")
-            end
 =#
-# This table will be filled with 256 UInt64 representing the DFA transitions for all bytes
-const _UTF8_DFA_TABLE = let
+
+# Fill the table with 256 UInt64 representing the DFA transitions for all bytes
+const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
     num_classes=12
     num_states=10
     bit_per_state = 6
@@ -316,22 +277,7 @@ const _UTF8_DFA_TABLE = let
     end
     mapreduce(t->fill(class_row[t[1]+1],t[2]),vcat,class_repeats)
 end
-# const _UTF8_DFA_TABLE = [
-#     fill(UInt64(109802048057794944), 128);
-#     fill(UInt64(113232530780455302), 16);
-#     fill(UInt64(109855655693648262), 16);
-#     fill(UInt64(109855649351860614), 32);
-#     fill(UInt64(109802048057794950), 2);
-#     fill(UInt64(109802048057794956), 30);
-#     fill(UInt64(109802048057794968), 1);
-#     fill(UInt64(109802048057794962), 12);
-#     fill(UInt64(109802048057794974), 1);
-#     fill(UInt64(109802048057794962), 2);
-#     fill(UInt64(109802048057794980), 1);
-#     fill(UInt64(109802048057794986), 3);
-#     fill(UInt64(109802048057794992), 1);
-#     fill(UInt64(109802048057794950), 11)
-#     ]
+
 
 const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string
 const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop

From a0f6a1c36e31ca557f3a3632dfd0ae30fa208f16 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Fri, 6 Jan 2023 17:04:14 -0500
Subject: [PATCH 14/34] @stevenjg recommendations

---
 base/strings/string.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index edb30d458e9ba..69f93edacd7f8 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -298,7 +298,8 @@ function _isvalid_utf8(bytes::Vector{UInt8})
     return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT
 end
 
-_isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) = _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
+_isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
+    GC.@preserve s _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
 
 # Classifcations of string
     # 0: neither valid ASCII nor UTF-8
@@ -306,7 +307,7 @@ _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
     # 2: valid UTF-8
 function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}})
     bytes = unsafe_wrap(Vector{UInt8}, s)
-    byte_string_classify(bytes, kwargs...)
+    GC.@preserve s byte_string_classify(unsafe_wrap(Vector{UInt8}, s))
 end
 
 function byte_string_classify(bytes::Vector{UInt8})
@@ -316,8 +317,7 @@ function byte_string_classify(bytes::Vector{UInt8})
 end
 
 function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String})
-    bytes = unsafe_wrap(Vector{UInt8}, s)
-    isvalid(String,bytes)
+    GC.@preserve s isvalid(String,unsafe_wrap(Vector{UInt8}, s))
 end
 isvalid(::Type{String}, bytes::Vector{UInt8}) = @inline _isvalid_utf8(bytes)
 

From 24d45d40271c6aab1693a361ab12c2c98462516e Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Fri, 6 Jan 2023 17:38:15 -0500
Subject: [PATCH 15/34] fix

---
 base/strings/string.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 69f93edacd7f8..9d98c5ed8632f 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -306,7 +306,6 @@ _isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
     # 1: valid ASCII
     # 2: valid UTF-8
 function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}})
-    bytes = unsafe_wrap(Vector{UInt8}, s)
     GC.@preserve s byte_string_classify(unsafe_wrap(Vector{UInt8}, s))
 end
 

From b11c1ef41cfb4c838a7e3661d04186f25cde7ca7 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Mon, 9 Jan 2023 16:35:48 -0500
Subject: [PATCH 16/34] Switch to AbstractVector

---
 base/strings/string.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 9d98c5ed8632f..92436ee62d04f 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -285,7 +285,7 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state
 # This function is designed so that you could use it on strings with discontinous memmory layouts
 # by only feeding it contiguous block and keeping track of the state inbetween.
 # Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was.
-function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT)
+@propagate_inbounds function _isvalid_utf8_dfa(bytes::AbstractVector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT)
     f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1]
     op(s, byte_dfa) = byte_dfa >> (s & UInt64(63))
     final_state = mapfoldl(f, op, bytes, init = state)
@@ -293,7 +293,7 @@ function _isvalid_utf8_dfa(bytes::Vector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT
 end
 
 # This is a shift based utf-8 DFA that works on string that are a contiguous block
-function _isvalid_utf8(bytes::Vector{UInt8})
+function _isvalid_utf8(bytes::AbstractVector{UInt8})
     final_state = _isvalid_utf8_dfa(bytes, _UTF8_DFA_ACCEPT)
     return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT
 end

From 54d49fb26c83948122a0315d2cb2b572f662cd83 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Mon, 9 Jan 2023 18:58:47 -0500
Subject: [PATCH 17/34] Switch to codeunits

---
 base/strings/string.jl    | 15 +++++----------
 base/strings/substring.jl |  8 ++++----
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 92436ee62d04f..6aa47f189e788 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -298,16 +298,14 @@ function _isvalid_utf8(bytes::AbstractVector{UInt8})
     return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT
 end
 
-_isvalid_utf8(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
-    GC.@preserve s _isvalid_utf8(unsafe_wrap(Vector{UInt8}, s))
+_isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s))
 
 # Classifcations of string
     # 0: neither valid ASCII nor UTF-8
     # 1: valid ASCII
     # 2: valid UTF-8
-function byte_string_classify(s::Union{String,FastContiguousSubArray{UInt8,1,Vector{UInt8}}})
-    GC.@preserve s byte_string_classify(unsafe_wrap(Vector{UInt8}, s))
-end
+ byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
+
 
 function byte_string_classify(bytes::Vector{UInt8})
     all(c -> iszero(c & 0x80), bytes) && return 1
@@ -315,12 +313,9 @@ function byte_string_classify(bytes::Vector{UInt8})
     return ifelse(valid, 2, 0)
 end
 
-function isvalid(::Type{String}, s::Union{FastContiguousSubArray{UInt8,1,Vector{UInt8}},String})
-    GC.@preserve s isvalid(String,unsafe_wrap(Vector{UInt8}, s))
-end
-isvalid(::Type{String}, bytes::Vector{UInt8}) = @inline _isvalid_utf8(bytes)
+isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes)
 
-isvalid(s::String) = isvalid(String, s)
+isvalid(s::AbstractString) = isvalid(String, codeunits(s))
 
 is_valid_continuation(c) = c & 0xc0 == 0x80
 
diff --git a/base/strings/substring.jl b/base/strings/substring.jl
index ea132402447be..68a0e7c7c4165 100644
--- a/base/strings/substring.jl
+++ b/base/strings/substring.jl
@@ -100,11 +100,11 @@ function isvalid(s::SubString, i::Integer)
     @inbounds return ib && isvalid(s.string, s.offset + i)::Bool
 end
 
-byte_string_classify(s::SubString{String}) =
-    ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
+# byte_string_classify(s::SubString{String}) =
+#     ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
 
-isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0
-isvalid(s::SubString{String}) = isvalid(String, s)
+# isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0
+# isvalid(s::SubString{String}) = isvalid(String, s)
 
 thisind(s::SubString{String}, i::Int) = _thisind_str(s, i)
 nextind(s::SubString{String}, i::Int) = _nextind_str(s, i)

From f853b463771e7de09f47e9d26983631ea0e094a9 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Mon, 9 Jan 2023 19:29:03 -0500
Subject: [PATCH 18/34] Fix

---
 base/strings/string.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 6aa47f189e788..7f8a15cba5622 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -314,6 +314,7 @@ function byte_string_classify(bytes::Vector{UInt8})
 end
 
 isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes)
+isvalid(::Type{String}, s::AbstractString) = @inline _isvalid_utf8(codeunits(s))
 
 isvalid(s::AbstractString) = isvalid(String, codeunits(s))
 

From be9802209e1765728bdd2508d2b7293af4dbd70f Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 10 Jan 2023 16:25:32 -0500
Subject: [PATCH 19/34] Change order of operations

---
 base/strings/string.jl | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 7f8a15cba5622..dd26dcb504383 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -282,21 +282,18 @@ end
 const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string
 const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop
 
-# This function is designed so that you could use it on strings with discontinous memmory layouts
-# by only feeding it contiguous block and keeping track of the state inbetween.
-# Furthermore you could check in returned value is _UTF8_DFA_INVALID and stop as invalid if it was.
-@propagate_inbounds function _isvalid_utf8_dfa(bytes::AbstractVector{UInt8},state::UInt64 = _UTF8_DFA_ACCEPT)
-    f(byte) = @inbounds _UTF8_DFA_TABLE[byte+1]
-    op(s, byte_dfa) = byte_dfa >> (s & UInt64(63))
-    final_state = mapfoldl(f, op, bytes, init = state)
-    return (final_state & UInt64(63))
+# The dfa step is broken out so that it may be used in other functions
+@inline _utf_dfa_step(state::UInt64, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & UInt64(63)
+
+@inline function _isvalid_utf8_dfa(state::UInt64, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes))
+    for i = first:last
+       @inbounds state = _utf_dfa_step(state, bytes[i])
+   end
+   return (state)
 end
 
 # This is a shift based utf-8 DFA that works on string that are a contiguous block
-function _isvalid_utf8(bytes::AbstractVector{UInt8})
-    final_state = _isvalid_utf8_dfa(bytes, _UTF8_DFA_ACCEPT)
-    return (final_state & UInt64(63)) == _UTF8_DFA_ACCEPT
-end
+_isvalid_utf8(bytes::AbstractVector{UInt8}) = isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT
 
 _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s))
 

From d54656dc76c656111c2a2b96973863d2044fa671 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 10 Jan 2023 17:00:11 -0500
Subject: [PATCH 20/34] fix

---
 base/strings/string.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index dd26dcb504383..ff2784e62db17 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -293,7 +293,7 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state
 end
 
 # This is a shift based utf-8 DFA that works on string that are a contiguous block
-_isvalid_utf8(bytes::AbstractVector{UInt8}) = isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT
+_isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT
 
 _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s))
 

From c40daff66743855dd30bbc86dc3569daa3a87f48 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 10 Jan 2023 18:15:37 -0500
Subject: [PATCH 21/34] Add inlining & fix comments

---
 base/strings/string.jl | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index ff2784e62db17..3f2696ee25a5e 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -223,7 +223,7 @@ end
     it contains the number of bit needed to shift the state it is transitioning to shifted into
     the position of the current state.
 
-    Example: character class 1 is encoded in below
+    Example: character class 1 is encoded as below
                     Current State        |    9 |    8 |    7 |    6 |    5 |    4 |    3 |    2 |    1 |    0 |
                     Next State           |    1 |    3 |    3 |    1 |    2 |    1 |    2 |    0 |    1 |    1 |
                     Shift required       |  6*1 |  6*3 |  6*3 |  6*1 |  6*2 |  6*1 |  6*2 |  6*0 |  6*1 |  6*1 |
@@ -233,11 +233,12 @@ end
     Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30
     so when the next character class is 7 row is in row::UInt64:
             The reduction operation:
-                state =  byte_dfa         >>                        (state & UInt64(63))
-                        | Shift to get the next state shift  | Mask first 6 bits of starting state to get the current shift ie 30
+                state =  (   byte_dfa >>  state )            & UInt64(63)
+                        | Shift to get the next state shift  | Mask the first six bits so that the new state is represended by the shift
             Would result in the state being 2 which is a shift of 12:
-                state = 0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 >> 30
-                state = 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100|
+                (byte_dfa    =  0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 
+                >> 30    )   => 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100
+                & UInt64(63) => 0b0000|000000|000000|000000|000000|000000|000000|000000|000000|000000|001100
 =#
 
 # Fill the table with 256 UInt64 representing the DFA transitions for all bytes
@@ -293,9 +294,9 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state
 end
 
 # This is a shift based utf-8 DFA that works on string that are a contiguous block
-_isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT
+@inline _isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT
 
-_isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s))
+@inline _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s))
 
 # Classifcations of string
     # 0: neither valid ASCII nor UTF-8

From 394c4fa96410438ceee669f4e248ebf330f6db7b Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 10 Jan 2023 18:37:23 -0500
Subject: [PATCH 22/34] Agressive inlining

---
 base/strings/string.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 3f2696ee25a5e..5595420ed8373 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -314,7 +314,7 @@ end
 isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes)
 isvalid(::Type{String}, s::AbstractString) = @inline _isvalid_utf8(codeunits(s))
 
-isvalid(s::AbstractString) = isvalid(String, codeunits(s))
+@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
 
 is_valid_continuation(c) = c & 0xc0 == 0x80
 

From a0cdd13df79a7488cda57b3c777b7e7f7915ced4 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Wed, 11 Jan 2023 07:49:37 -0500
Subject: [PATCH 23/34] Whitespace

---
 base/strings/string.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 5595420ed8373..9f67616f243a5 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -236,7 +236,7 @@ end
                 state =  (   byte_dfa >>  state )            & UInt64(63)
                         | Shift to get the next state shift  | Mask the first six bits so that the new state is represended by the shift
             Would result in the state being 2 which is a shift of 12:
-                (byte_dfa    =  0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110 
+                (byte_dfa    =  0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110
                 >> 30    )   => 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100
                 & UInt64(63) => 0b0000|000000|000000|000000|000000|000000|000000|000000|000000|000000|001100
 =#

From a2691af06ad22651baea7f9e98616c44db0ec837 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Wed, 11 Jan 2023 14:04:52 -0500
Subject: [PATCH 24/34] Remove Commented Code

---
 base/strings/substring.jl | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/base/strings/substring.jl b/base/strings/substring.jl
index 68a0e7c7c4165..5ba08ac2f7fff 100644
--- a/base/strings/substring.jl
+++ b/base/strings/substring.jl
@@ -100,12 +100,6 @@ function isvalid(s::SubString, i::Integer)
     @inbounds return ib && isvalid(s.string, s.offset + i)::Bool
 end
 
-# byte_string_classify(s::SubString{String}) =
-#     ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
-
-# isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0
-# isvalid(s::SubString{String}) = isvalid(String, s)
-
 thisind(s::SubString{String}, i::Int) = _thisind_str(s, i)
 nextind(s::SubString{String}, i::Int) = _nextind_str(s, i)
 

From 7dc6a685611b372c339f8dc16ab709340433c24f Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Thu, 12 Jan 2023 12:57:44 -0500
Subject: [PATCH 25/34] Fix Comments

---
 base/strings/string.jl | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 9f67616f243a5..d5fde47f4c046 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -199,11 +199,12 @@ end
         Validation States
             0 -> UTF8_ACCEPT is the start state and represents a complete UTF-8 String as well
                         ASCII only strings will never leave this state
-            1 -> UTF8_INVALID is only reached by invalid bytes and once in this state will not
+            1 -> UTF8_INVALID is only reached by invalid bytes and once in this state it will not change
+                    as seen by all 1s in that column of table below
             2 -> One valid continuation byte needed to return to state 0
         3,4,5 -> Two valid continuation bytes needed to return to state 0
         6,7,8 -> Three valids continuation bytes needed to return to state 0
-            9 -> Not important and not used which is why it is all ones
+            9 -> Not used which is why it always transitions to state 1
                         Current State
                     0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
                 0 | 0  1  1  1  1  1  1  1  1  1
@@ -289,8 +290,8 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state
 @inline function _isvalid_utf8_dfa(state::UInt64, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes))
     for i = first:last
        @inbounds state = _utf_dfa_step(state, bytes[i])
-   end
-   return (state)
+    end
+    return (state)
 end
 
 # This is a shift based utf-8 DFA that works on string that are a contiguous block

From b4465972e687d6f960cfbda1cf5dceb36b3c3b3c Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Sat, 4 Feb 2023 13:39:29 -0500
Subject: [PATCH 26/34] Changed DFA to track isascii & added state diagram

---
 base/strings/string.jl | 134 ++++++++++++++++++++++++++---------------
 1 file changed, 86 insertions(+), 48 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index d5fde47f4c046..b1496628b597a 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -194,31 +194,53 @@ end
 ## checking UTF-8 & ACSII validity ##
 #=
     The UTF-8 Validation is performed by a shift based DFA.
-    Using the state machine diagram found @ https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+    ┌───────────────────────────────────────────────────────────────────┐
+    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
+    │                               ├────────3────────┐           │     │
+    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
+    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
+    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
+    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
+    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
+    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
+    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
+    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
+    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
+    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
+    │                      │        │     ├─┤               │        │  │
+    │                      │        └─4──►│6├─────1,9───────┘        │  │
+    │          INVALID     │              └─┘                        │  │
+    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
+    │          ┌▼───┴┐                                                  │
+    │          │  2  ◄─── All undefined transitions result in state 2   │
+    │          └─────┘                                                  │
+    └───────────────────────────────────────────────────────────────────┘
 
         Validation States
-            0 -> UTF8_ACCEPT is the start state and represents a complete UTF-8 String as well
+            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
+                        If the DFA ends in this state the string is ASCII only
+            1 -> _UTF8_DFA_ACCEPT represents a complete UTF-8 String; once a non-ASCII character has been seen, trailing
                         ASCII only strings will never leave this state
-            1 -> UTF8_INVALID is only reached by invalid bytes and once in this state it will not change
+            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
                     as seen by all 1s in that column of table below
-            2 -> One valid continuation byte needed to return to state 0
-        3,4,5 -> Two valid continuation bytes needed to return to state 0
-        6,7,8 -> Three valids continuation bytes needed to return to state 0
-            9 -> Not used which is why it always transitions to state 1
+            3 -> One valid continuation byte needed to return to state 1
+        4,5,6 -> Two valid continuation bytes needed to return to state 1
+        7,8,9 -> Three valid continuation bytes needed to return to state 1
+
                         Current State
                     0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
-                0 | 0  1  1  1  1  1  1  1  1  1
-                1 | 1  1  0  2  1  2  1  3  3  1
-                2 | 2  1  1  1  1  1  1  1  1  1
-                3 | 3  1  1  1  1  1  1  1  1  1
-                4 | 5  1  1  1  1  1  1  1  1  1
-    Character   5 | 8  1  1  1  1  1  1  1  1  1     <- Next State
-    Class       6 | 7  1  1  1  1  1  1  1  1  1
-                7 | 1  1  0  2  2  1  3  3  1  1
-                8 | 1  1  1  1  1  1  1  1  1  1
-                9 | 1  1  0  2  1  2  3  3  1  1
-               10 | 4  1  1  1  1  1  1  1  1  1
-               11 | 6  1  1  1  1  1  1  1  1  1
+                0 | 0  1  2  2  2  2  2  2  2  2
+                1 | 2  2  2  1  3  2  3  2  4  4
+                2 | 3  3  2  2  2  2  2  2  2  2
+                3 | 4  4  2  2  2  2  2  2  2  2
+                4 | 6  6  2  2  2  2  2  2  2  2
+    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
+    Class       6 | 8  8  2  2  2  2  2  2  2  2
+                7 | 2  2  2  1  3  3  2  4  4  2
+                8 | 2  2  2  2  2  2  2  2  2  2
+                9 | 2  2  2  1  3  2  3  4  4  2
+               10 | 5  5  2  2  2  2  2  2  2  2
+               11 | 7  7  2  2  2  2  2  2  2  2
 
     Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that
     it contains the number of bit needed to shift the state it is transitioning to shifted into
@@ -226,19 +248,19 @@ end
 
     Example: character class 1 is encoded as below
                     Current State        |    9 |    8 |    7 |    6 |    5 |    4 |    3 |    2 |    1 |    0 |
-                    Next State           |    1 |    3 |    3 |    1 |    2 |    1 |    2 |    0 |    1 |    1 |
-                    Shift required       |  6*1 |  6*3 |  6*3 |  6*1 |  6*2 |  6*1 |  6*2 |  6*0 |  6*1 |  6*1 |
-                                         |    6 |   18 |   18 |    6 |   12 |    6 |   12 |    0 |    6 |    6 |
-    UInt64(113232530780455302) =   0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110
+                    Next State           |    4 |    4 |    2 |    3 |    2 |    3 |    1 |    2 |    2 |    2 |
+                    Shift required       |  6*4 |  6*4 |  6*2 |  6*3 |  6*2 |  6*3 |  6*1 |  6*2 |  6*2 |  6*2 |
+                                         |   24 |   24 |   12 |   18 |   12 |   18 |    6 |   12 |   12 |   12 |
+    UInt64(0x061831231218c30c) =   0b0000|011000|011000|001100|010010|001100|010010|000110|001100|001100|001100
 
     Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30
-    so when the next character class is 7 row is in row::UInt64:
+    so when the next character class is 1 the new state is obtained by the following operations:
             The reduction operation:
                 state =  (   byte_dfa >>  state )            & UInt64(63)
                         | Shift to get the next state shift  | Mask the first six bits so that the new state is represended by the shift
             Would result in the state being 2 which is a shift of 12:
-                (byte_dfa    =  0b0000|000110|010010|010010|000110|001100|000110|001100|000000|000110|000110
-                >> 30    )   => 0b0000|000000|000000|000000|000000|000000|000110|010010|010010|000110|001100
+                (byte_dfa    =  0b0000|011000|011000|001100|010010|001100|010010|000110|001100|001100|001100
+                >> 30    )   => 0b0000|000000|000000|000000|000000|000000|011000|011000|001100|010010|001100
                 & UInt64(63) => 0b0000|000000|000000|000000|000000|000000|000000|000000|000000|000000|001100
 =#
 
@@ -247,23 +269,37 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas
     num_classes=12
     num_states=10
     bit_per_state = 6
-    # class_repeats represents the 256 byte's classes by storing the (class, #of repeats)
-    class_repeats = [ (0, 128), (1, 16), (9, 16), (7, 32), (8, 2), (2, 30), (10, 1),
-                    (3,  12), (4,  1), (3,  2), (11, 1), (6, 3), (5,  1), (8, 11)]
+
+    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+                           10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
+                           11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
 
     # These are the rows discussed in comments above
-    state_arrays = [[ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                    [ 1, 1, 0, 2, 1, 2, 1, 3, 3, 1],
-                    [ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                    [ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                    [ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                    [ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                    [ 7, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                    [ 1, 1, 0, 2, 2, 1, 3, 3, 1, 1],
-                    [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                    [ 1, 1, 0, 2, 1, 2, 3, 3, 1, 1],
-                    [ 4, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                    [ 6, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+    state_arrays = [[ 0  1  2  2  2  2  2  2  2  2],
+                    [ 2  2  2  1  3  2  3  2  4  4],
+                    [ 3  3  2  2  2  2  2  2  2  2],
+                    [ 4  4  2  2  2  2  2  2  2  2],
+                    [ 6  6  2  2  2  2  2  2  2  2],
+                    [ 9  9  2  2  2  2  2  2  2  2],
+                    [ 8  8  2  2  2  2  2  2  2  2],
+                    [ 2  2  2  1  3  3  2  4  4  2],
+                    [ 2  2  2  2  2  2  2  2  2  2],
+                    [ 2  2  2  1  3  2  3  4  4  2],
+                    [ 5  5  2  2  2  2  2  2  2  2],
+                    [ 7  7  2  2  2  2  2  2  2  2]]
 
     #This converts the state_arrays into the shift encoded UInt64
     class_row = zeros(UInt64, num_classes)
@@ -277,12 +313,13 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas
         end
         class_row[i]=row
     end
-    mapreduce(t->fill(class_row[t[1]+1],t[2]),vcat,class_repeats)
+    map(c->class_row[c+1],character_classes)
 end
 
 
-const _UTF8_DFA_ACCEPT = UInt64(0) #This state represents the start and end of any valid string
-const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state just stop
+const _UTF8_DFA_ASCII = UInt64(0) #This state represents the start and end of any valid string
+const _UTF8_DFA_ACCEPT = UInt64(6) #This state represents the start and end of any valid string
+const _UTF8_DFA_INVALID = UInt64(12) # If the state machine is ever in this state just stop
 
 # The dfa step is broken out so that it may be used in other functions
 @inline _utf_dfa_step(state::UInt64, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & UInt64(63)
@@ -295,7 +332,7 @@ const _UTF8_DFA_INVALID = UInt64(6) # If the state machine is ever in this state
 end
 
 # This is a shift based utf-8 DFA that works on string that are a contiguous block
-@inline _isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ACCEPT, bytes) == _UTF8_DFA_ACCEPT
+@inline _isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ASCII, bytes) <= _UTF8_DFA_ACCEPT # <= covers _UTF8_DFA_ASCII as well
 
 @inline _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s))
 
@@ -307,9 +344,10 @@ end
 
 
 function byte_string_classify(bytes::Vector{UInt8})
-    all(c -> iszero(c & 0x80), bytes) && return 1
-    valid = _isvalid_utf8(bytes)
-    return ifelse(valid, 2, 0)
+    state = _isvalid_utf8_dfa(_UTF8_DFA_ASCII, bytes)
+    state ==  _UTF8_DFA_ASCII && return 1
+    state ==  _UTF8_DFA_ACCEPT && return 2
+    return 0
 end
 
 isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes)

From b989d0d5683ad540b84efff80b2acd6394e5995a Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Sat, 4 Feb 2023 14:25:37 -0500
Subject: [PATCH 27/34] Fix states discription

---
 base/strings/string.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index b1496628b597a..c08f94efdee1e 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -217,10 +217,9 @@ end
     └───────────────────────────────────────────────────────────────────┘
 
         Validation States
-            1 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
+            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
                         If the DFA ends in this state the string is ASCII only
-            1 -> _UTF8_DFA_ACCEPT is the start state and represents a complete UTF-8 String as well
-                        ASCII only strings will never leave this state
+            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
             2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
                     as seen by all 1s in that column of table below
             3 -> One valid continuation byte needed to return to state 0

From 27952508fce1a39096db08990032f505ac7c540f Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Mon, 6 Feb 2023 10:57:12 -0500
Subject: [PATCH 28/34] Add tests to validate DFA

---
 test/strings/basic.jl | 152 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)

diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index e1d6e9dd60491..7ce2b5fdc3b39 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -1234,3 +1234,155 @@ end
     end
     @test_throws ArgumentError Symbol("a\0a")
 end
+
+@testset "Ensure UTF-8 DFA can never leave invalid state" begin
+    for b = typemin(UInt8):typemax(UInt8)
+        @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_INVALID,[b],1,1) == Base._UTF8_DFA_INVALID
+    end
+end
+@testset "Ensure UTF-8 DFA stays in ASCII State for all ASCII" begin
+    for b = 0x00:0x7F
+        @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_ASCII
+    end
+end
+
+@testset "Validate UTF-8 DFA" begin
+    # Unicode 15
+    # Table 3-7. Well-Formed UTF-8 Byte Sequences
+
+    table_rows = [  [0x00:0x7F],
+                    [0xC2:0xDF,0x80:0xBF],
+                    [0xE0:0xE0,0xA0:0xBF,0x80:0xBF],
+                    [0xE1:0xEC,0x80:0xBF,0x80:0xBF],
+                    [0xED:0xED,0x80:0x9F,0x80:0xBF],
+                    [0xEE:0xEF,0x80:0xBF,0x80:0xBF],
+                    [0xF0:0xF0,0x90:0xBF,0x80:0xBF,0x80:0xBF],
+                    [0xF1:0xF3,0x80:0xBF,0x80:0xBF,0x80:0xBF],
+                    [0xF4:0xF4,0x80:0x8F,0x80:0xBF,0x80:0xBF]]
+    invalid_first_bytes = union(0xC0:0xC1,0xF5:0xFF,0x80:0xBF)
+
+    valid_first_bytes = union(collect(first(r) for r in table_rows)...)
+
+
+
+    # Prove that the first byte sets in the table & invalid cover all bytes
+    @test length(union(valid_first_bytes,invalid_first_bytes)) == 256
+    @test length(intersect(valid_first_bytes,invalid_first_bytes)) == 0
+
+    #Check the ASCII range
+    for b = 0x00:0x7F
+        #Test from both UTF-8 state and ascii state
+        @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) == Base._UTF8_DFA_ACCEPT
+        @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_ASCII
+    end
+
+    #Check the remaining first bytes
+    for b = 0x80:0xFF
+        if b ∈ invalid_first_bytes
+            @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) == Base._UTF8_DFA_INVALID
+            @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_INVALID
+        else
+            @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b],1,1) != Base._UTF8_DFA_INVALID
+            @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) != Base._UTF8_DFA_INVALID
+        end
+    end
+
+    # Check two byte Sequences
+    for table_row in [table_rows[2]]
+        b1 = first(table_row[1])
+        state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1)
+        state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1)
+        @test state1 == state2
+        #Prove that all the first bytes in a row give same state
+        for b1 in table_row[1]
+            @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1)
+            @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1)
+        end
+        b1 = first(table_row[1])
+        #Prove that all valid second bytes return correct state
+        for b2 = table_row[2]
+            @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state1,[b2],1,1)
+        end
+        for b2 = setdiff(0x00:0xFF,table_row[2])
+            @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1)
+        end
+    end
+
+    # Check three byte Sequences
+    for table_row in table_rows[3:6]
+        b1 = first(table_row[1])
+        state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1)
+        state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1)
+        @test state1 == state2
+        #Prove that all the first bytes in a row give same state
+        for b1 in table_row[1]
+            @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1)
+            @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1)
+        end
+
+        b1 = first(table_row[1])
+        b2 = first(table_row[2])
+        #Prove that all valid second bytes return same state
+        state2 = Base._isvalid_utf8_dfa(state1,[b2],1,1)
+        for b2 = table_row[2]
+            @test state2 == Base._isvalid_utf8_dfa(state1,[b2],1,1)
+        end
+        for b2 = setdiff(0x00:0xFF,table_row[2])
+            @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1)
+        end
+
+        b2 = first(table_row[2])
+        #Prove that all valid third bytes return correct state
+        for b3 = table_row[3]
+            @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state2,[b3],1,1)
+        end
+        for b3 = setdiff(0x00:0xFF,table_row[3])
+            @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state2,[b3],1,1)
+        end
+    end
+
+    # Check Four byte Sequences
+    for table_row in table_rows[7:9]
+        b1 = first(table_row[1])
+        state1 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1)
+        state2 = Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1)
+        @test state1 == state2
+        #Prove that all the first bytes in a row give same state
+        for b1 in table_row[1]
+            @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ACCEPT,[b1],1,1)
+            @test state1 == Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b1],1,1)
+        end
+
+        b1 = first(table_row[1])
+        b2 = first(table_row[2])
+        #Prove that all valid second bytes return same state
+        state2 = Base._isvalid_utf8_dfa(state1,[b2],1,1)
+        for b2 = table_row[2]
+            @test state2 == Base._isvalid_utf8_dfa(state1,[b2],1,1)
+        end
+        for b2 = setdiff(0x00:0xFF,table_row[2])
+            @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state1,[b2],1,1)
+        end
+
+
+        b2 = first(table_row[2])
+        b3 = first(table_row[3])
+        state3 = Base._isvalid_utf8_dfa(state2,[b3],1,1)
+        #Prove that all valid third bytes return same state
+        for b3 = table_row[3]
+            @test state3 == Base._isvalid_utf8_dfa(state2,[b3],1,1)
+        end
+        for b3 = setdiff(0x00:0xFF,table_row[3])
+            @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state2,[b3],1,1)
+        end
+
+        b3 = first(table_row[3])
+        #Prove that all valid fourth bytes return correct state
+        for b4 = table_row[4]
+            @test Base._UTF8_DFA_ACCEPT == Base._isvalid_utf8_dfa(state3,[b4],1,1)
+        end
+        for b4 = setdiff(0x00:0xFF,table_row[4])
+            @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state3,[b4],1,1)
+        end
+    end
+end
\ No newline at end of file

From bc0e662519f31eb5f121eed258b11b9d10d61b95 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Mon, 6 Feb 2023 14:39:27 -0500
Subject: [PATCH 29/34] Trailing newline

---
 test/strings/basic.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index 7ce2b5fdc3b39..602c38551f6d8 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -1240,7 +1240,7 @@ end
         @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_INVALID,[b],1,1) == Base._UTF8_DFA_INVALID
     end
 end
-@testset "Ensure UTF-8 DFA stays in ASCII State for all ASCII" begin
+@testset "Ensure  UTF-8 DFA stays in ASCII State for all ASCII" begin
     for b = 0x00:0x7F
         @test Base._isvalid_utf8_dfa(Base._UTF8_DFA_ASCII,[b],1,1) == Base._UTF8_DFA_ASCII
     end
@@ -1385,4 +1385,4 @@ end
             @test Base._UTF8_DFA_INVALID == Base._isvalid_utf8_dfa(state3,[b4],1,1)
         end
     end
-end
\ No newline at end of file
+end

From 557bda642b8db91f63305c9759470009c11e4b11 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 14 Feb 2023 11:27:20 -0500
Subject: [PATCH 30/34]  State to UInt32 & use SMTsolver derived shifts

---
 base/strings/string.jl | 63 ++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 33 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index c08f94efdee1e..dd845e5277866 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -241,34 +241,29 @@ end
                10 | 5  5  2  2  2  2  2  2  2  2
                11 | 7  7  2  2  2  2  2  2  2  2
 
-    Each character class row is encoding 10 states shift in 6 bits combined into a UInt64 such that
-    it contains the number of bit needed to shift the state it is transitioning to shifted into
-    the position of the current state.
-
-    Example: character class 1 is encoded as below
-                    Current State        |    9 |    8 |    7 |    6 |    5 |    4 |    3 |    2 |    1 |    0 |
-                    Next State           |    4 |    4 |    2 |    3 |    2 |    3 |    1 |    2 |    2 |    2 |
-                    Shift required       |  6*4 |  6*4 |  6*2 |  6*3 |  6*2 |  6*3 |  6*1 |  6*2 |  6*2 |  6*2 |
-                                         |   24 |   24 |   12 |   18 |   12 |   18 |    6 |   12 |   12 |   12 |
-    UInt64(0x061831231218c30c) =   0b0000|011000|011000|001100|010010|001100|010010|000110|001100|001100|001100
-
-    Now if the current state was 5 the state::UInt64 would have the first 6 bit representing 5*6 = 30
-    so when the next character class is 1 the new state is obtained by the following operations:
-            The reduction operation:
-                state =  (   byte_dfa >>  state )            & UInt64(63)
-                        | Shift to get the next state shift  | Mask the first six bits so that the new state is represended by the shift
-            Would result in the state being 2 which is a shift of 12:
-                (byte_dfa    =  0b0000|011000|011000|001100|010010|001100|010010|000110|001100|001100|001100
-                >> 30    )   => 0b0000|000000|000000|000000|000000|000000|011000|011000|001100|010010|001100
-                & UInt64(63) => 0b0000|000000|000000|000000|000000|000000|000000|000000|000000|000000|001100
+           Shifts | 0  4 10 14 18 24  8 20 12 26
+
+    The shifts that represent each state were derived using the SMT solver Z3, to ensure that when they are
+    encoded into the rows, every transition yields the correct shift.
+
+    Each character class row encodes 10 states with the shifts defined above. Shifting the bits of a row by
+    the current state and then masking the result with 0b11110 (0x1E) gives the shift for the new state.
+
+
 =#
 
+#State type used by UTF-8 DFA
+const _UTF8DFAState = UInt32
 # Fill the table with 256 UInt64 representing the DFA transitions for all bytes
 const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
     num_classes=12
     num_states=10
     bit_per_state = 6
 
+
+    # These shifts were derived using a SMT solver
+    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
+
     character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -283,8 +278,8 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas
                             7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                             8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-                           10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
-                           11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
+                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
+                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
 
     # These are the rows discussed in comments above
     state_arrays = [[ 0  1  2  2  2  2  2  2  2  2],
@@ -300,30 +295,32 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas
                     [ 5  5  2  2  2  2  2  2  2  2],
                     [ 7  7  2  2  2  2  2  2  2  2]]
 
-    #This converts the state_arrays into the shift encoded UInt64
-    class_row = zeros(UInt64, num_classes)
+    #This converts the state_arrays into the shift encoded _UTF8DFAState
+    class_row = zeros(_UTF8DFAState, num_classes)
+
     for i = 1:num_classes
-        row = UInt64(0)
+        row = _UTF8DFAState(0)
         for j in 1:num_states
             #Calculate the shift required for the next state
-            to_shift = UInt8((state_arrays[i][j]) * bit_per_state)
+            to_shift = UInt8((state_shifts[state_arrays[i][j]+1]) )
             #Shift the next state into the position of the current state
-            row = row | (UInt64(to_shift) << ((j - 1) * bit_per_state))
+            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
         end
         class_row[i]=row
     end
+
     map(c->class_row[c+1],character_classes)
 end
 
 
-const _UTF8_DFA_ASCII = UInt64(0) #This state represents the start and end of any valid string
-const _UTF8_DFA_ACCEPT = UInt64(6) #This state represents the start and end of any valid string
-const _UTF8_DFA_INVALID = UInt64(12) # If the state machine is ever in this state just stop
+const _UTF8_DFA_ASCII = _UTF8DFAState(0) #This is the start state; the DFA stays in it only while the string is ASCII only
+const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This is the valid complete character state once a non-ASCII UTF-8 character has been seen
+const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
 
-# The dfa step is broken out so that it may be used in other functions
-@inline _utf_dfa_step(state::UInt64, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & UInt64(63)
+# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
+@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
 
-@inline function _isvalid_utf8_dfa(state::UInt64, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes))
+@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes))
     for i = first:last
        @inbounds state = _utf_dfa_step(state, bytes[i])
     end

From aaf886371f02fc0cdb77c1f0b1020470ce6b54ef Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Tue, 14 Mar 2023 18:21:57 -0400
Subject: [PATCH 31/34] Add Chunk based byte_string_classify

---
 base/strings/string.jl | 57 +++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index dd845e5277866..607ef8f1dfdb9 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -260,7 +260,6 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas
     num_states=10
     bit_per_state = 6
 
-
     # These shifts were derived using a SMT solver
     state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
 
@@ -327,10 +326,20 @@ const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in th
     return (state)
 end
 
-# This is a shift based utf-8 DFA that works on string that are a contiguous block
-@inline _isvalid_utf8(bytes::AbstractVector{UInt8}) = _isvalid_utf8_dfa(_UTF8_DFA_ASCII, bytes) <= _UTF8_DFA_ACCEPT # <= covers _UTF8_DFA_ASCII as well
-
-@inline _isvalid_utf8(s::AbstractString) = _isvalid_utf8(codeunits(s))
+@inline function _find_nonascii_chunk(cu::AbstractVector{UInt8}, first::Int, last::Int)
+    chunk_size = 256
+    epilog_bytes = rem(last - first + 1, chunk_size)
+    start = first
+    chunk_last = last - epilog_bytes
+    start > last && return nothing
+    for start = start:chunk_size:chunk_last
+        _isascii(cu, start, start + chunk_size - 1) || return start
+    end
+    start = chunk_last + 1
+    ((start <= last) && _isascii(cu, start, last)) || return start
+    return nothing
+end
+##
 
 # Classifcations of string
     # 0: neither valid ASCII nor UTF-8
@@ -339,15 +348,39 @@ end
  byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
 
 
-function byte_string_classify(bytes::Vector{UInt8})
-    state = _isvalid_utf8_dfa(_UTF8_DFA_ASCII, bytes)
-    state ==  _UTF8_DFA_ASCII && return 1
-    state ==  _UTF8_DFA_ACCEPT && return 2
-    return 0
+function byte_string_classify(bytes::AbstractVector{UInt8})
+    n = length(bytes)
+    start = _find_nonascii_chunk(bytes,1,n)
+    isnothing(start) && return 1
+
+    return _byte_string_classify_nonascii(bytes,start,n)
+end
+
+function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
+    chunk_size = 256
+
+    start = first
+    stop = min(last,first + chunk_size - 1)
+    state = _UTF8_DFA_ACCEPT
+    while start <= last
+        # Process non ascii chunk
+        state = _isvalid_utf8_dfa(state,bytes,start,stop)
+        state == _UTF8_DFA_INVALID && return 0
+
+        start = start + chunk_size
+        stop = min(last,stop + chunk_size)
+        # try to process ascii chunks
+        while state == _UTF8_DFA_ACCEPT
+            _isascii(bytes,start,stop) || break
+            (start = start + chunk_size) <= last || break
+            stop = min(last,stop + chunk_size)
+        end
+    end
+    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
 end
 
-isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = @inline _isvalid_utf8(bytes)
-isvalid(::Type{String}, s::AbstractString) = @inline _isvalid_utf8(codeunits(s))
+isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
+isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
 
 @inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
 

From 8006d60d896b374825887e1c026df51162bfb836 Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Thu, 23 Mar 2023 12:39:14 -0400
Subject: [PATCH 32/34] Chunk based processing

---
 base/strings/string.jl | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 607ef8f1dfdb9..e75d984739571 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -326,19 +326,17 @@ const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in th
     return (state)
 end
 
-@inline function _find_nonascii_chunk(cu::AbstractVector{UInt8}, first::Int, last::Int)
-    chunk_size = 256
-    epilog_bytes = rem(last - first + 1, chunk_size)
-    start = first
-    chunk_last = last - epilog_bytes
-    start > last && return nothing
-    for start = start:chunk_size:chunk_last
-        _isascii(cu, start, start + chunk_size - 1) || return start
+@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
+    n=first
+    while n <= last - chunk_size
+        _isascii(cu,n,n+chunk_size-1) || return n
+        n += chunk_size
     end
-    start = chunk_last + 1
-    ((start <= last) && _isascii(cu, start, last)) || return start
+    n= last-chunk_size+1
+    _isascii(cu,n,last) || return n
     return nothing
 end
+
 ##
 
 # Classifcations of string
@@ -349,10 +347,16 @@ end
 
 
 function byte_string_classify(bytes::AbstractVector{UInt8})
+    chunk_size = 1024
+    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
     n = length(bytes)
-    start = _find_nonascii_chunk(bytes,1,n)
-    isnothing(start) && return 1
-
+    if n > chunk_threshold
+        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
+        isnothing(start) && return 1
+    else
+        _isascii(bytes,1,n) && return 1
+        start = 1
+    end
     return _byte_string_classify_nonascii(bytes,start,n)
 end
 
@@ -363,18 +367,18 @@ function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int
     stop = min(last,first + chunk_size - 1)
     state = _UTF8_DFA_ACCEPT
     while start <= last
-        # Process non ascii chunk
-        state = _isvalid_utf8_dfa(state,bytes,start,stop)
-        state == _UTF8_DFA_INVALID && return 0
-
-        start = start + chunk_size
-        stop = min(last,stop + chunk_size)
         # try to process ascii chunks
         while state == _UTF8_DFA_ACCEPT
             _isascii(bytes,start,stop) || break
             (start = start + chunk_size) <= last || break
             stop = min(last,stop + chunk_size)
         end
+        # Process non ascii chunk
+        state = _isvalid_utf8_dfa(state,bytes,start,stop)
+        state == _UTF8_DFA_INVALID && return 0
+
+        start = start + chunk_size
+        stop = min(last,stop + chunk_size)
     end
     return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
 end

From a6e338384757d13d42ef108b7a342566be7e8fa9 Mon Sep 17 00:00:00 2001
From: ndinsmore <45537276+ndinsmore@users.noreply.github.com>
Date: Fri, 7 Apr 2023 11:34:04 -0400
Subject: [PATCH 33/34] Update base/strings/string.jl

Co-authored-by: Steven G. Johnson <stevenj@mit.edu>
---
 base/strings/string.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index e75d984739571..a8d6907e9a78f 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -319,7 +319,7 @@ const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in th
 # The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
 @inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
 
-@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = 1, last::Int = length(bytes))
+@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
     for i = first:last
        @inbounds state = _utf_dfa_step(state, bytes[i])
     end

From d1a129c5e59cd581f0078e4bfb17557fb6fe197b Mon Sep 17 00:00:00 2001
From: Nicholas R Dinsmore <nicholas.dinsmore@gmail.com>
Date: Mon, 10 Apr 2023 11:49:26 -0400
Subject: [PATCH 34/34] Change State Arrays to a matrix

---
 base/strings/string.jl | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index e75d984739571..6040c8254b33a 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -281,18 +281,18 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas
                             11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
 
     # These are the rows discussed in comments above
-    state_arrays = [[ 0  1  2  2  2  2  2  2  2  2],
-                    [ 2  2  2  1  3  2  3  2  4  4],
-                    [ 3  3  2  2  2  2  2  2  2  2],
-                    [ 4  4  2  2  2  2  2  2  2  2],
-                    [ 6  6  2  2  2  2  2  2  2  2],
-                    [ 9  9  2  2  2  2  2  2  2  2],
-                    [ 8  8  2  2  2  2  2  2  2  2],
-                    [ 2  2  2  1  3  3  2  4  4  2],
-                    [ 2  2  2  2  2  2  2  2  2  2],
-                    [ 2  2  2  1  3  2  3  4  4  2],
-                    [ 5  5  2  2  2  2  2  2  2  2],
-                    [ 7  7  2  2  2  2  2  2  2  2]]
+    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
+                     2  2  2  1  3  2  3  2  4  4;
+                     3  3  2  2  2  2  2  2  2  2;
+                     4  4  2  2  2  2  2  2  2  2;
+                     6  6  2  2  2  2  2  2  2  2;
+                     9  9  2  2  2  2  2  2  2  2;
+                     8  8  2  2  2  2  2  2  2  2;
+                     2  2  2  1  3  3  2  4  4  2;
+                     2  2  2  2  2  2  2  2  2  2;
+                     2  2  2  1  3  2  3  4  4  2;
+                     5  5  2  2  2  2  2  2  2  2;
+                     7  7  2  2  2  2  2  2  2  2]
 
     #This converts the state_arrays into the shift encoded _UTF8DFAState
     class_row = zeros(_UTF8DFAState, num_classes)
@@ -301,7 +301,7 @@ const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute bas
         row = _UTF8DFAState(0)
         for j in 1:num_states
             #Calculate the shift required for the next state
-            to_shift = UInt8((state_shifts[state_arrays[i][j]+1]) )
+            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
             #Shift the next state into the position of the current state
             row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
         end