Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
a823880
Working Native UTF-8 Validation
ndinsmore Dec 12, 2022
92a72d7
Comment fix
ndinsmore Dec 12, 2022
34a937d
Appears working
ndinsmore Dec 13, 2022
2affea1
Slight Fix and push for buildkite
ndinsmore Dec 13, 2022
21b52aa
Spit out statemachine and added comments
ndinsmore Dec 13, 2022
57a4d2a
Remove Fastpath f& simplify isvalid
ndinsmore Dec 13, 2022
877ba93
Minor fixes and mega comment on methodolgy
ndinsmore Dec 13, 2022
0c4b348
Comment
ndinsmore Dec 13, 2022
b6b25c7
whitespaces
ndinsmore Dec 13, 2022
a019b68
Additional state comments
ndinsmore Dec 14, 2022
5d80826
Fix Comment
ndinsmore Dec 18, 2022
ffd8a1a
Change table definition to let block
ndinsmore Dec 18, 2022
4530d89
Build table with let block
ndinsmore Dec 19, 2022
a0f6a1c
@stevenjg recommendations
ndinsmore Jan 6, 2023
24d45d4
fix
ndinsmore Jan 6, 2023
b11c1ef
Switch to AbstractVector
ndinsmore Jan 9, 2023
54d49fb
Switch to codeunits
ndinsmore Jan 9, 2023
f853b46
Fix
ndinsmore Jan 10, 2023
be98022
Change order of operations
ndinsmore Jan 10, 2023
d54656d
fix
ndinsmore Jan 10, 2023
c40daff
Add inlining & fix comments
ndinsmore Jan 10, 2023
394c4fa
Agressive inlining
ndinsmore Jan 10, 2023
a0cdd13
Whitespace
ndinsmore Jan 11, 2023
a2691af
Remove Commented Code
ndinsmore Jan 11, 2023
7dc6a68
Fix Comments
ndinsmore Jan 12, 2023
b446597
Changed DFA to track isascii & added state diagram
ndinsmore Feb 4, 2023
b989d0d
Fix states discription
ndinsmore Feb 4, 2023
2795250
Add tests to validate DFA
ndinsmore Feb 6, 2023
bc0e662
Trailing newline
ndinsmore Feb 6, 2023
557bda6
State to UInt32 & use SMTsolver derived shifts
ndinsmore Feb 14, 2023
aaf8863
Add Chunk based byte_string_classify
ndinsmore Mar 14, 2023
8006d60
Chunk based processing
ndinsmore Mar 23, 2023
cd467dd
Merge branch 'master' into native_utf8_validation
oscardssmith Apr 6, 2023
a6e3383
Update base/strings/string.jl
ndinsmore Apr 7, 2023
d1a129c
Change State Arrays to a matrix
ndinsmore Apr 10, 2023
331f05d
Merge branch 'native_utf8_validation' of github.com:ndinsmore/julia i…
ndinsmore Apr 10, 2023
661ec93
Merge branch 'master' into native_utf8_validation
oscardssmith Apr 10, 2023
d456837
Merge branch 'master' into native_utf8_validation
oscardssmith Apr 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 191 additions & 4 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ String(s::AbstractString) = print_to_string(s)
# Build a `String` from a `Symbol` by reading the bytes behind the symbol's `Ptr{UInt8}`.
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))

# Obtain a `Vector{UInt8}` for a `String` through the runtime function `jl_string_to_array`.
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
# Wrap a contiguous view of a `Vector{UInt8}` as a `Vector{UInt8}` that starts at
# `pointer(s)` and has the view's `size`.
# NOTE(review): presumably no copy is made, so the parent array must outlive the result — confirm.
unsafe_wrap(::Type{Vector{UInt8}}, s::FastContiguousSubArray{UInt8,1,Vector{UInt8}}) = unsafe_wrap(Vector{UInt8}, pointer(s), size(s))

# Copy the code units of a `String` into a freshly allocated `Vector{UInt8}`.
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
# Convenience method: goes through `codeunits(s)` and the method above.
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
Expand Down Expand Up @@ -191,15 +192,201 @@ end
end

## checking UTF-8 & ASCII validity ##
#=
The UTF-8 Validation is performed by a shift based DFA.
┌───────────────────────────────────────────────────────────────────┐
│ UTF-8 DFA State Diagram ┌──────────────2──────────────┐ │
│ ├────────3────────┐ │ │
│ ┌──────────┐ │ ┌─┐ ┌▼┐ │ │
│ ASCII │ UTF-8 │ ├─5──►│9├───1────► │ │ │
│ │ │ │ ├─┤ │ │ ┌▼┐ │
│ │ ┌─0─┐ │ ├─6──►│8├─1,7,9──►4├──1,7,9──► │ │
│ ┌─0─┐ │ │ │ │ │ ├─┤ │ │ │ │ │
│ │ │ │ ┌▼───┴┐ │ ├─11─►│7├──7,9───► │ ┌───────►3├─┐ │
│ ┌▼───┴┐ │ │ │ ▼ │ └─┘ └─┘ │ │ │ │ │
│ │ 0 ├─────┘ │ 1 ├─► ──┤ │ ┌────► │ │ │
│ └─────┘ │ │ │ ┌─┐ │ │ └─┘ │ │
│ └──▲──┘ ├─10─►│5├─────7──────┘ │ │ │
│ │ │ ├─┤ │ │ │
│ │ └─4──►│6├─────1,9───────┘ │ │
│ INVALID │ └─┘ │ │
│ ┌─*─┐ └──────────────────1,7,9──────────────────┘ │
│ ┌▼───┴┐ │
│ │ 2 ◄─── All undefined transitions result in state 2 │
│ └─────┘ │
└───────────────────────────────────────────────────────────────────┘

Validation States
0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
If the DFA ends in this state the string is ASCII only
1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
as seen by all 2s in that column of the table below
3 -> One valid continuation byte needed to return to state 1
4,5,6 -> Two valid continuation bytes needed to return to state 1
7,8,9 -> Three valid continuation bytes needed to return to state 1

Current State
0̲ 1̲ 2̲ 3̲ 4̲ 5̲ 6̲ 7̲ 8̲ 9̲
0 | 0 1 2 2 2 2 2 2 2 2
1 | 2 2 2 1 3 2 3 2 4 4
2 | 3 3 2 2 2 2 2 2 2 2
3 | 4 4 2 2 2 2 2 2 2 2
4 | 6 6 2 2 2 2 2 2 2 2
Character 5 | 9 9 2 2 2 2 2 2 2 2 <- Next State
Class 6 | 8 8 2 2 2 2 2 2 2 2
7 | 2 2 2 1 3 3 2 4 4 2
8 | 2 2 2 2 2 2 2 2 2 2
9 | 2 2 2 1 3 2 3 4 4 2
10 | 5 5 2 2 2 2 2 2 2 2
11 | 7 7 2 2 2 2 2 2 2 2

Shifts | 0 4 10 14 18 24 8 20 12 26

The shifts that represent each state were derived using the SMT solver Z3, to ensure that when they are
encoded into the rows the correct shift is the result.

Each character class row encodes 10 states with the shifts defined above. Shifting the bits of a row by
the current state and then masking the result with 0b11110 (0x1E) gives the shift for the new state.


=#

# State type used by the UTF-8 DFA. A state is stored as its shift amount
# (one of the `state_shifts` below), not as its state number.
const _UTF8DFAState = UInt32

# Transition table of the UTF-8 DFA: 256 `_UTF8DFAState` (UInt32) entries, one per byte value.
# Entry `byte + 1` is the shift-encoded row of the character class that `byte` belongs to;
# shifting that row right by the current state and masking yields the next state.
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
    # Shift assigned to each state number (index = state number + 1).
    # These shifts were derived using a SMT solver
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]

    # Character class (0-based) of every byte value 0x00:0xFF, 16 bytes per line.
    character_classes = [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
                          9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
                          7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
                          7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
                          8,  8,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
                          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
                         10,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  3,  3,
                         11,  6,  6,  6,  5,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 ]

    # Next-state matrix: one row per character class, one column per current state.
    # These are the rows discussed in comments above
    state_arrays = [ 0 1 2 2 2 2 2 2 2 2;
                     2 2 2 1 3 2 3 2 4 4;
                     3 3 2 2 2 2 2 2 2 2;
                     4 4 2 2 2 2 2 2 2 2;
                     6 6 2 2 2 2 2 2 2 2;
                     9 9 2 2 2 2 2 2 2 2;
                     8 8 2 2 2 2 2 2 2 2;
                     2 2 2 1 3 3 2 4 4 2;
                     2 2 2 2 2 2 2 2 2 2;
                     2 2 2 1 3 2 3 4 4 2;
                     5 5 2 2 2 2 2 2 2 2;
                     7 7 2 2 2 2 2 2 2 2]

    # This converts the state_arrays into the shift encoded _UTF8DFAState.
    # The loop bounds come from the matrix itself so they cannot drift out of sync with it.
    class_row = zeros(_UTF8DFAState, size(state_arrays, 1))

    for i in axes(state_arrays, 1)
        row = _UTF8DFAState(0)
        for j in axes(state_arrays, 2)
            # Calculate the shift required for the next state
            to_shift = UInt8(state_shifts[state_arrays[i, j] + 1])
            # Shift the next state into the position of the current state
            row |= _UTF8DFAState(to_shift) << state_shifts[j]
        end
        class_row[i] = row
    end

    # Expand the per-class rows into a per-byte lookup table.
    map(c -> class_row[c + 1], character_classes)
end


# Named DFA states. Each value is the shift amount of the state (0, 4 and 10 are the
# shifts of states 0, 1 and 2), which is what the DFA step shifts the table row by.
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state; the DFA stays here only while every byte seen is ASCII
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # Complete-character state once a non-ASCII UTF-8 character has been seen
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop

# Advance the UTF-8 DFA by one byte and return the resulting state.
# Kept as its own function so other routines can reuse it. The row for `byte` is shifted
# right by the current state and masked; the mask 0x1E matches the state shifts used to
# build `_UTF8_DFA_TABLE`. `byte + 1` is always within 1:256, hence the `@inbounds`.
@inline function _utf_dfa_step(state::_UTF8DFAState, byte::UInt8)
    transitions = @inbounds _UTF8_DFA_TABLE[byte + 1]
    return (transitions >> state) & _UTF8DFAState(0x0000001E)
end

# Run the UTF-8 DFA over `bytes[first:last]`, beginning in `state`, and return the state
# reached after the last byte. An empty range returns `state` unchanged.
# Indexing is not bounds checked: callers must pass indices that are valid for `bytes`.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    @inbounds for idx in first:last
        state = _utf_dfa_step(state, bytes[idx])
    end
    return state
end

# Scan `cu[first:last]` in windows of `chunk_size` elements and return the start index of
# the first window for which `_isascii` is false, or `nothing` when every window passes.
#
# The remainder is checked as one last window that ends at `last`; it may overlap the
# previously checked window. The returned index is the start of a window, not necessarily
# the position of the offending element.
@inline function _find_nonascii_chunk(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    n = first
    while n <= last - chunk_size
        _isascii(cu, n, n + chunk_size - 1) || return n
        n += chunk_size
    end
    # Clamp to `first`: when the range is shorter than `chunk_size` the unclamped start
    # (`last - chunk_size + 1`) would lie before `first`, outside the requested range.
    n = max(first, last - chunk_size + 1)
    _isascii(cu, n, last) || return n
    return nothing
end

##

# Classify the bytes of `s` by calling the C routine `u8_isvalid` with a pointer to the
# data and its size in bytes. The result uses the classifications listed below.
byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
# Classifications of string
# 0: neither valid ASCII nor UTF-8
# 1: valid ASCII
# 2: valid UTF-8
# Any other string type is classified through its code units.
byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))


# Classify a byte vector: 0 = neither valid ASCII nor UTF-8, 1 = valid ASCII, 2 = valid UTF-8.
#
# Long inputs (more than 1.5 chunks of 1024 bytes) are first scanned chunk-wise for the
# first non-ASCII chunk; short inputs get a single ASCII check. Only when a non-ASCII
# region is found does the work move to `_byte_string_classify_nonascii`.
# Indices come from `firstindex`/`lastindex` rather than a hard-coded `1`/`length`, because
# the argument is an `AbstractVector` and the DFA loop downstream indexes with `@inbounds`.
function byte_string_classify(bytes::AbstractVector{UInt8})
    chunk_size = 1024
    chunk_threshold = chunk_size + (chunk_size ÷ 2)
    first_index = firstindex(bytes)
    last_index = lastindex(bytes)
    if length(bytes) > chunk_threshold
        start = _find_nonascii_chunk(chunk_size, bytes, first_index, last_index)
        # No chunk contained a non-ASCII byte.
        isnothing(start) && return 1
    else
        _isascii(bytes, first_index, last_index) && return 1
        start = first_index
    end
    return _byte_string_classify_nonascii(bytes, start, last_index)
end

# Classify `bytes[first:last]` when it is already known not to be pure ASCII.
# Returns 2 when the range is valid UTF-8 and 0 otherwise.
#
# The range is walked in windows of 256 bytes. While the DFA sits in the accept state
# (i.e. on a character boundary), windows that pass `_isascii` are skipped without
# running the DFA; every other window is fed through the DFA byte by byte.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
    chunk_size = 256

    lo = first
    hi = min(last, first + chunk_size - 1)
    state = _UTF8_DFA_ACCEPT
    while lo <= last
        # Skip ahead over windows that are entirely ASCII.
        while state == _UTF8_DFA_ACCEPT && _isascii(bytes, lo, hi)
            lo += chunk_size
            lo > last && break
            hi = min(last, hi + chunk_size)
        end
        # Run the DFA over the current window (an empty range when `lo > hi`).
        state = _isvalid_utf8_dfa(state, bytes, lo, hi)
        if state == _UTF8_DFA_INVALID
            return 0
        end

        lo += chunk_size
        hi = min(last, hi + chunk_size)
    end
    # Ending in the middle of a multi-byte sequence is not valid.
    return state == _UTF8_DFA_ACCEPT ? 2 : 0
end

# `isvalid(String, x)` is true when `byte_string_classify` returns anything other than 0.
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
isvalid(::Type{String}, s::AbstractString) = (@inline byte_string_classify(s)) ≠ 0

# Same check for the concrete argument types listed in the Union.
isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) ≠ 0
isvalid(s::String) = isvalid(String, s)
# Generic strings are validated through their code units.
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))

# Return `true` when bits 7 and 6 of `c` are `1` and `0` respectively, i.e. when a byte
# has the form `10xxxxxx` used by UTF-8 continuation bytes.
function is_valid_continuation(c)
    top_bits = c & 0xc0
    return top_bits == 0x80
end

Expand Down
6 changes: 0 additions & 6 deletions base/strings/substring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,6 @@ function isvalid(s::SubString, i::Integer)
@inbounds return ib && isvalid(s.string, s.offset + i)::Bool
end

# Classify the bytes of a `SubString{String}` by calling the C routine `u8_isvalid`
# with a pointer to the data and its size in bytes.
byte_string_classify(s::SubString{String}) =
ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))

# A substring is valid when its classification is non-zero.
isvalid(::Type{String}, s::SubString{String}) = byte_string_classify(s) ≠ 0
isvalid(s::SubString{String}) = isvalid(String, s)

# Index navigation for `SubString{String}` forwards to the `_thisind_str` / `_nextind_str` helpers.
thisind(s::SubString{String}, i::Int) = _thisind_str(s, i)
nextind(s::SubString{String}, i::Int) = _nextind_str(s, i)

Expand Down
152 changes: 152 additions & 0 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1234,3 +1234,155 @@ end
end
@test_throws ArgumentError Symbol("a\0a")
end

@testset "Ensure UTF-8 DFA can never leave invalid state" begin
    # Whatever byte follows, a DFA that is already in the invalid state must stay there.
    invalid = Base._UTF8_DFA_INVALID
    for byte in 0x00:0xFF
        @test Base._isvalid_utf8_dfa(invalid, [byte], 1, 1) == invalid
    end
end
@testset "Ensure UTF-8 DFA stays in ASCII State for all ASCII" begin
    # Starting from the ASCII state, every ASCII byte must leave the DFA in the ASCII state.
    ascii_state = Base._UTF8_DFA_ASCII
    for byte in 0x00:0x7F
        @test Base._isvalid_utf8_dfa(ascii_state, [byte], 1, 1) == ascii_state
    end
end

@testset "Validate UTF-8 DFA" begin
    # Unicode 15
    # Table 3-7. Well-Formed UTF-8 Byte Sequences
    # Each row lists, per byte position, the range of values allowed at that position.
    table_rows = [ [0x00:0x7F],
                   [0xC2:0xDF,0x80:0xBF],
                   [0xE0:0xE0,0xA0:0xBF,0x80:0xBF],
                   [0xE1:0xEC,0x80:0xBF,0x80:0xBF],
                   [0xED:0xED,0x80:0x9F,0x80:0xBF],
                   [0xEE:0xEF,0x80:0xBF,0x80:0xBF],
                   [0xF0:0xF0,0x90:0xBF,0x80:0xBF,0x80:0xBF],
                   [0xF1:0xF3,0x80:0xBF,0x80:0xBF,0x80:0xBF],
                   [0xF4:0xF4,0x80:0x8F,0x80:0xBF,0x80:0xBF]]
    invalid_first_bytes = union(0xC0:0xC1,0xF5:0xFF,0x80:0xBF)
    valid_first_bytes = union(collect(first(r) for r in table_rows)...)

    accept = Base._UTF8_DFA_ACCEPT
    ascii = Base._UTF8_DFA_ASCII
    invalid = Base._UTF8_DFA_INVALID
    # Advance the DFA by exactly one byte.
    dfa_step(state, byte) = Base._isvalid_utf8_dfa(state, [byte], 1, 1)

    # Prove that the first byte sets in the table & invalid cover all bytes
    @test length(union(valid_first_bytes,invalid_first_bytes)) == 256
    @test length(intersect(valid_first_bytes,invalid_first_bytes)) == 0

    # Check the ASCII range, from both the UTF-8 state and the ASCII state
    for b in 0x00:0x7F
        @test dfa_step(accept, b) == accept
        @test dfa_step(ascii, b) == ascii
    end

    # Check the remaining first bytes
    for b in 0x80:0xFF
        if b ∈ invalid_first_bytes
            @test dfa_step(accept, b) == invalid
            @test dfa_step(ascii, b) == invalid
        else
            @test dfa_step(accept, b) != invalid
            @test dfa_step(ascii, b) != invalid
        end
    end

    # Check the two, three and four byte sequences (every row after the ASCII row).
    for table_row in table_rows[2:end]
        lead_bytes = table_row[1]
        state = dfa_step(accept, first(lead_bytes))
        # The lead byte must give the same state from the UTF-8 and the ASCII start state
        @test state == dfa_step(ascii, first(lead_bytes))
        # Prove that all the first bytes in a row give same state
        for b in lead_bytes
            @test state == dfa_step(accept, b)
            @test state == dfa_step(ascii, b)
        end

        for (position, valid_bytes) in enumerate(table_row[2:end])
            is_final = position == length(table_row) - 1
            # Intermediate positions: all valid bytes must agree on one state.
            # Final position: all valid bytes must complete the character.
            expected = is_final ? accept : dfa_step(state, first(valid_bytes))
            for b in valid_bytes
                @test expected == dfa_step(state, b)
            end
            # Every byte outside the allowed range must invalidate the sequence
            for b in setdiff(0x00:0xFF, valid_bytes)
                @test invalid == dfa_step(state, b)
            end
            state = expected
        end
    end
end