Skip to content

Commit

Permalink
Updated to use unsafe_checkstring, fix comments
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jul 1, 2015
1 parent 3c105b7 commit 000e7ae
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 503 deletions.
60 changes: 11 additions & 49 deletions base/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,16 @@ convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
convert(::Type{UTF32String}, s::UTF32String) = s

"
Converts an `AbstractString` to a `UTF16String`
### Input Arguments:
* `::Type{UTF32String}`
* `str::AbstractString`
Converts an `AbstractString` to a `UTF32String`
### Returns:
* `::UTF32String`
* `UTF32String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF32String}, str::AbstractString)
len, flags = check_string(str)
len, flags = unsafe_checkstring(str)
buf = Vector{Char}(len+1)
out = 0
@inbounds for ch in str ; buf[out += 1] = ch ; end
Expand All @@ -40,12 +36,8 @@ end
"
Converts a UTF-32 encoded vector of `UInt32` to a `UTF8String`
### Input Arguments:
* `::Type{UTF8String}`
* `dat::Vector{UInt32}`
### Returns:
* `::UTF8String`
* `UTF8String`
### Throws:
* `UnicodeError`
Expand All @@ -55,20 +47,16 @@ function convert(::Type{UTF8String}, dat::Vector{UInt32})
# handle zero length string quickly
len == 0 && return empty_utf8
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>2)
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>2)
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end

"
Converts a `UTF32String` to a `UTF8String`
### Input Arguments:
* `::Type{UTF8String}`
* `str::UTF32String`
### Returns:
* `::UTF8String`
* `UTF8String`
### Throws:
* `UnicodeError`
Expand All @@ -79,18 +67,14 @@ function convert(::Type{UTF8String}, str::UTF32String)
# handle zero length string quickly
len <= 1 && return empty_utf8
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end

"
Converts a `UTF8String` to a `UTF32String`
### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF8String`
### Returns:
* `::UTF32String`
Expand All @@ -102,7 +86,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
# handle zero length string quickly
sizeof(dat) == 0 && return empty_utf32
# Validate UTF-8 encoding, and get number of words to create
len, flags = check_string(dat)
len, flags = unsafe_checkstring(dat)
# Optimize case where no characters > 0x7f
flags == 0 && @inbounds return fast_utf_copy(UTF32String, Char, len, dat, true)
# has multi-byte UTF-8 sequences
Expand Down Expand Up @@ -145,10 +129,6 @@ end
"
Converts a `UTF16String` to `UTF32String`
### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF16String`
### Returns:
* `::UTF32String`
Expand All @@ -161,7 +141,7 @@ function convert(::Type{UTF32String}, str::UTF16String)
# handle zero length string quickly (account for trailing \0)
len <= 2 && return empty_utf32
# get number of words to create
len, flags, num4byte = check_string(dat, len>>>1)
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>1)
# No surrogate pairs, do optimized copy
(flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
local ch::UInt32
Expand All @@ -180,10 +160,6 @@ end
"
Converts a UTF-32 encoded vector of `UInt32` to a `UTF16String`
### Input Arguments:
* `::Type{UTF16String}`
* `dat::Vector{UInt32}`
### Returns:
* `::UTF16String`
Expand All @@ -195,7 +171,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32})
# handle zero length string quickly
len <= 4 && return empty_utf16
# get number of words to allocate
len, flags, num4byte = check_string(dat, len>>>2)
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2)
len += num4byte + 1
# optimized path, no surrogates
num4byte == 0 && @inbounds return fast_utf_copy(UTF16String, UInt16, len, dat)
Expand All @@ -205,10 +181,6 @@ end
"
Converts a `UTF32String` to `UTF16String`
### Input Arguments:
* `::Type{UTF16String}`
* `str::UTF32String`
### Returns:
* `::UTF16String`
Expand All @@ -221,7 +193,7 @@ function convert(::Type{UTF16String}, str::UTF32String)
# handle zero length string quickly
len <= 4 && return empty_utf16
# get number of words to allocate
len, flags, num4byte = check_string(dat, len>>>2)
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2)
# optimized path, no surrogates
num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
return encode_to_utf16(dat, len + num4byte)
Expand Down Expand Up @@ -268,16 +240,6 @@ function convert(::Type{UTF32String}, dat::AbstractVector{Char})
@inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true)
end

# Converts an `AbstractVector{Char}` to a `UTF32String`.
#
# The characters of `data` are copied into a freshly allocated `Vector{Char}` that is
# one element longer than `data`; the extra final element is set to 0 before the
# buffer is wrapped in a `UTF32String`.
#
# This method used to be defined twice in a row with identical bodies; a single
# definition is kept, since the second one merely overwrote the first.
function convert(::Type{UTF32String}, data::AbstractVector{Char})
    len = length(data)
    # copy!(dest, 1, data, 1, len) fills slots 1:len, setindex!(…, 0, len+1) fills the last slot
    @inbounds return UTF32String(setindex!(copy!(Vector{Char}(len+1),1,data,1,len),0,len+1))
end

# Converts a vector of `Int32` or `UInt32` values to a `UTF32String` by reinterpreting
# the elements as `Char` and delegating to the `Char`-vector conversion.
function convert{T<:Union{Int32,UInt32}}(::Type{UTF32String}, data::AbstractVector{T})
    chars = reinterpret(Char, data)
    return convert(UTF32String, chars)
end

Expand Down
10 changes: 0 additions & 10 deletions base/utfcheck.jl
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,7 @@ end
"
Validates and calculates number of characters in a UTF-8, UTF-16 or UTF-32 encoded vector/string
This function checks the bounds of the start and end positions
Use `unsafe_checkstring` to avoid that overhead if the bounds have already been checked
### Input Arguments:
Expand All @@ -225,15 +221,9 @@ function checkstring end
# Checks the whole of `dat`: the range runs from `start(dat)` to `endof(dat)`, and any
# keyword arguments are forwarded unchanged to `unsafe_checkstring`.
function checkstring(dat; kwargs...)
    firstpos = start(dat)
    lastpos = endof(dat)
    return unsafe_checkstring(dat, firstpos, lastpos; kwargs...)
end

# Make sure that beginning and end positions are bounds checked
#
# Arguments:
#   dat      - the vector/string to check
#   startpos - first position to check
#   endpos   - last position to check (defaults to `endof(dat)`)
#   kwargs   - forwarded unchanged to `unsafe_checkstring`
#
# Throws whatever `checkbounds` throws when `startpos` or `endpos` is not a valid
# index of `dat`, and an `ArgumentError` when `endpos` is less than `startpos`.
# Returns the result of `unsafe_checkstring` for the validated range.
function checkstring(dat, startpos, endpos = endof(dat); kwargs...)
    checkbounds(dat,startpos)
    checkbounds(dat,endpos)
    endpos < startpos && throw(ArgumentError("End position ($endpos) is less than start position ($startpos)"))
    return unsafe_checkstring(dat, startpos, endpos; kwargs...)
end
Loading

0 comments on commit 000e7ae

Please sign in to comment.