Skip to content

Commit cd6963d

Browse files
committed
Update to use CharSetEncodings
1 parent d8f7ffb commit cd6963d

File tree

11 files changed

+95
-236
lines changed

11 files changed

+95
-236
lines changed

src/Strs.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,17 @@ So: ASCIIStr would be: Valid, All ASCII, ... i.e. 0 + short/hash bits
7575
using CharSetEncodings
7676

7777
# From CharSetEncodings
78-
export CSE, CharSet, Encoding, cse, charset, encoding, basetype, basecse, MaybeSub
78+
export CSE, CharSet, Encoding, cse, charset, encoding, basetype, basecse, MaybeSub, is_multi
7979
export @cs_str, @enc_str, @cse
8080
export BIG_ENDIAN, LITTLE_ENDIAN
81+
82+
import CharSetEncodings: EncodingStyle, CharSetStyle, CompareStyle
83+
84+
# Convenience aliases
85+
const SingleCU = SingleCodeUnitEncoding
86+
const MultiCU = MultiCodeUnitEncoding
87+
const SCU = SingleCU()
88+
const MCU = MultiCU()
8189

8290
# Convenience functions
8391
export to_ascii, utf8, utf16, utf32

src/compat.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ unsafe_crc32c(a, n, crc) = ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_
152152
function sizeof(str::SubString{T}) where {T<:Str}
153153
is_multi(str) || return str.endof
154154
str.endof == 0 && return 0
155-
_nextind(CodeUnitMulti(), str.string, str.offset + str.endof) - str.offset - 1
155+
_nextind(MultiCU(), str.string, str.offset + str.endof) - str.offset - 1
156156
end
157157

158158
occurs_in(str::String, hay::String) = contains(hay, str)
@@ -178,7 +178,7 @@ for sym in (:bin, :oct, :dec, :hex)
178178
end
179179

180180
function repeat(ch::Char, cnt::Integer)
181-
cnt > 1 && return String(_repeat(CodeUnitMulti(), UTF8CSE, ch%UInt32, cnt))
181+
cnt > 1 && return String(_repeat(MultiCU(), UTF8CSE, ch%UInt32, cnt))
182182
cnt < 0 && repeaterr(cnt)
183183
cnt == 0 ? empty_string : string(Char(ch%UInt32))
184184
end

src/core.jl

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,44 +8,44 @@ Licensed under MIT License, see LICENSE.md
88
Inspired by / derived from code in Julia
99
=#
1010

11-
_lastindex(::CodeUnitSingle, str) = (@_inline_meta(); ncodeunits(str))
11+
_lastindex(::SingleCU, str) = (@_inline_meta(); ncodeunits(str))
1212

13-
@propagate_inbounds function _getindex(::CodeUnitSingle, T, str, pos::Int)
13+
@propagate_inbounds function _getindex(::SingleCU, T, str, pos::Int)
1414
@_inline_meta()
1515
@boundscheck checkbounds(str, pos)
1616
T(get_codeunit(pointer(str), pos))
1717
end
1818

19-
@propagate_inbounds function _next(::CodeUnitSingle, T, str, pos)
19+
@propagate_inbounds function _next(::SingleCU, T, str, pos)
2020
@_inline_meta()
2121
@boundscheck 0 < pos <= ncodeunits(str) || boundserr(str, pos)
2222
T(get_codeunit(str, pos)), pos + 1
2323
end
2424

25-
_nextcpfun(::CodeUnitSingle, ::Type{S}, pnt::Ptr{T}) where {S,T<:CodeUnitTypes} =
25+
_nextcpfun(::SingleCU, ::Type{S}, pnt::Ptr{T}) where {S,T<:CodeUnitTypes} =
2626
get_codeunit(pnt), pnt + sizeof(T)
2727
_nextcp(::Type{T}, pnt) where {T} = _nextcpfun(CodePointStyle(T), T, pnt)
2828

29-
@propagate_inbounds _getindex(::CodeUnitMulti, T, str, pos::Int) =
30-
first(_next(CodeUnitMulti(), T, str, pos))
29+
@propagate_inbounds _getindex(::MultiCU, T, str, pos::Int) =
30+
first(_next(MultiCU(), T, str, pos))
3131

32-
@inline _length(::CodeUnitSingle, str) = ncodeunits(str)
32+
@inline _length(::SingleCU, str) = ncodeunits(str)
3333

3434
# Use more generic length check
3535
@inline _length_check(str::SubString{<:Str{C}}, cnt) where {C<:CSE} =
36-
_length(CodeUnitMulti(), C, pointer(str), cnt)
36+
_length(MultiCU(), C, pointer(str), cnt)
3737

3838
# Go directly to aligned length check
3939
@inline _length_check(str::Str{C}, cnt) where {C<:CSE} =
40-
_length_al(CodeUnitMulti(), C, pointer(str), cnt)
40+
_length_al(MultiCU(), C, pointer(str), cnt)
4141

42-
@inline _length(::CodeUnitMulti, str::MaybeSub{T}) where {T<:Str} =
42+
@inline _length(::MultiCU, str::MaybeSub{T}) where {T<:Str} =
4343
(cnt = ncodeunits(str); cnt < 2 ? Int(cnt > 0) : @preserve str _length_check(str, cnt))
4444

45-
@inline _length(::CodeUnitSingle, ::Type{<:CSE}, ::Ptr{<:CodeUnitTypes}, cnt::Int) = cnt
45+
@inline _length(::SingleCU, ::Type{<:CSE}, ::Ptr{<:CodeUnitTypes}, cnt::Int) = cnt
4646

47-
@inline _length(::CodeUnitMulti, str::Str{RawUTF8CSE}) = length(str.data)
48-
@inline _length(::CodeUnitMulti, str::Str{RawUTF8CSE}, i::Int, j::Int) = length(str.data, i, j)
47+
@inline _length(::MultiCU, str::Str{RawUTF8CSE}) = length(str.data)
48+
@inline _length(::MultiCU, str::Str{RawUTF8CSE}, i::Int, j::Int) = length(str.data, i, j)
4949

5050
@propagate_inbounds function _length(cs::CodePointStyle, str, i::Int, j::Int)
5151
@boundscheck begin
@@ -58,7 +58,7 @@ _nextcp(::Type{T}, pnt) where {T} = _nextcpfun(CodePointStyle(T), T, pnt)
5858
@preserve str _length(cs, cse(str), bytoff(pointer(str), i - 1), cnt)
5959
end
6060

61-
@inline _thisind(::CodeUnitSingle, str, len, pnt, pos) = Int(pos)
61+
@inline _thisind(::SingleCU, str, len, pnt, pos) = Int(pos)
6262

6363
@propagate_inbounds function _thisind(cs::CS, str, pos) where {CS<:CodePointStyle}
6464
@_inline_meta()
@@ -71,26 +71,26 @@ end
7171
@preserve str _thisind(cs, str, len, pointer(str), pos)
7272
end
7373

74-
@propagate_inbounds function _prevind(::CodeUnitSingle, str, i)
74+
@propagate_inbounds function _prevind(::SingleCU, str, i)
7575
@_inline_meta()
7676
@boundscheck 0 < i <= ncodeunits(str)+1 || boundserr(str, i)
7777
Int(i) - 1
7878
end
7979

80-
@propagate_inbounds function _prevind(::CodeUnitSingle, str, i, nchar)
80+
@propagate_inbounds function _prevind(::SingleCU, str, i, nchar)
8181
@_inline_meta()
8282
nchar < 0 && ncharerr(nchar)
8383
@boundscheck 0 < i <= ncodeunits(str)+1 || boundserr(str, i)
8484
max(Int(i) - nchar, 0)
8585
end
8686

87-
@propagate_inbounds function _nextind(::CodeUnitSingle, str, i)
87+
@propagate_inbounds function _nextind(::SingleCU, str, i)
8888
@_inline_meta()
8989
@boundscheck 0 <= i <= ncodeunits(str) || boundserr(str, i)
9090
Int(i) + 1
9191
end
9292

93-
@propagate_inbounds function _nextind(::CodeUnitSingle, str, i, nchar)
93+
@propagate_inbounds function _nextind(::SingleCU, str, i, nchar)
9494
@_inline_meta()
9595
nchar < 0 && ncharerr(nchar)
9696
@boundscheck 0 <= i <= ncodeunits(str) || boundserr(str, i)
@@ -149,20 +149,20 @@ end
149149
_isvalid_char_pos(CodePointStyle(T), cse(T), str, i)
150150
end
151151

152-
_isvalid_char_pos(::CodeUnitSingle, C, str, i) = true
152+
_isvalid_char_pos(::SingleCU, C, str, i) = true
153153

154-
@propagate_inbounds function _collectstr(::CodeUnitMulti, ::Type{S},
154+
@propagate_inbounds function _collectstr(::MultiCU, ::Type{S},
155155
str::MaybeSub{T}) where {S,T<:Str}
156-
len = _length(CodeUnitMulti(), str)
156+
len = _length(MultiCU(), str)
157157
vec = create_vector(S, len)
158158
pos = 1
159159
@inbounds for i = 1:len
160-
vec[i], pos = _next(CodeUnitMulti(), S, str, pos)
160+
vec[i], pos = _next(MultiCU(), S, str, pos)
161161
end
162162
vec
163163
end
164164

165-
@propagate_inbounds function _collectstr(::CodeUnitSingle, ::Type{S},
165+
@propagate_inbounds function _collectstr(::SingleCU, ::Type{S},
166166
str::MaybeSub{T}) where {S,T<:Str}
167167
len = ncodeunits(str)
168168
vec = create_vector(S, len)
@@ -219,7 +219,7 @@ function _convert(::Type{C}, ch::T) where {C<:CSE,T<:CodeUnitTypes}
219219
Str(C, buf)
220220
end
221221

222-
# Todo: These should be made more generic, work for all CodeUnitSingle types
222+
# TODO: These should be made more generic, so that they work for all SingleCU types
223223

224224
convert(::Type{<:Str{ASCIICSE}}, ch::Unsigned) =
225225
is_ascii(ch) ? _convert(ASCIICSE, ch%UInt8) : unierror(UTF_ERR_INVALID_ASCII, 0, ch)
@@ -258,7 +258,7 @@ unsafe_convert(::Type{Ptr{Text4Chr}}, str::MaybeSub{<:Str{<:Quad_CSEs}}) =
258258
unsafe_convert(::Type{Ptr{Cvoid}}, s::MaybeSub{<:Str{C}}) where {C} =
259259
reinterpret(Ptr{Cvoid}, unsafe_convert(Ptr{codeunit(C)}, s))
260260

261-
function _reverse(::CodeUnitSingle, ::Type{C}, len, str::Str{C}) where {C<:CSE}
261+
function _reverse(::SingleCU, ::Type{C}, len, str::Str{C}) where {C<:CSE}
262262
len < 2 && return str
263263
@preserve str begin
264264
pnt = pointer(str)
@@ -273,9 +273,9 @@ function _reverse(::CodeUnitSingle, ::Type{C}, len, str::Str{C}) where {C<:CSE}
273273
end
274274
end
275275

276-
function _reverse(::CodeUnitMulti, ::Type{C}, len, str) where {C<:CSE}
276+
function _reverse(::MultiCU, ::Type{C}, len, str) where {C<:CSE}
277277
@inbounds ((t = nextind(str, 0)) > len || nextind(str, t) > len) && return str
278-
@preserve str _reverse(CodeUnitMulti(), C, len, pointer(str))
278+
@preserve str _reverse(MultiCU(), C, len, pointer(str))
279279
end
280280

281281
reverse(str::MaybeSub{T}) where {C<:CSE,T<:Str{C}} =

src/search.jl

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -166,19 +166,18 @@ function find(::Type{D}, ch::AbsChar, str::AbstractString, pos::Integer) where {
166166
@boundscheck pos > len+1 && boundserr(str, pos)
167167
return 0
168168
end
169-
# Only check if CodeUnitMulti
170-
(cus = CodePointStyle(str)) === CodeUnitMulti() &&
171-
(@inbounds is_valid(str, pos) || index_error(str, pos))
169+
# Only check that pos is a valid index if the string is MultiCU
170+
is_multi(str) && (@inbounds is_valid(str, pos) || index_error(str, pos))
172171
# Check here if ch is valid for the type of string
173-
is_valid(eltype(str), ch) ? _srch_cp(D(), cus, str, ch, pos, len) : 0
172+
is_valid(eltype(str), ch) ? _srch_cp(D(), EncodingStyle(str), str, ch, pos, len) : 0
174173
end
175174

176175
_get_dir(::Type{First}) = Fwd()
177176
_get_dir(::Type{Last}) = Rev()
178177

179178
find(::Type{D}, ch::AbsChar, str::AbstractString) where {D<:Union{First,Last}} =
180179
((len = ncodeunits(str)) == 0 || !is_valid(eltype(str), ch) ? 0
181-
: _srch_cp(_get_dir(D), CodePointStyle(str), str, ch,
180+
: _srch_cp(_get_dir(D), EncodingStyle(str), str, ch,
182181
D === First ? 1 : lastindex(str), len))
183182

184183
function find(::Type{D}, needle::AbstractString, str::AbstractString,
@@ -198,7 +197,7 @@ function find(::Type{D}, needle::AbstractString, str::AbstractString,
198197
is_valid(eltype(str), ch) || return _not_found
199198
# Check if single character
200199
if nxt > tlen
201-
pos = _srch_cp(D(), CodePointStyle(str), str, ch, pos, slen)
200+
pos = _srch_cp(D(), EncodingStyle(str), str, ch, pos, slen)
202201
return pos == 0 ? _not_found : (pos:pos)
203202
end
204203
_srch_strings(D(), cmp, str, needle, ch, nxt, pos, slen, tlen)
@@ -214,7 +213,7 @@ function find(::Type{T}, needle::AbstractString, str::AbstractString) where {T<:
214213
is_valid(eltype(str), ch) || return _not_found
215214
# Check if single character
216215
if nxt > tlen
217-
pos = _srch_cp(_get_dir(T), CodePointStyle(str), str, ch, pos, slen)
216+
pos = _srch_cp(_get_dir(T), EncodingStyle(str), str, ch, pos, slen)
218217
return pos:(pos - (pos == 0))
219218
end
220219
_srch_strings(_get_dir(T), cmp, str, needle, ch, nxt, pos, slen, tlen)
@@ -274,10 +273,10 @@ end
274273

275274
# _srch_cp is only called with values that are valid for that string type,
276275
# and checking has already been done on the position (pos)
277-
# These definitions only work for CodeUnitSingle types
278-
_srch_cp(::Fwd, ::CodeUnitSingle, str::T, cp::AbsChar, pos, len) where {T<:Str} =
276+
# These definitions only work for SingleCU types
277+
_srch_cp(::Fwd, ::SingleCU, str::T, cp::AbsChar, pos, len) where {T<:Str} =
279278
@preserve str _srch_codeunit(Fwd(), pointer(str), cp%codeunit(T), pos, len)
280-
_srch_cp(::Rev, ::CodeUnitSingle, str::T, cp::AbsChar, pos, len) where {T<:Str} =
279+
_srch_cp(::Rev, ::SingleCU, str::T, cp::AbsChar, pos, len) where {T<:Str} =
281280
@preserve str _srch_codeunit(Rev(), pointer(str), cp%codeunit(T), pos)
282281

283282
function _srch_cp(::Fwd, cus, str, cp, pos, len)
@@ -313,7 +312,7 @@ end
313312
end
314313

315314
function _srch_strings(::Fwd, ::CompareStyle, str, needle, ch, subpos, pos, slen, tlen)
316-
cu = CodePointStyle(str)
315+
cu = EncodingStyle(str)
317316
while (pos = _srch_cp(Fwd(), cu, str, ch, pos, slen)) != 0
318317
nxt = nextind(str, pos)
319318
res = _cmp_str(str, nxt, slen, needle, subpos, tlen)
@@ -325,7 +324,7 @@ function _srch_strings(::Fwd, ::CompareStyle, str, needle, ch, subpos, pos, slen
325324
end
326325

327326
function _srch_strings(::Rev, ::CompareStyle, str, needle, ch, nxtsub, pos, slen, tlen)
328-
cu = CodePointStyle(str)
327+
cu = EncodingStyle(str)
329328
while (prv = prevind(str, pos)) != 0 &&
330329
(loc = _srch_cp(Rev(), cu, str, ch, prv, slen)) != 0
331330
res = _cmp_str(str, nextind(str, loc), slen, needle, nxtsub, tlen)
@@ -359,7 +358,7 @@ function _srch_str_bloom(::Fwd, str, spnt, npnt, ch, pos, slen, tlen)
359358
end
360359

361360
# match found
362-
j == tlen - 1 && return pos:_thisind(CodePointStyle(str), str, slen, spnt, pos + j)
361+
j == tlen - 1 && return pos:_thisind(EncodingStyle(str), str, slen, spnt, pos + j)
363362

364363
# no match, try to rule out the next character
365364
if pos <= slen && _check_bloom_mask(bloom_mask, spnt, pos + tlen)

src/support.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -933,10 +933,10 @@ end
933933
buf
934934
end
935935

936-
_repeat(::CodeUnitSingle, ::Type{C}, ch::T, cnt) where {T,C<:CSE} =
936+
_repeat(::SingleCU, ::Type{C}, ch::T, cnt) where {T,C<:CSE} =
937937
_repeat_chr(basetype(T), ch, cnt)
938938

939-
function _repeat(::CodeUnitMulti, ::Type{UTF8CSE}, ch, cnt)
939+
function _repeat(::MultiCU, ::Type{UTF8CSE}, ch, cnt)
940940
if ch <= 0x7f
941941
_repeat_chr(UInt8, ch, cnt)
942942
elseif ch <= 0x7ff
@@ -948,7 +948,7 @@ function _repeat(::CodeUnitMulti, ::Type{UTF8CSE}, ch, cnt)
948948
end
949949
end
950950

951-
_repeat(::CodeUnitMulti, ::Type{UTF16CSE}, ch, cnt) =
951+
_repeat(::MultiCU, ::Type{UTF16CSE}, ch, cnt) =
952952
ch <= 0xffff ? _repeat_chr(UInt16, ch, cnt) : _repeat_chr(UInt32, get_utf16_32(ch), cnt)
953953

954954
function repeat(str::T, cnt::Integer) where {C<:CSE,T<:Str{C}}

0 commit comments

Comments
 (0)