Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: ByteVec string implementation #11235

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 68 additions & 67 deletions base/ascii.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## from base/boot.jl:
#
# immutable ASCIIString <: DirectIndexString
# data::Array{UInt8,1}
# data::ByteVec
# end
#

Expand All @@ -22,82 +22,83 @@ getindex(s::ASCIIString, indx::AbstractVector{Int}) = ASCIIString(s.data[indx])
# Look for `c` in the bytes of `s`, starting from index `i`.
# A character at or above U+0080 yields 0 without searching.
function search(s::ASCIIString, c::Char, i::Integer)
    if c < Char(0x80)
        return search(s.data, c % UInt8, i)
    end
    return 0
end
# Same rule as `search`, delegating to `rsearch` on the underlying bytes.
function rsearch(s::ASCIIString, c::Char, i::Integer)
    if c < Char(0x80)
        return rsearch(s.data, c % UInt8, i)
    end
    return 0
end

# Concatenate ASCIIStrings by copying their bytes into one new buffer.
# A single argument is returned as-is, without copying.
function string(c::ASCIIString...)
    length(c) == 1 && return c[1]
    # First pass: total byte count, so the buffer is allocated once.
    total = 0
    for part in c
        total += length(part.data)
    end
    buf = Array(UInt8, total)
    # Second pass: append each part's bytes at the running write position.
    pos = 1
    for part in c
        nbytes = length(part.data)
        unsafe_copy!(buf, pos, part.data, 1, nbytes)
        pos += nbytes
    end
    return ASCIIString(buf)
end
# function string(c::ASCIIString...)
# if length(c) == 1
# return c[1]
# end
# n = 0
# for s in c
# n += length(s.data)
# end
# v = Array(UInt8,n)
# o = 1
# for s in c
# ls = length(s.data)
# unsafe_copy!(v, o, s.data, 1, ls)
# o += ls
# end
# ASCIIString(v)
# end

# Return `s` with its first character upper-cased.
# The input itself is returned when it is empty or does not start with 'a'..'z'.
function ucfirst(s::ASCIIString)
    if length(s) == 0 || !('a' <= s[1] <= 'z')
        return s
    end
    bytes = copy(s.data)
    bytes[1] -= 32      # 'a' - 'A' == 32 in ASCII
    return ASCIIString(bytes)
end
# Return `s` with its first character lower-cased.
# The input itself is returned when it is empty or does not start with 'A'..'Z'.
function lcfirst(s::ASCIIString)
    if length(s) == 0 || !('A' <= s[1] <= 'Z')
        return s
    end
    bytes = copy(s.data)
    bytes[1] += 32      # 'a' - 'A' == 32 in ASCII
    return ASCIIString(bytes)
end
# function ucfirst(s::ASCIIString)
# if length(s) > 0 && 'a' <= s[1] <= 'z'
# t = ASCIIString(copy(s.data))
# t.data[1] -= 32
# return t
# end
# return s
# end
# function lcfirst(s::ASCIIString)
# if length(s) > 0 && 'A' <= s[1] <= 'Z'
# t = ASCIIString(copy(s.data))
# t.data[1] += 32
# return t
# end
# return s
# end

# Upper-case every 'a'..'z' byte of `s`.
# If no such byte exists the original string is returned without copying.
function uppercase(s::ASCIIString)
    src = s.data
    # Locate the first byte that needs changing (0 means none found).
    first_hit = 0
    for k = 1:length(src)
        if 'a' <= Char(src[k]) <= 'z'
            first_hit = k
            break
        end
    end
    first_hit == 0 && return s
    # Copy once, then rewrite from the first hit onwards.
    out = copy(src)
    for k = first_hit:length(out)
        if 'a' <= Char(out[k]) <= 'z'
            out[k] -= 32
        end
    end
    return ASCIIString(out)
end
# Lower-case every 'A'..'Z' byte of `s`.
# If no such byte exists the original string is returned without copying.
function lowercase(s::ASCIIString)
    src = s.data
    # Locate the first byte that needs changing (0 means none found).
    first_hit = 0
    for k = 1:length(src)
        if 'A' <= Char(src[k]) <= 'Z'
            first_hit = k
            break
        end
    end
    first_hit == 0 && return s
    # Copy once, then rewrite from the first hit onwards.
    out = copy(src)
    for k = first_hit:length(out)
        if 'A' <= Char(out[k]) <= 'Z'
            out[k] += 32
        end
    end
    return ASCIIString(out)
end
# function uppercase(s::ASCIIString)
# d = s.data
# for i = 1:length(d)
# if 'a' <= Char(d[i]) <= 'z'
# td = copy(d)
# for j = i:length(td)
# if 'a' <= Char(td[j]) <= 'z'
# td[j] -= 32
# end
# end
# return ASCIIString(td)
# end
# end
# return s
# end
# function lowercase(s::ASCIIString)
# d = s.data
# for i = 1:length(d)
# if 'A' <= Char(d[i]) <= 'Z'
# td = copy(d)
# for j = i:length(td)
# if 'A' <= Char(td[j]) <= 'Z'
# td[j] += 32
# end
# end
# return ASCIIString(td)
# end
# end
# return s
# end

# Build a new ASCIIString from the reversed bytes of `s`.
function reverse(s::ASCIIString)
    flipped = reverse(s.data)
    return ASCIIString(flipped)
end
# reverse(s::ASCIIString) = ASCIIString(reverse(s.data))

## outputting ASCII strings ##
# ## outputting ASCII strings ##

# Write the underlying bytes of `s` to `io`; the result of the byte-level write is returned.
function write(io::IO, s::ASCIIString)
    return write(io, s.data)
end
# write(io::IO, s::ASCIIString) = write(io, s.data)

## transcoding to ASCII ##

# `ascii(x)` is shorthand for converting `x` to an ASCIIString.
ascii(x) = convert(ASCIIString, x)
# Already an ASCIIString: returned unchanged.
convert(::Type{ASCIIString}, s::ASCIIString) = s
# Wraps the ByteVec directly; no validity check is performed on this path.
# NOTE(review): the Vector{UInt8} method below validates with is_valid_ascii —
# confirm that skipping validation for ByteVec input is intended.
convert(::Type{ASCIIString}, b::ByteVec) = ASCIIString(b)
# Re-dispatches through `ascii` on the UTF8String's `data` field.
convert(::Type{ASCIIString}, s::UTF8String) = ascii(s.data)
convert(::Type{ASCIIString}, a::Vector{UInt8}) = begin
is_valid_ascii(a) || throw(ArgumentError("invalid ASCII sequence"))
Expand Down
16 changes: 13 additions & 3 deletions base/boot.jl
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ export
Signed, Int, Int8, Int16, Int32, Int64, Int128,
Unsigned, UInt, UInt8, UInt16, UInt32, UInt64, UInt128,
# string types
Char, ASCIIString, ByteString, DirectIndexString, AbstractString, UTF8String,
Char, ByteVec, ASCIIString, ByteString, DirectIndexString, AbstractString, UTF8String,
# errors
BoundsError, DivideError, DomainError, Exception,
InexactError, InterruptException, OutOfMemoryError, OverflowError,
Expand Down Expand Up @@ -200,10 +200,20 @@ bitstype 128 UInt128 <: Unsigned

# Select the platform word alias `UInt` and, alongside it, the layout of ByteVec.
# ByteVec is an immutable DenseArray{UInt8,1} wrapping a single signed integer `x`:
# Int128 when Int is Int64, Int64 otherwise.
if is(Int,Int64)
typealias UInt UInt64
# Int is Int64: 128-bit payload.
immutable ByteVec <: DenseArray{UInt8,1}
x::Int128
end
else
typealias UInt UInt32
# Any other Int: 64-bit payload.
immutable ByteVec <: DenseArray{UInt8,1}
x::Int64
end
end

## kind of want this but doesn't work:
#
# bitstype 128 ByteVec <: DenseArray{UInt8,1}

abstract Exception

immutable BoundsError <: Exception
Expand Down Expand Up @@ -240,11 +250,11 @@ immutable GlobalRef
end

immutable ASCIIString <: DirectIndexString
data::Array{UInt8,1}
data::ByteVec
end

immutable UTF8String <: AbstractString
data::Array{UInt8,1}
data::ByteVec
end

typealias ByteString Union(ASCIIString,UTF8String)
Expand Down
62 changes: 62 additions & 0 deletions base/bytevec.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Materialize the bytes of `b` as an ordinary Vector{UInt8}.
# The comprehension is explicitly typed so the result's element type is always
# UInt8 and never depends on what type inference concludes about `b[i]`.
convert(::Type{Vector{UInt8}}, b::ByteVec) = UInt8[b[i] for i=1:length(b)]
# Convert any string to a ByteVec by way of `bytestring`.
# NOTE(review): `bytestring(s).data` is presumably already a ByteVec after this
# change, which would make the outer `ByteVec(...)` call redundant — TODO confirm.
convert(::Type{ByteVec}, s::AbstractString) = ByteVec(bytestring(s).data)
# Build a ByteVec from a byte vector via the runtime's `jl_bytevec`, passing the
# data pointer and the length.
convert(::Type{ByteVec}, v::Vector{UInt8}) =
    ccall(:jl_bytevec, ByteVec, (Ptr{UInt8}, Csize_t), v, length(v))

# Scratch buffer used by `unsafe_convert` when `b.x` is non-negative.
const bytevec_buf = Vector{UInt8}(1024)

# Produce a raw byte pointer for `b`.
#
# * `b.x < 0`: the low machine word of `x` is reinterpreted as the pointer.
# * otherwise: `b` itself is stored at the start of `bytevec_buf` and a pointer
#   to that buffer is returned. Every such call writes to the same buffer, so a
#   previously returned pointer sees the newer contents.
#
# Leftover debugging output was removed, and the deprecated `Uint8` spelling
# was replaced with `UInt8` to match the method signature.
function unsafe_convert(::Type{Ptr{UInt8}}, b::ByteVec)
    b.x < 0 && return reinterpret(Ptr{UInt8}, b.x % UInt)
    p = pointer(bytevec_buf)
    unsafe_store!(convert(Ptr{ByteVec}, p), b)
    return p
end

# Number of bytes in `b`, computed without branching.
#
# * `here`  — the top byte of `x` (used when `x` is non-negative).
# * `there` — the negated upper part of `x` above the low machine word
#             (used when `x` is negative).
#
# Shift amounts are parenthesized explicitly: the relative precedence of the
# shift operators and `*` is not the same in every Julia version, and
# `x >>> 8*k` must mean `x >>> (8*k)` here.
function length(b::ByteVec)
    here = (b.x >>> (8*(sizeof(b.x)-1))) % Int
    there = (-(b.x >> (8*sizeof(Int)))) % Int
    ifelse(b.x < 0, there, here)
end

# A ByteVec is one-dimensional: its size is the 1-tuple holding its length.
function size(b::ByteVec)
    n = length(b)
    return (n,)
end

# Element access: the index is converted to Int and the lookup is delegated to
# `bytevec_ref`, whose result is boxed as a UInt8.
# NOTE(review): no bounds check is visible here — confirm whether bytevec_ref does one.
getindex(b::ByteVec, i::Real) =
box(UInt8, bytevec_ref(unbox(typeof(b.x), b.x), unbox(Int, Int(i))))
# Same calling pattern through `bytevec_ref32`, boxed as a UInt32.
# presumably reads a 32-bit unit at index i — TODO confirm against the intrinsic.
getu32(b::ByteVec, i::Int) =
box(UInt32, bytevec_ref32(unbox(typeof(b.x), b.x), unbox(Int, i)))

# Equality of two ByteVecs.
#
# `a_hi`/`b_hi` are the parts of `x` above the low machine word. When they
# differ, or when either one is non-negative, comparing the raw `x` values
# decides the result. Otherwise both are negative and equal: the low words are
# treated as pointers and `-a_hi` bytes are compared with `memcmp`.
#
# Changes: deprecated `Uint8`/`Uint` spellings replaced with `UInt8`/`UInt`;
# shift amounts and the negation parenthesized so the expression does not
# depend on version-specific operator precedence.
function ==(a::ByteVec, b::ByteVec)
    a_hi = (a.x >> (8*sizeof(Int))) % Int
    b_hi = (b.x >> (8*sizeof(Int))) % Int
    (a_hi != b_hi) | (a_hi >= 0) | (b_hi >= 0) && return a.x == b.x
    pa = reinterpret(Ptr{UInt8}, a.x % UInt)
    pb = reinterpret(Ptr{UInt8}, b.x % UInt)
    ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), pa, pb, (-a_hi) % UInt) == 0
end

# Three-way comparison of two ByteVecs.
#
# `a_here`/`b_here` are true when the corresponding `x` is non-negative.
# * Both true: the byte-swapped `x` values are compared directly.
# * Exactly one true: the other side's `x` is replaced by a value of the same
#   integer type loaded from the address in its low word, then compared as above.
# * Both false: the low words are used as pointers, the lengths are taken from
#   the negated upper parts, the common prefix is compared with `memcmp`, and
#   ties are broken by length.
#
# NOTE(review): the mixed case loads a full `typeof(x)` from the pointed-to
# data — confirm the allocation is always at least that large.
# NOTE(review): `cmp` on byte-swapped *signed* integers — confirm ordering is as
# intended when the first data byte is >= 0x80.
#
# Changes: deprecated `Uint8` spelling replaced with `UInt8`; shift amounts and
# negations parenthesized so the result does not depend on version-specific
# operator precedence.
function cmp(a::ByteVec, b::ByteVec)
    a_x, b_x = a.x, b.x
    a_here, b_here = a_x >= 0, b_x >= 0
    if !(a_here & b_here)
        if b_here
            a_x = unsafe_load(reinterpret(Ptr{typeof(a_x)}, a_x % UInt))
        elseif a_here
            b_x = unsafe_load(reinterpret(Ptr{typeof(b_x)}, b_x % UInt))
        else
            pa = reinterpret(Ptr{UInt8}, a_x % UInt)
            pb = reinterpret(Ptr{UInt8}, b_x % UInt)
            la = (-(a_x >>> (8*sizeof(Int)))) % UInt
            lb = (-(b_x >>> (8*sizeof(Int)))) % UInt
            c = Int(ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), pa, pb, min(la,lb)))
            return ifelse(c == 0, cmp(la,lb), sign(c))
        end
    end
    cmp(bswap(a_x), bswap(b_x))
end
# `x` sorts before `y` exactly when the three-way comparison is negative.
function isless(x::ByteVec, y::ByteVec)
    return cmp(x, y) < 0
end

# Iteration protocol for ByteVec: the state is the 1-based index of the next byte.
function start(b::ByteVec)
    return 1
end
# Yield the byte at the current index together with the following index.
function next(b::ByteVec, i::Int)
    return (b[i], i+1)
end
# Iteration is finished once the index has moved past the last byte.
function done(b::ByteVec, i::Int)
    return i > length(b)
end
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ export
BitArray,
BitMatrix,
BitVector,
ByteVec,
BufferStream,
CartesianIndex,
CartesianRange,
Expand Down
3 changes: 2 additions & 1 deletion base/expr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ gensym() = ccall(:jl_gensym, Any, ())::Symbol

# String arguments forward to the method for their underlying `data`.
function gensym(s::ASCIIString)
    return gensym(s.data)
end
function gensym(s::UTF8String)
    return gensym(s.data)
end
# A ByteVec is first copied out to a Vector{UInt8}.
function gensym(b::ByteVec)
    bytes = convert(Vector{UInt8}, b)
    return gensym(bytes)
end
# Byte-array form: hands the pointer and length to the runtime's tagged gensym.
function gensym(a::Array{UInt8,1})
    return ccall(:jl_tagged_gensym, Any, (Ptr{UInt8}, Int32), a, length(a))::Symbol
end
# Varargs form: one generated symbol per string argument.
function gensym(ss::Union(ASCIIString, UTF8String)...)
    return map(gensym, ss)
end
Expand All @@ -22,7 +23,7 @@ gensym(s::Symbol) =
macro gensym(names...)
blk = Expr(:block)
for name in names
push!(blk.args, :($(esc(name)) = gensym($(string(name)))))
push!(blk.args, :($(esc(name)) = gensym($(Expr(:quote, name)))))
end
push!(blk.args, :nothing)
return blk
Expand Down
28 changes: 26 additions & 2 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,21 @@ type Regex


function Regex(pattern::AbstractString, options::Integer)
println("B")
println(typeof(pattern))
println(pattern.data.x)
if pattern.data.x < 0
p = reinterpret(Ptr{Int128}, pattern.data.x % UInt)
println(unsafe_load(p))
end
pattern = bytestring(pattern)
println("C")
println(typeof(pattern))
println(pattern.data.x)
if pattern.data.x < 0
p = reinterpret(Ptr{Int128}, pattern.data.x % UInt)
println(unsafe_load(p))
end
options = Int32(options)
if (options & ~PCRE.OPTIONS_MASK) != 0
throw(ArgumentError("invalid regex options: $options"))
Expand All @@ -29,6 +43,7 @@ type Regex
re
end
end
Regex(pattern::AbstractString) = Regex(pattern, DEFAULT_OPTS)

function Regex(pattern::AbstractString, flags::AbstractString)
options = DEFAULT_OPTS
Expand All @@ -41,7 +56,6 @@ function Regex(pattern::AbstractString, flags::AbstractString)
end
Regex(pattern, options)
end
Regex(pattern::AbstractString) = Regex(pattern, DEFAULT_OPTS)

function compile(regex::Regex)
if regex.regex == C_NULL
Expand All @@ -54,7 +68,17 @@ function compile(regex::Regex)
regex
end

macro r_str(pattern, flags...) Regex(pattern, flags...) end
# Implements the r"..." string literal: the Regex is constructed while the
# macro is being expanded, from the literal pattern and any trailing flags.
#
# Debug tracing was removed: it printed on every regex literal, reached into
# `pattern.data.x` directly, and loaded through a hard-coded `Ptr{Int128}`
# even though ByteVec's field is only Int128 on one platform branch.
macro r_str(pattern, flags...)
    Regex(pattern, flags...)
end

# Copying a Regex returns the same object; no new Regex is created.
function copy(r::Regex)
    return r
end

Expand Down
5 changes: 1 addition & 4 deletions base/simdloop.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,7 @@ function compile(x)
check_body!(x)

var,range = parse_iteration_space(x.args[1])
r = gensym("r") # Range value
j = gensym("i") # Iteration variable for outer loop
n = gensym("n") # Trip count for inner loop
i = gensym("i") # Trip index for inner loop
@gensym r j n i # range value, iteration var, outer trip count, inner trip count
quote
# Evaluate range value once, to enhance type and data flow analysis by optimizers.
let $r = $range
Expand Down
9 changes: 5 additions & 4 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ eval(m,x) = Core.eval(m,x)

include("exports.jl")

if false
if true
# simple print definitions for debugging. enable these if something
# goes wrong during bootstrap before printing code is available.
show(x::ANY) = ccall(:jl_static_show, Void, (Ptr{Void}, Any),
Expand Down Expand Up @@ -70,6 +70,7 @@ include("dict.jl")
include("set.jl")
include("hashing.jl")
include("iterator.jl")
include("bytevec.jl")

# SIMD loops
include("simdloop.jl")
Expand All @@ -91,8 +92,8 @@ include("utf16.jl")
include("utf32.jl")
include("iobuffer.jl")
include("string.jl")
include("utf8proc.jl")
importall .UTF8proc
# include("utf8proc.jl")
# importall .UTF8proc
include("regex.jl")
include("base64.jl")
importall .Base64
Expand All @@ -106,7 +107,7 @@ include("libc.jl")
using .Libc: getpid, gethostname, time, msync
include("libdl.jl")
using .Libdl: DL_LOAD_PATH
include("env.jl")
# include("env.jl")
include("path.jl")
include("intfuncs.jl")

Expand Down
Loading