Skip to content

Commit

Permalink
Drop missing from juliaeltype if ValidityBitmap is all valid.
Browse files Browse the repository at this point in the history
  • Loading branch information
evetion committed Jul 27, 2023
1 parent f8d2203 commit 8cb11a7
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 47 deletions.
1 change: 1 addition & 0 deletions src/arraytypes/arraytypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ end

# The bitmap is one-dimensional; its single extent is the stored length field `ℓ`.
function Base.size(p::ValidityBitmap)
    return (p.ℓ,)
end

# Expose the `nc` field as the bitmap's null count.
function nullcount(x::ValidityBitmap)
    return x.nc
end

# `true` exactly when the stored null count is zero.
function Base.all(x::ValidityBitmap)
    return x.nc == 0
end

function ValidityBitmap(x)
T = eltype(x)
Expand Down
77 changes: 46 additions & 31 deletions src/eltypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,26 @@ arrowtype(b, col::AbstractVector{T}) where {T} = arrowtype(b, maybemissing(T))
# A dictionary-encoded column takes its arrow type from the data of its encoding.
function arrowtype(b, col::DictEncoded)
    return arrowtype(b, col.encoding.data)
end

# A compressed column takes its arrow type from the data it wraps.
function arrowtype(b, col::Compressed)
    return arrowtype(b, col.data)
end

# Element type for a field whose metadata argument is `nothing`: resolve the type
# through the `(f, convert[, allvalid])` method and, when `convert` is true, map it
# through `finaljuliatype`.
# NOTE(review): this span shows both the pre-change (3-argument) and post-change
# (4-argument) signature/call lines of the commit.
function juliaeltype(f::Meta.Field, ::Nothing, convert::Bool)
T = juliaeltype(f, convert)
function juliaeltype(f::Meta.Field, ::Nothing, convert::Bool, allvalid)
T = juliaeltype(f, convert, allvalid)
return convert ? finaljuliatype(T) : T
end

function juliaeltype(f::Meta.Field, meta::AbstractDict{String,String}, convert::Bool)
TT = juliaeltype(f, convert)
function juliaeltype(
f::Meta.Field,
meta::AbstractDict{String,String},
convert::Bool,
allvalid::Bool=false,
)
TT = juliaeltype(f, convert, allvalid)
!convert && return TT
T = finaljuliatype(TT)
if haskey(meta, "ARROW:extension:name")
typename = meta["ARROW:extension:name"]
metadata = get(meta, "ARROW:extension:metadata", "")
JT = ArrowTypes.JuliaType(Val(Symbol(typename)), maybemissing(TT), metadata)
if JT !== nothing
return f.nullable ? Union{JT,Missing} : JT
return f.nullable && !allvalid ? Union{JT,Missing} : JT
else
@warn "unsupported ARROW:extension:name type: \"$typename\", arrow type = $TT" maxlog =
1 _id = hash((:juliaeltype, typename, TT))
Expand All @@ -57,19 +62,19 @@ function juliaeltype(f::Meta.Field, meta::AbstractDict{String,String}, convert::
return something(TT, T)
end

# Resolve the field's element type by dispatching on `f.type`, then widen it to
# `Union{T,Missing}` when the field is nullable. In the 4-argument (post-change)
# form the widening is skipped when `allvalid` is true.
# NOTE(review): this span shows both the pre-change and post-change lines of the commit.
function juliaeltype(f::Meta.Field, convert::Bool)
T = juliaeltype(f, f.type, convert)
return f.nullable ? Union{T,Missing} : T
function juliaeltype(f::Meta.Field, convert::Bool, allvalid)
T = juliaeltype(f, f.type, convert, allvalid)
return f.nullable && !allvalid ? Union{T,Missing} : T
end

# A Null-typed field always maps to `Missing`; `convert` (and `allvalid` in the
# 4-argument form) are ignored.
juliaeltype(f::Meta.Field, ::Meta.Null, convert) = Missing
juliaeltype(f::Meta.Field, ::Meta.Null, convert, allvalid) = Missing

# Serialize the Null type through builder `b`: open the null table, close it, and
# return the type tag, the value returned by `nullEnd`, and `nothing` in the third slot.
function arrowtype(b, ::Type{Missing})
    Meta.nullStart(b)
    finished = Meta.nullEnd(b)
    return (Meta.Null, finished, nothing)
end

function juliaeltype(f::Meta.Field, int::Meta.Int, convert)
function juliaeltype(f::Meta.Field, int::Meta.Int, convert, allvalid)
if int.is_signed
if int.bitWidth == 8
Int8
Expand Down Expand Up @@ -109,7 +114,7 @@ function arrowtype(b, ::Type{T}) where {T<:Integer}
end

# primitive types
function juliaeltype(f::Meta.Field, fp::Meta.FloatingPoint, convert)
function juliaeltype(f::Meta.Field, fp::Meta.FloatingPoint, convert, allvalid)
if fp.precision == Meta.Precision.HALF
Float16
elseif fp.precision == Meta.Precision.SINGLE
Expand All @@ -129,20 +134,21 @@ function arrowtype(b, ::Type{T}) where {T<:AbstractFloat}
return Meta.FloatingPoint, Meta.floatingPointEnd(b), nothing
end

# Utf8 and LargeUtf8 fields map to `String`; the remaining arguments are ignored.
juliaeltype(f::Meta.Field, b::Union{Meta.Utf8,Meta.LargeUtf8}, convert) = String
juliaeltype(f::Meta.Field, b::Union{Meta.Utf8,Meta.LargeUtf8}, convert, allvalid) = String

# Size in bytes of the data held by `x`. A non-vector value reports `sizeof(x)`;
# a vector reports the sum of `datasizeof` over its elements, so nested vectors
# are measured recursively.
datasizeof(x) = sizeof(x)
# `init=0` makes an empty vector report 0 bytes instead of throwing the
# "reducing over an empty collection" error that a bare `sum(f, x)` raises.
datasizeof(x::AbstractVector) = sum(datasizeof, x; init=0)

# Binary and LargeBinary fields map to `Base.CodeUnits`; the remaining arguments are
# ignored.
juliaeltype(f::Meta.Field, b::Union{Meta.Binary,Meta.LargeBinary}, convert) = Base.CodeUnits
juliaeltype(f::Meta.Field, b::Union{Meta.Binary,Meta.LargeBinary}, convert, allvalid) =
Base.CodeUnits

# A FixedSizeBinary field maps to a tuple of `x.byteWidth` bytes (`UInt8`).
# NOTE(review): the first two lines are the pre-change and post-change signature
# lines of the commit; both share the right-hand side on the last line.
juliaeltype(f::Meta.Field, x::Meta.FixedSizeBinary, convert) =
juliaeltype(f::Meta.Field, x::Meta.FixedSizeBinary, convert, allvalid) =
NTuple{Int(x.byteWidth),UInt8}

# arggh! Write every element of a homogeneous tuple to `io`, in order, and return
# the sum of the per-element `write` results.
function Base.write(io::IO, x::NTuple{N,T}) where {N,T}
    return sum(x) do y
        Base.write(io, y)
    end
end

# Bool fields map to `Bool`; the remaining arguments are ignored.
juliaeltype(f::Meta.Field, x::Meta.Bool, convert) = Bool
juliaeltype(f::Meta.Field, x::Meta.Bool, convert, allvalid) = Bool

function arrowtype(b, ::Type{Bool})
Meta.boolStart(b)
Expand All @@ -157,7 +163,7 @@ Base.zero(::Type{Decimal{P,S,T}}) where {P,S,T} = Decimal{P,S,T}(T(0))
# Decimals with identical type parameters compare by their `value` fields.
function ==(a::Decimal{P,S,T}, b::Decimal{P,S,T}) where {P,S,T}
    return a.value == b.value
end

# Same comparison under `isequal` semantics.
function Base.isequal(a::Decimal{P,S,T}, b::Decimal{P,S,T}) where {P,S,T}
    return isequal(a.value, b.value)
end

# A Decimal field maps to `Decimal{precision,scale,S}`; the storage type `S` is
# `Int256` when `x.bitWidth == 256` and `Int128` otherwise.
# NOTE(review): this span shows both the pre-change and post-change signature lines.
function juliaeltype(f::Meta.Field, x::Meta.Decimal, convert)
function juliaeltype(f::Meta.Field, x::Meta.Decimal, convert, allvalid)
return Decimal{x.precision,x.scale,x.bitWidth == 256 ? Int256 : Int128}
end

Expand Down Expand Up @@ -188,7 +194,7 @@ bitwidth(x::Meta.DateUnit.T) = x == Meta.DateUnit.DAY ? Int32 : Int64
# Day-unit dates are built as `DATE` from the value converted to `Int32`.
function Date{Meta.DateUnit.DAY}(days)
    return DATE(Int32(days))
end

# Millisecond-unit dates store the value converted to `Int64`.
function Date{Meta.DateUnit.MILLISECOND}(ms)
    return Date{Meta.DateUnit.MILLISECOND,Int64}(Int64(ms))
end

# A Date field maps to `Date{unit,S}` with `S = bitwidth(x.unit)`; the remaining
# arguments are ignored.
juliaeltype(f::Meta.Field, x::Meta.Date, convert) = Date{x.unit,bitwidth(x.unit)}
juliaeltype(f::Meta.Field, x::Meta.Date, convert, allvalid) = Date{x.unit,bitwidth(x.unit)}
# `DATE` values are presented to users as `Dates.Date`.
function finaljuliatype(::Type{DATE})
    return Dates.Date
end

# Convert by adding `UNIX_EPOCH_DATE` to the stored count `x.x` and interpreting
# the sum (as `Int64`) as a `Dates.UTD` instant.
function Base.convert(::Type{Dates.Date}, x::DATE)
    shifted = Int64(x.x + UNIX_EPOCH_DATE)
    return Dates.Date(Dates.UTD(shifted))
end
Expand Down Expand Up @@ -228,7 +234,7 @@ bitwidth(x::Meta.TimeUnit.T) =
x == Meta.TimeUnit.SECOND || x == Meta.TimeUnit.MILLISECOND ? Int32 : Int64
# Build a `Time` for unit `U`, converting `x` to the storage type chosen by
# `bitwidth(U)`.
function Time{U}(x) where {U<:Meta.TimeUnit.T}
    S = bitwidth(U)
    return Time{U,S}(S(x))
end

# The storage type is the second type parameter of `Time`.
function storagetype(::Type{Time{U,T}}) where {U,T}
    return T
end
# A Time field maps to `Time{unit,S}` with `S = bitwidth(x.unit)`; the remaining
# arguments are ignored.
juliaeltype(f::Meta.Field, x::Meta.Time, convert) = Time{x.unit,bitwidth(x.unit)}
juliaeltype(f::Meta.Field, x::Meta.Time, convert, allvalid) = Time{x.unit,bitwidth(x.unit)}
# Every `Time` parameterization is presented to users as `Dates.Time`.
function finaljuliatype(::Type{<:Time})
    return Dates.Time
end
periodtype(U::Meta.TimeUnit.T) =
U === Meta.TimeUnit.SECOND ? Dates.Second :
Expand Down Expand Up @@ -260,7 +266,7 @@ end

# The zero `Timestamp` wraps an `Int64` zero.
function Base.zero(::Type{Timestamp{U,T}}) where {U,T}
    return Timestamp{U,T}(Int64(0))
end

# A Timestamp field maps to `Timestamp{unit,tz}`, where `tz` is `nothing` when the
# field carries no timezone and otherwise the timezone converted to a `Symbol`.
# NOTE(review): this span shows both the pre-change and post-change signature lines.
function juliaeltype(f::Meta.Field, x::Meta.Timestamp, convert)
function juliaeltype(f::Meta.Field, x::Meta.Timestamp, convert, allvalid)
return Timestamp{x.unit,x.timezone === nothing ? nothing : Symbol(x.timezone)}
end

Expand Down Expand Up @@ -381,7 +387,7 @@ Interval{Meta.IntervalUnit.YEAR_MONTH}(x) =
# Day-time intervals store the value converted to `Int64`.
function Interval{Meta.IntervalUnit.DAY_TIME}(x)
    return Interval{Meta.IntervalUnit.DAY_TIME,Int64}(Int64(x))
end

# An Interval field maps to `Interval{unit,S}` with `S = bitwidth(x.unit)`.
# NOTE(review): this span shows both the pre-change and post-change signature lines.
function juliaeltype(f::Meta.Field, x::Meta.Interval, convert)
function juliaeltype(f::Meta.Field, x::Meta.Interval, convert, allvalid)
return Interval{x.unit,bitwidth(x.unit)}
end

Expand All @@ -397,7 +403,7 @@ end

# The zero `Duration` wraps an `Int64` zero.
function Base.zero(::Type{Duration{U}}) where {U}
    return Duration{U}(Int64(0))
end

# A Duration field maps to `Duration{unit}`.
# NOTE(review): this span shows both the pre-change and post-change signature lines.
function juliaeltype(f::Meta.Field, x::Meta.Duration, convert)
function juliaeltype(f::Meta.Field, x::Meta.Duration, convert, allvalid)
return Duration{x.unit}
end

Expand Down Expand Up @@ -428,8 +434,15 @@ ArrowTypes.JuliaType(::Val{PERIOD_SYMBOL}, ::Type{Duration{U}}) where {U} = peri
# Turn an arrow `Duration` into the requested `Dates.Period` subtype via `convert`.
function ArrowTypes.fromarrow(::Type{P}, x::Duration{U}) where {P<:Dates.Period,U}
    return convert(P, x)
end

# nested types; call juliaeltype recursively on nested children
# List / LargeList fields map to `Vector{E}`, where `E` is the element type of the
# single child field (resolved with the child's own metadata).
#
# `allvalid` describes the validity of `f` only; the `(f, convert, allvalid)` method
# uses it to decide whether `f` itself gets `Missing`. It is deliberately NOT
# forwarded: a list column without null lists can still hold null elements, so the
# child is always resolved with `false` and keeps `Missing` whenever it is nullable.
# The default keeps the 3-argument call form working.
function juliaeltype(
    f::Meta.Field,
    list::Union{Meta.List,Meta.LargeList},
    convert,
    allvalid=false,
)
    child = f.children[1]
    return Vector{juliaeltype(child, buildmetadata(child), convert, false)}
end

# arrowtype will call fieldoffset recursively for children
Expand Down Expand Up @@ -464,8 +477,8 @@ function arrowtype(b, x::List{T,O,A}) where {T,O,A}
end
end

# FixedSizeList fields map to `NTuple{N,E}` with `N = list.listSize` and `E` the
# element type of the single child field.
#
# `allvalid` refers to the validity of `f` only, so it is not forwarded to the
# child: the child is resolved with `false` and keeps `Missing` whenever it is
# nullable. The default keeps the 3-argument call form working.
function juliaeltype(f::Meta.Field, list::Meta.FixedSizeList, convert, allvalid=false)
    child = f.children[1]
    E = juliaeltype(child, buildmetadata(child), convert, false)
    return NTuple{Int(list.listSize),E}
end

Expand All @@ -485,16 +498,18 @@ function arrowtype(b, x::FixedSizeList{T,A}) where {T,A}
end
end

# Map fields map to `Dict{K,V}`; `K` and `V` come from the first and second child
# of the map's single entries child.
#
# `allvalid` refers to the validity of the map field `f` only, so it is not
# forwarded: keys and values are resolved with `false` and keep `Missing` whenever
# their own fields are nullable. The default keeps the 3-argument call form working.
function juliaeltype(f::Meta.Field, map::Meta.Map, convert, allvalid=false)
    entries = f.children[1]
    keyfield = entries.children[1]
    valfield = entries.children[2]
    K = juliaeltype(keyfield, buildmetadata(keyfield), convert, false)
    V = juliaeltype(valfield, buildmetadata(valfield), convert, false)
    return Dict{K,V}
end
Expand All @@ -521,9 +536,9 @@ function arrowtype(b, ::Type{KeyValue{K,V}}) where {K,V}
return Meta.Struct, Meta.structEnd(b), children
end

# Struct fields map to a `NamedTuple` whose names are the child field names and
# whose types are the child element types, in child order.
#
# `allvalid` refers to the validity of the struct field `f` only, so it is not
# forwarded: every child is resolved with `false` and keeps `Missing` whenever it
# is nullable. The default keeps the 3-argument call form working.
function juliaeltype(f::Meta.Field, list::Meta.Struct, convert, allvalid=false)
    names = Tuple(Symbol(child.name) for child in f.children)
    types = Tuple(
        juliaeltype(child, buildmetadata(child), convert, false) for child in f.children
    )
    return NamedTuple{names,Tuple{types...}}
end

Expand All @@ -540,13 +555,13 @@ function UnionT(f::Meta.Field, convert)
UT = UnionT{
f.type.mode,
typeids,
Tuple{(juliaeltype(x, buildmetadata(x), convert) for x in f.children)...},
Tuple{(juliaeltype(x, buildmetadata(x), convert, false) for x in f.children)...},
}
return UT
end

# Union fields map to the Julia `Union` of all child element types.
#
# `allvalid` is not forwarded to the children: each child is resolved with `false`
# (the same choice `UnionT` makes), so a nullable child keeps `Missing`. The
# default keeps the 3-argument call form working.
juliaeltype(f::Meta.Field, u::Meta.Union, convert, allvalid=false) =
    Union{(juliaeltype(x, buildmetadata(x), convert, false) for x in f.children)...}

function arrowtype(
b,
Expand Down
41 changes: 25 additions & 16 deletions src/table.jl
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,12 @@ function Base.iterate(x::Stream, (pos, id)=(1, 0))
push!(x.names, Symbol(field.name))
push!(
x.types,
juliaeltype(field, buildmetadata(field.custom_metadata), x.convert),
juliaeltype(
field,
buildmetadata(field.custom_metadata),
x.convert,
false,
),
)
# recursively find any dictionaries for any fields
getdictionaries!(x.dictencoded, field)
Expand Down Expand Up @@ -242,7 +247,7 @@ function Base.iterate(x::Stream, (pos, id)=(1, 0))
A = ChainedVector([values])
S =
field.dictionary.indexType === nothing ? Int32 :
juliaeltype(field, field.dictionary.indexType, false)
juliaeltype(field, field.dictionary.indexType, false, false)
x.dictencodings[id] = DictEncoding{eltype(A),S,typeof(A)}(
id,
A,
Expand Down Expand Up @@ -486,7 +491,7 @@ function Table(blobs::Vector{ArrowBlob}; convert::Bool=true)
A = ChainedVector([dictencoding.data, values])
S =
field.dictionary.indexType === nothing ? Int32 :
juliaeltype(field, field.dictionary.indexType, false)
juliaeltype(field, field.dictionary.indexType, false, false)
dictencodings[id] = DictEncoding{eltype(A),S,typeof(A)}(
id,
A,
Expand All @@ -511,7 +516,7 @@ function Table(blobs::Vector{ArrowBlob}; convert::Bool=true)
A = values
S =
field.dictionary.indexType === nothing ? Int32 :
juliaeltype(field, field.dictionary.indexType, false)
juliaeltype(field, field.dictionary.indexType, false, false)
dictencodings[id] = DictEncoding{eltype(A),S,typeof(A)}(
id,
A,
Expand Down Expand Up @@ -539,7 +544,7 @@ function Table(blobs::Vector{ArrowBlob}; convert::Bool=true)
# 158; some implementations may send 0 record batches
if !anyrecordbatches && !isnothing(sch)
for field in sch.fields
T = juliaeltype(field, buildmetadata(field), convert)
T = juliaeltype(field, buildmetadata(field), convert, false)
push!(columns(t), T[])
end
end
Expand Down Expand Up @@ -652,7 +657,10 @@ function build(field::Meta.Field, batch, rb, de, nodeidx, bufferidx, convert)
validity = buildbitmap(batch, rb, nodeidx, bufferidx)
bufferidx += 1
buffer = rb.buffers[bufferidx]
S = d.indexType === nothing ? Int32 : juliaeltype(field, d.indexType, false)
S =
d.indexType === nothing ? Int32 :
juliaeltype(field, d.indexType, false, all(validity))

bytes, indices = reinterp(S, batch, buffer, rb.compression)
encoding = de[d.id]
A = DictEncoded(
Expand Down Expand Up @@ -757,7 +765,7 @@ function build(f::Meta.Field, L::ListTypes, batch, rb, de, nodeidx, bufferidx, c
len = rb.nodes[nodeidx].length
nodeidx += 1
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
if L isa Meta.Utf8 ||
L isa Meta.LargeUtf8 ||
L isa Meta.Binary ||
Expand Down Expand Up @@ -804,7 +812,7 @@ function build(
build(f.children[1], batch, rb, de, nodeidx, bufferidx, convert)
end
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
return FixedSizeList{T,typeof(A)}(bytes, validity, A, len, meta), nodeidx, bufferidx
end

Expand All @@ -822,7 +830,7 @@ function build(f::Meta.Field, L::Meta.Map, batch, rb, de, nodeidx, bufferidx, co
nodeidx += 1
A, nodeidx, bufferidx = build(f.children[1], batch, rb, de, nodeidx, bufferidx, convert)
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
return Map{T,OT,typeof(A)}(validity, offsets, A, len, meta), nodeidx, bufferidx
end

Expand All @@ -839,12 +847,13 @@ function build(f::Meta.Field, L::Meta.Struct, batch, rb, de, nodeidx, bufferidx,
end
data = Tuple(vecs)
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
return Struct{T,typeof(data)}(validity, data, len, meta), nodeidx, bufferidx
end

function build(f::Meta.Field, L::Meta.Union, batch, rb, de, nodeidx, bufferidx, convert)
@debugv 2 "building array: L = $L"
validity = buildbitmap(batch, rb, nodeidx, bufferidx)
buffer = rb.buffers[bufferidx]
bytes, typeIds = reinterp(UInt8, batch, buffer, rb.compression)
bufferidx += 1
Expand All @@ -861,7 +870,7 @@ function build(f::Meta.Field, L::Meta.Union, batch, rb, de, nodeidx, bufferidx,
end
data = Tuple(vecs)
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
UT = UnionT(f, convert)
if L.mode == Meta.UnionMode.Dense
B = DenseUnion{T,UT,typeof(data)}(bytes, bytes2, typeIds, offsets, data, meta)
Expand All @@ -874,7 +883,7 @@ end
function build(f::Meta.Field, L::Meta.Null, batch, rb, de, nodeidx, bufferidx, convert)
@debugv 2 "building array: L = $L"
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, false)
return NullVector{maybemissing(T)}(MissingVector(rb.nodes[nodeidx].length), meta),
nodeidx + 1,
bufferidx
Expand All @@ -888,11 +897,11 @@ function build(f::Meta.Field, ::L, batch, rb, de, nodeidx, bufferidx, convert) w
buffer = rb.buffers[bufferidx]
meta = buildmetadata(f.custom_metadata)
# get storage type (non-converted)
T = juliaeltype(f, nothing, false)
T = juliaeltype(f, nothing, false, all(validity))
@debugv 2 "storage type for primitive: T = $T"
bytes, A = reinterp(Base.nonmissingtype(T), batch, buffer, rb.compression)
len = rb.nodes[nodeidx].length
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
@debugv 2 "final julia type for primitive: T = $T"
return Primitive(T, bytes, validity, A, len, meta), nodeidx + 1, bufferidx + 1
end
Expand All @@ -904,7 +913,7 @@ function build(f::Meta.Field, L::Meta.Bool, batch, rb, de, nodeidx, bufferidx, c
buffer = rb.buffers[bufferidx]
meta = buildmetadata(f.custom_metadata)
# get storage type (non-converted)
T = juliaeltype(f, nothing, false)
T = juliaeltype(f, nothing, false, all(validity))
@debugv 2 "storage type for primitive: T = $T"
buffer = rb.buffers[bufferidx]
voff = batch.pos + buffer.offset
Expand All @@ -921,6 +930,6 @@ function build(f::Meta.Field, L::Meta.Bool, batch, rb, de, nodeidx, bufferidx, c
# return ValidityBitmap(decodedbytes, 1, node.length, node.null_count)
end
len = rb.nodes[nodeidx].length
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
return BoolVector{T}(decodedbytes, pos, validity, len, meta), nodeidx + 1, bufferidx + 1
end

0 comments on commit 8cb11a7

Please sign in to comment.