Skip to content

Commit

Permalink
Implement C Data integration
Browse files Browse the repository at this point in the history
This starts work towards supporting the C data interface for the arrow
format, as documented
[here](https://arrow.apache.org/docs/format/CDataInterface.html#).

Currently in this PR, it includes struct definitions and basic
methods to allow getting a pointer to an `ArrowSchema`/`ArrowArray`
C-compatible struct that can then be populated by another
implementation. For example, with this PR, you can do:

```julia
using Arrow, PyCall
pd = pyimport("pandas")
pa = pyimport("pyarrow")
df = pd.DataFrame(py"""{'a': [1, 2, 3, 4, 5], 'b': ['a', 'b', 'c', 'd', 'e']}"""o)
rb = pa.record_batch(df)
sch = Arrow.CData.getschema() do ptr
    rb.schema._export_to_c(Int(ptr))
end
arr = Arrow.CData.getarray() do ptr
    rb._export_to_c(Int(ptr))
end
```

Currently, these `ArrowSchema`/`ArrowArray` structs are pretty bare
bones, but it at least lays some ground work for integration. Things we
still need/want to make all this nicer to use/work with:

  * Type format string parsing/converting: we need to parse the type
  format strings as outlined
  [here](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings)
  to figure out what type of data we'll get in the arrays. It'd
  probably be best to add a `type` field to the ArrowSchema struct that
  we'd populate when converting from `CArrowSchema` -> `ArrowSchema`
  * Add a method like `Arrow.ArrowVector(::ArrowSchema, ::ArrowArray)`
  that produces a concrete `ArrowVector` subtype, like
  `Arrow.Primitive`, `Arrow.List`, etc. This will be a bit tricky,
  because we have to follow all the same columnar layout trickery that we
  currently handle for IPC in the table.jl `build` methods. Perhaps we
  can refactor all that so we can re-use some code? Otherwise, we might
  just need to reimplement a bunch of that logic specific to converting
  `ArrowArray`s.
  * That should give a robust consuming story; for producing, we
  probably need a definition like
  `Arrow.ArrowSchema(a::Arrow.ArrowVector)` that produces a valid
  `ArrowSchema`, and then overloads per `ArrowVector` subtype like
  `Arrow.ArrowArray(x::Arrow.Primitive)` that produce the right
  `ArrowArray` for a concrete arrow array
  * Then the last piece we need is just figuring out the right mechanics
  for providing a pointer to the `CArrowSchema`, `CArrowArray` structs
  once they're populated

If anyone would like to help out, I'm happy to provide as much guidance
as possible so others can get their feet wet in some arrow spec
nitty-gritty.
  • Loading branch information
quinnj committed Apr 16, 2021
1 parent bdd0e54 commit 005c946
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/Arrow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ include("arraytypes/arraytypes.jl")
include("eltypes.jl")
include("table.jl")
include("write.jl")
include("cinterface.jl")

const LZ4_FRAME_COMPRESSOR = LZ4FrameCompressor[]
const ZSTD_COMPRESSOR = ZstdCompressor[]
Expand Down
165 changes: 165 additions & 0 deletions src/cinterface.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
module CData

export ArrowSchema, ArrowArray, getschema, getarray

# Bit flags for the `flags` field of a schema; each constant occupies its own bit so
# they can be combined with `|` and tested with `&`.
const ARROW_FLAG_DICTIONARY_ORDERED = 1 << 0
const ARROW_FLAG_NULLABLE = 1 << 1
const ARROW_FLAG_MAP_KEYS_SORTED = 1 << 2

"""
    CArrowSchema

Bits-type mirror of the `ArrowSchema` struct from the Arrow C data interface. The
field order and types must match the C layout exactly, because a foreign producer
writes directly into memory of this type.

All pointer fields refer to memory owned by the producer. Use property access
(`x.format`, `x.children`, ...) to get decoded Julia values and `getfield` to get the
raw pointers.
"""
struct CArrowSchema
    format::Ptr{UInt8}                 # NUL-terminated type format string
    name::Ptr{UInt8}                   # NUL-terminated field name
    metadata::Ptr{UInt8}               # binary key/value blob, decoded by `readmetadata`
    flags::Int64                       # bitwise OR of the `ARROW_FLAG_*` constants
    n_children::Int64                  # number of entries in `children`
    children::Ptr{Ptr{CArrowSchema}}   # C array of pointers to the child schemas
    dictionary::Ptr{CArrowSchema}      # dictionary value type, or NULL
    release::Ptr{Cvoid}                # producer-provided release callback
    private_data::Ptr{Cvoid}           # opaque producer state
end

"""
    CArrowSchema()

Return a `CArrowSchema` with every pointer field set to `C_NULL` and every integer
field set to zero.
"""
CArrowSchema() = CArrowSchema(C_NULL, C_NULL, C_NULL, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL)

# `release` and `private_data` are deliberately not listed: they are only reachable
# through `getfield`.
Base.propertynames(::CArrowSchema) =
    (:format, :name, :metadata, :flags, :n_children, :children, :dictionary)

"""
    readmetadata(ptr::Ptr{UInt8}) -> Dict{String, String}

Decode the binary metadata blob that `ptr` points to into a dictionary. A NULL `ptr`
yields an empty dictionary.

The blob is read as a native-endian `Int32` entry count followed, for each entry, by
an `Int32` key length, the key bytes, an `Int32` value length and the value bytes.
The strings are copied, so the result does not alias the memory behind `ptr`.

No bounds checking is possible here: `ptr` must reference a complete, well-formed blob.
"""
function readmetadata(ptr::Ptr{UInt8})
    meta = Dict{String, String}()
    ptr == C_NULL && return meta
    n_entries = unsafe_load(convert(Ptr{Int32}, ptr))
    ptr += 4
    for _ in 1:n_entries
        keylen = unsafe_load(convert(Ptr{Int32}, ptr))
        ptr += 4
        key = unsafe_string(ptr, keylen)
        ptr += keylen
        vallen = unsafe_load(convert(Ptr{Int32}, ptr))
        ptr += 4
        val = unsafe_string(ptr, vallen)
        ptr += vallen
        meta[key] = val
    end
    return meta
end

"""
    getproperty(x::CArrowSchema, nm::Symbol)

Decode field `nm` of the raw C struct into a Julia value:

- `:format` -> `String`
- `:name` -> `String` (empty when the producer left the optional name NULL)
- `:metadata` -> `Dict{String, String}` via `readmetadata`
- `:flags`, `:n_children` -> `Int64`
- `:children` -> `Vector{CArrowSchema}` holding a copy of each child struct
- `:dictionary` -> `CArrowSchema` or `nothing`

Any other name (including `:release` and `:private_data`) raises an error; use
`getfield` to read the raw fields.
"""
function Base.getproperty(x::CArrowSchema, nm::Symbol)
    if nm === :format
        return unsafe_string(getfield(x, :format))
    elseif nm === :name
        # The C data interface makes `name` optional, so it may legitimately be NULL.
        p = getfield(x, :name)
        return p == C_NULL ? "" : unsafe_string(p)
    elseif nm === :metadata
        return readmetadata(getfield(x, :metadata))
    elseif nm === :flags
        return getfield(x, :flags)
    elseif nm === :n_children
        return getfield(x, :n_children)
    elseif nm === :children
        c = getfield(x, :children)
        n = getfield(x, :n_children)
        (c == C_NULL || n <= 0) && return CArrowSchema[]
        # `children` is a C array of `n` pointers; each child lives at its own address,
        # so every pointer has to be dereferenced individually rather than assuming the
        # child structs are laid out contiguously after the first one.
        return CArrowSchema[unsafe_load(unsafe_load(c, i)) for i in 1:n]
    elseif nm === :dictionary
        d = getfield(x, :dictionary)
        return d == C_NULL ? nothing : unsafe_load(d)
    end
    error("unknown property requested: $nm")
end

"""
    ArrowSchema

Julia-side view of a `CArrowSchema` with every field already decoded into a native
Julia value. `carrowschema` keeps a reference to the C struct the view was built from.
"""
mutable struct ArrowSchema
    format::String                           # decoded type format string
    name::String                             # decoded field name
    metadata::Dict{String, String}           # decoded key/value metadata
    flags::Int64                             # raw flag bits
    n_children::Int64                        # number of child schemas
    children::Vector{ArrowSchema}            # converted child schemas
    dictionary::Union{Nothing, ArrowSchema}  # converted dictionary schema, if any
    carrowschema::Ref{CArrowSchema}          # backing C struct
end

"""
    ArrowSchema(s::Ref{CArrowSchema})

Decode the C struct referenced by `s` (recursively, including children and the
dictionary) and keep `s` itself in the `carrowschema` field.
"""
function ArrowSchema(s::Ref{CArrowSchema})
    c = s[]
    dict = c.dictionary
    return ArrowSchema(
        c.format,
        c.name,
        c.metadata,
        c.flags,
        c.n_children,
        map(ArrowSchema, c.children),
        dict === nothing ? nothing : ArrowSchema(dict),
        s,
    )
end

"""
    ArrowSchema(s::CArrowSchema)

Decode a C struct passed by value; this is the method used for children and
dictionaries. The result's `carrowschema` holds a copy of `s` rather than an
uninitialised `Ref`, so reading it never exposes indeterminate memory.

NOTE(review): the copy still carries the producer's `release` pointer. Nothing in this
file calls it for nested schemas - only the top-level struct obtained through
`getschema` is released - confirm that stays true for any new caller.
"""
ArrowSchema(s::CArrowSchema) = ArrowSchema(Ref(s))

"""
    getschema(f) -> ArrowSchema

Allocate a zero-initialised `CArrowSchema`, call `f(ptr)` with a
`Ptr{CArrowSchema}` to it so that a producer can populate it, and return the decoded
`ArrowSchema`. Intended for `do`-block use:

    sch = getschema() do ptr
        export_schema_to(ptr)
    end

A finalizer on the returned object invokes the producer's `release` callback (when it
is non-NULL) once the object is garbage collected.
"""
function getschema(f)
    # Start from an all-NULL struct so that a callback which writes nothing leaves
    # well-defined contents behind instead of whatever happened to be in memory.
    schref = Ref(CArrowSchema(C_NULL, C_NULL, C_NULL, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL))
    # The raw pointer is only valid while `schref` is rooted.
    GC.@preserve schref f(Base.unsafe_convert(Ptr{CArrowSchema}, schref))
    sch = ArrowSchema(schref)
    finalizer(sch) do x
        r = getfield(x.carrowschema[], :release)
        if r != C_NULL
            ccall(r, Cvoid, (Ptr{CArrowSchema},), x.carrowschema)
        end
    end
    return sch
end

"""
    CArrowArray

Bits-type mirror of the `ArrowArray` struct from the Arrow C data interface. Field
order and types follow the C layout; pointer fields refer to producer-owned memory.
"""
struct CArrowArray
    length::Int64                     # logical length
    null_count::Int64                 # number of null items
    offset::Int64                     # logical offset into the buffers
    n_buffers::Int64                  # number of entries in `buffers`
    n_children::Int64                 # number of entries in `children`
    buffers::Ptr{Ptr{UInt8}}          # C array of buffer pointers
    children::Ptr{Ptr{CArrowArray}}   # C array of pointers to the child arrays
    dictionary::Ptr{CArrowArray}      # dictionary values, or NULL
    release::Ptr{Cvoid}               # producer-provided release callback
    private_data::Ptr{Cvoid}          # opaque producer state
end

"""
    CArrowArray()

Return a `CArrowArray` whose integer fields are zero and whose pointer fields are
`C_NULL`.
"""
function CArrowArray()
    return CArrowArray(0, 0, 0, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL, C_NULL)
end

# `release` and `private_data` are left out; they are reachable only via `getfield`.
function Base.propertynames(::CArrowArray)
    return (
        :length,
        :null_count,
        :offset,
        :n_buffers,
        :n_children,
        :buffers,
        :children,
        :dictionary,
    )
end

"""
    getproperty(x::CArrowArray, nm::Symbol)

Decode field `nm` of the raw C struct into a Julia value:

- `:length`, `:null_count`, `:offset`, `:n_buffers`, `:n_children` -> `Int64`
- `:buffers` -> `Vector{Ptr{UInt8}}` wrapping (not copying) the producer's pointer array
- `:children` -> `Vector{CArrowArray}` holding a copy of each child struct
- `:dictionary` -> `CArrowArray` or `nothing`

Any other name (including `:release` and `:private_data`) raises an error; use
`getfield` to read the raw fields.
"""
function Base.getproperty(x::CArrowArray, nm::Symbol)
    if nm === :length
        return getfield(x, :length)
    elseif nm === :null_count
        return getfield(x, :null_count)
    elseif nm === :offset
        return getfield(x, :offset)
    elseif nm === :n_buffers
        return getfield(x, :n_buffers)
    elseif nm === :n_children
        return getfield(x, :n_children)
    elseif nm === :buffers
        b = getfield(x, :buffers)
        # The wrapped vector aliases producer memory; it is valid only while the
        # producer keeps the array alive.
        return b == C_NULL ? Ptr{UInt8}[] : unsafe_wrap(Array, b, getfield(x, :n_buffers))
    elseif nm === :children
        c = getfield(x, :children)
        n = getfield(x, :n_children)
        (c == C_NULL || n <= 0) && return CArrowArray[]
        # `children` is a C array of `n` pointers; each child lives at its own address,
        # so every pointer has to be dereferenced individually rather than assuming the
        # child structs are laid out contiguously after the first one.
        return CArrowArray[unsafe_load(unsafe_load(c, i)) for i in 1:n]
    elseif nm === :dictionary
        d = getfield(x, :dictionary)
        return d == C_NULL ? nothing : unsafe_load(d)
    end
    error("unknown property requested: $nm")
end

"""
    ArrowArray

Julia-side view of a `CArrowArray` with the scalar fields copied and the nested
children/dictionary converted recursively. `carrowarray` keeps a reference to the C
struct the view was built from.
"""
mutable struct ArrowArray
    length::Int64                           # logical length
    null_count::Int64                       # number of null items
    offset::Int64                           # logical offset into the buffers
    n_buffers::Int64                        # number of buffers
    n_children::Int64                       # number of child arrays
    buffers::Vector{Ptr{UInt8}}             # raw buffer pointers
    children::Vector{ArrowArray}            # converted child arrays
    dictionary::Union{Nothing, ArrowArray}  # converted dictionary array, if any
    carrowarray::Ref{CArrowArray}           # backing C struct
end

"""
    ArrowArray(a::Ref{CArrowArray})

Decode the C struct referenced by `a` (recursively, including children and the
dictionary) and keep `a` itself in the `carrowarray` field.
"""
function ArrowArray(a::Ref{CArrowArray})
    c = a[]
    dict = c.dictionary
    return ArrowArray(
        c.length,
        c.null_count,
        c.offset,
        c.n_buffers,
        c.n_children,
        c.buffers,
        map(ArrowArray, c.children),
        dict === nothing ? nothing : ArrowArray(dict),
        a,
    )
end

"""
    ArrowArray(a::CArrowArray)

Decode a C struct passed by value; this is the method used for children and
dictionaries. The result's `carrowarray` holds a copy of `a` rather than an
uninitialised `Ref`, so reading it never exposes indeterminate memory.

NOTE(review): the copy still carries the producer's `release` pointer. Nothing in this
file calls it for nested arrays - only the top-level struct obtained through
`getarray` is released - confirm that stays true for any new caller.
"""
ArrowArray(a::CArrowArray) = ArrowArray(Ref(a))

"""
    getarray(f) -> ArrowArray

Allocate a zero-initialised `CArrowArray`, call `f(ptr)` with a `Ptr{CArrowArray}`
to it so that a producer can populate it, and return the decoded `ArrowArray`.
Intended for `do`-block use:

    arr = getarray() do ptr
        export_array_to(ptr)
    end

A finalizer on the returned object invokes the producer's `release` callback (when it
is non-NULL) once the object is garbage collected.
"""
function getarray(f)
    # Start from an all-zero struct so that a callback which writes nothing leaves
    # well-defined contents behind instead of whatever happened to be in memory.
    arrref = Ref(CArrowArray(0, 0, 0, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL, C_NULL))
    # The raw pointer is only valid while `arrref` is rooted.
    GC.@preserve arrref f(Base.unsafe_convert(Ptr{CArrowArray}, arrref))
    arr = ArrowArray(arrref)
    finalizer(arr) do x
        r = getfield(x.carrowarray[], :release)
        if r != C_NULL
            ccall(r, Cvoid, (Ptr{CArrowArray},), x.carrowarray)
        end
    end
    return arr
end

end # module

0 comments on commit 005c946

Please sign in to comment.