Skip to content

Commit

Permalink
Move FFI to sub-crates (#3687)
Browse files Browse the repository at this point in the history
* Move FFI to sub-crates

* Use ptr::write instead of drop_in_place

* Add inline

* Avoid unnecessary clone

* Format

* Clippy

* Remove pub(crate)
  • Loading branch information
tustvold committed Feb 10, 2023
1 parent 560ebaa commit 3761ac5
Show file tree
Hide file tree
Showing 10 changed files with 1,058 additions and 997 deletions.
5 changes: 5 additions & 0 deletions arrow-data/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ bench = false
# this is not enabled by default as it is too computationally expensive
# but is run as part of our CI checks
force_validate = []
# Enable ffi support
ffi = ["arrow-schema/ffi"]

[package.metadata.docs.rs]
features = ["ffi"]

[dependencies]

Expand Down
285 changes: 285 additions & 0 deletions arrow-data/src/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).

use crate::{layout, ArrayData};
use arrow_buffer::Buffer;
use arrow_schema::DataType;
use std::ffi::c_void;

/// ABI-compatible struct for ArrowArray from C Data Interface
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
///
/// ```
/// # use arrow_data::ArrayData;
/// # use arrow_data::ffi::FFI_ArrowArray;
/// fn export_array(array: &ArrayData) -> FFI_ArrowArray {
/// FFI_ArrowArray::new(array)
/// }
/// ```
#[repr(C)]
#[derive(Debug)]
pub struct FFI_ArrowArray {
length: i64,
null_count: i64,
offset: i64,
n_buffers: i64,
n_children: i64,
buffers: *mut *const c_void,
children: *mut *mut FFI_ArrowArray,
dictionary: *mut FFI_ArrowArray,
release: Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowArray)>,
// When exported, this MUST contain everything that is owned by this array.
// for example, any buffer pointed to in `buffers` must be here, as well
// as the `buffers` pointer itself.
// In other words, everything in [FFI_ArrowArray] must be owned by
// `private_data` and can assume that they do not outlive `private_data`.
private_data: *mut c_void,
}

impl Drop for FFI_ArrowArray {
fn drop(&mut self) {
match self.release {
None => (),
Some(release) => unsafe { release(self) },
};
}
}

unsafe impl Send for FFI_ArrowArray {}
unsafe impl Sync for FFI_ArrowArray {}

// callback used to drop [FFI_ArrowArray] when it is exported
unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) {
if array.is_null() {
return;
}
let array = &mut *array;

// take ownership of `private_data`, therefore dropping it`
let private = Box::from_raw(array.private_data as *mut ArrayPrivateData);
for child in private.children.iter() {
let _ = Box::from_raw(*child);
}
if !private.dictionary.is_null() {
let _ = Box::from_raw(private.dictionary);
}

array.release = None;
}

struct ArrayPrivateData {
#[allow(dead_code)]
buffers: Vec<Option<Buffer>>,
buffers_ptr: Box<[*const c_void]>,
children: Box<[*mut FFI_ArrowArray]>,
dictionary: *mut FFI_ArrowArray,
}

impl FFI_ArrowArray {
/// creates a new `FFI_ArrowArray` from existing data.
/// # Memory Leaks
/// This method releases `buffers`. Consumers of this struct *must* call `release` before
/// releasing this struct, or contents in `buffers` leak.
pub fn new(data: &ArrayData) -> Self {
let data_layout = layout(data.data_type());

let buffers = if data_layout.can_contain_null_mask {
// * insert the null buffer at the start
// * make all others `Option<Buffer>`.
std::iter::once(data.null_buffer().cloned())
.chain(data.buffers().iter().map(|b| Some(b.clone())))
.collect::<Vec<_>>()
} else {
data.buffers().iter().map(|b| Some(b.clone())).collect()
};

// `n_buffers` is the number of buffers by the spec.
let n_buffers = {
data_layout.buffers.len() + {
// If the layout has a null buffer by Arrow spec.
// Note that even the array doesn't have a null buffer because it has
// no null value, we still need to count 1 here to follow the spec.
usize::from(data_layout.can_contain_null_mask)
}
} as i64;

let buffers_ptr = buffers
.iter()
.flat_map(|maybe_buffer| match maybe_buffer {
// note that `raw_data` takes into account the buffer's offset
Some(b) => Some(b.as_ptr() as *const c_void),
// This is for null buffer. We only put a null pointer for
// null buffer if by spec it can contain null mask.
None if data_layout.can_contain_null_mask => Some(std::ptr::null()),
None => None,
})
.collect::<Box<[_]>>();

let empty = vec![];
let (child_data, dictionary) = match data.data_type() {
DataType::Dictionary(_, _) => (
empty.as_slice(),
Box::into_raw(Box::new(FFI_ArrowArray::new(&data.child_data()[0]))),
),
_ => (data.child_data(), std::ptr::null_mut()),
};

let children = child_data
.iter()
.map(|child| Box::into_raw(Box::new(FFI_ArrowArray::new(child))))
.collect::<Box<_>>();
let n_children = children.len() as i64;

// create the private data owning everything.
// any other data must be added here, e.g. via a struct, to track lifetime.
let mut private_data = Box::new(ArrayPrivateData {
buffers,
buffers_ptr,
children,
dictionary,
});

Self {
length: data.len() as i64,
null_count: data.null_count() as i64,
offset: data.offset() as i64,
n_buffers,
n_children,
buffers: private_data.buffers_ptr.as_mut_ptr(),
children: private_data.children.as_mut_ptr(),
dictionary,
release: Some(release_array),
private_data: Box::into_raw(private_data) as *mut c_void,
}
}

/// create an empty `FFI_ArrowArray`, which can be used to import data into
pub fn empty() -> Self {
Self {
length: 0,
null_count: 0,
offset: 0,
n_buffers: 0,
n_children: 0,
buffers: std::ptr::null_mut(),
children: std::ptr::null_mut(),
dictionary: std::ptr::null_mut(),
release: None,
private_data: std::ptr::null_mut(),
}
}

/// the length of the array
#[inline]
pub fn len(&self) -> usize {
self.length as usize
}

/// whether the array is empty
#[inline]
pub fn is_empty(&self) -> bool {
self.length == 0
}

/// Whether the array has been released
#[inline]
pub fn is_released(&self) -> bool {
self.release.is_none()
}

/// the offset of the array
#[inline]
pub fn offset(&self) -> usize {
self.offset as usize
}

/// the null count of the array
#[inline]
pub fn null_count(&self) -> usize {
self.null_count as usize
}

/// Returns the buffer at the provided index
///
/// # Panic
/// Panics if index exceeds the number of buffers or the buffer is not correctly aligned
#[inline]
pub fn buffer(&self, index: usize) -> *const u8 {
assert!(!self.buffers.is_null());
assert!(index < self.num_buffers());
// SAFETY:
// If buffers is not null must be valid for reads up to num_buffers
unsafe { std::ptr::read_unaligned((self.buffers as *mut *const u8).add(index)) }
}

/// Returns the number of buffers
#[inline]
pub fn num_buffers(&self) -> usize {
self.n_buffers as _
}

/// Returns the child at the provided index
#[inline]
pub fn child(&self, index: usize) -> &FFI_ArrowArray {
assert!(!self.children.is_null());
assert!(index < self.num_children());
// Safety:
// If children is not null must be valid for reads up to num_children
unsafe {
let child = std::ptr::read_unaligned(self.children.add(index));
child.as_ref().unwrap()
}
}

/// Returns the number of children
#[inline]
pub fn num_children(&self) -> usize {
self.n_children as _
}

/// Returns the dictionary if any
#[inline]
pub fn dictionary(&self) -> Option<&Self> {
// Safety:
// If dictionary is not null should be valid for reads of `Self`
unsafe { self.dictionary.as_ref() }
}
}

#[cfg(test)]
mod tests {
use super::*;

// More tests located in top-level arrow crate

#[test]
fn null_array_n_buffers() {
let data = ArrayData::new_null(&DataType::Null, 10);

let ffi_array = FFI_ArrowArray::new(&data);
assert_eq!(0, ffi_array.n_buffers);

let private_data =
unsafe { Box::from_raw(ffi_array.private_data as *mut ArrayPrivateData) };

assert_eq!(0, private_data.buffers_ptr.len());

Box::into_raw(private_data);
}
}
3 changes: 3 additions & 0 deletions arrow-data/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,6 @@ pub mod transform;
pub mod bit_iterator;
pub mod bit_mask;
pub mod decimal;

#[cfg(feature = "ffi")]
pub mod ffi;
7 changes: 6 additions & 1 deletion arrow-schema/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,14 @@ bench = false

[dependencies]
serde = { version = "1.0", default-features = false, features = ["derive", "std"], optional = true }
bitflags = { version = "1.2.1", default-features = false, optional = true }

[features]
default = []
# Enable ffi support
ffi = ["bitflags"]

[package.metadata.docs.rs]
features = ["ffi"]

[dev-dependencies]
serde_json = "1.0"
Expand Down
Loading

0 comments on commit 3761ac5

Please sign in to comment.