Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Clone and copy source structs internally #1449

Merged
merged 9 commits into from
Mar 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,30 @@ pub unsafe fn make_array_from_raw(
let data = ArrayData::try_from(array)?;
Ok(make_array(data))
}

/// Exports `src` through the C Data Interface into structs owned by the consumer.
///
/// An [ffi::FFI_ArrowArray] is built from the array's data and an
/// [ffi::FFI_ArrowSchema] is built from its data type; both are then written to
/// the locations behind `out_array` and `out_schema`. Typically the consumer of
/// the array data allocates those two structs and hands out the raw pointers.
///
/// # Safety
/// `out_array` and `out_schema` must be non-null and valid for writes of their
/// respective struct types (they do not need to be aligned). The previous
/// contents of both locations are overwritten without being dropped. The written
/// structs must afterwards be treated as valid C Data Interface structs, both in
/// memory representation and in lifetime via the `release` mechanism.
///
/// # Errors
/// Returns an error if the array's data type cannot be converted into an
/// [ffi::FFI_ArrowSchema]; in that case neither destination is written to.
pub unsafe fn export_array_into_raw(
    src: ArrayRef,
    out_array: *mut ffi::FFI_ArrowArray,
    out_schema: *mut ffi::FFI_ArrowSchema,
) -> Result<()> {
    let array_data = src.data();

    // Build both FFI structs up front: the schema conversion is the only
    // fallible step, and it runs before either destination is touched.
    let ffi_array = ffi::FFI_ArrowArray::new(array_data);
    let ffi_schema = ffi::FFI_ArrowSchema::try_from(array_data.data_type())?;

    // Move the freshly built structs into the consumer-provided memory.
    out_array.write_unaligned(ffi_array);
    out_schema.write_unaligned(ffi_schema);

    Ok(())
}

// Helper function for printing potentially long arrays.
pub(super) fn print_long_array<A, F>(
array: &A,
Expand Down
2 changes: 1 addition & 1 deletion arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ pub use self::cast::{

// ------------------------------ C Data Interface ---------------------------

pub use self::array::make_array_from_raw;
pub use self::array::{export_array_into_raw, make_array_from_raw};

#[cfg(test)]
mod tests {
Expand Down
94 changes: 82 additions & 12 deletions arrow/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@
//!
//! ```rust
//! # use std::sync::Arc;
//! # use arrow::array::{Int32Array, Array, ArrayData, make_array_from_raw};
//! # use arrow::array::{Int32Array, Array, ArrayData, export_array_into_raw, make_array, make_array_from_raw};
//! # use arrow::error::{Result, ArrowError};
//! # use arrow::compute::kernels::arithmetic;
//! # use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
//! # use std::convert::TryFrom;
//! # fn main() -> Result<()> {
//! // create an array natively
Expand All @@ -51,7 +52,35 @@
//! // verify
//! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)]));
//!
//! // Simulate if raw pointers are provided by consumer
//! let array = make_array(Int32Array::from(vec![Some(1), None, Some(3)]).data().clone());
//!
//! let out_array = Box::new(FFI_ArrowArray::empty());
//! let out_schema = Box::new(FFI_ArrowSchema::empty());
//! let out_array_ptr = Box::into_raw(out_array);
//! let out_schema_ptr = Box::into_raw(out_schema);
//!
//! // export array into raw pointers from consumer
//! unsafe { export_array_into_raw(array, out_array_ptr, out_schema_ptr)?; };
//!
//! // import it
//! let array = unsafe { make_array_from_raw(out_array_ptr, out_schema_ptr)? };
//!
//! // perform some operation
//! let array = array.as_any().downcast_ref::<Int32Array>().ok_or(
//! ArrowError::ParseError("Expects an int32".to_string()),
//! )?;
//! let array = arithmetic::add(&array, &array)?;
//!
//! // verify
//! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)]));
//!
//! // (drop/release)
//! unsafe {
//! Box::from_raw(out_array_ptr);
//! Box::from_raw(out_schema_ptr);
//! }
//!
//! Ok(())
//! }
//! ```
Expand Down Expand Up @@ -107,7 +136,7 @@ bitflags! {
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
/// This was created by bindgen
#[repr(C)]
#[derive(Debug, Clone)]
#[derive(Debug)]
pub struct FFI_ArrowSchema {
format: *const c_char,
name: *const c_char,
Expand Down Expand Up @@ -336,7 +365,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
/// This was created by bindgen
#[repr(C)]
#[derive(Debug, Clone)]
#[derive(Debug)]
pub struct FFI_ArrowArray {
pub(crate) length: i64,
pub(crate) null_count: i64,
Expand Down Expand Up @@ -396,7 +425,7 @@ impl FFI_ArrowArray {
/// # Safety
/// This method releases `buffers`. Consumers of this struct *must* call `release` before
/// releasing this struct, or contents in `buffers` leak.
fn new(data: &ArrayData) -> Self {
pub(crate) fn new(data: &ArrayData) -> Self {
// * insert the null buffer at the start
// * make all others `Option<Buffer>`.
let buffers = iter::once(data.null_buffer().cloned())
Expand Down Expand Up @@ -769,6 +798,9 @@ impl ArrowArray {
/// creates a new [ArrowArray] from two pointers. Used to import from the C Data Interface.
/// # Safety
/// See safety of [ArrowArray]
/// Note that this function moves the content pointed to by the raw pointers into the
/// returned [ArrowArray] and leaves empty structs behind. Because the raw pointers may come
/// from `Arc::into_raw` or from any other allocation, callers remain responsible for
/// managing (and eventually freeing) the allocation of the structs themselves.
/// # Error
/// Errors if any of the pointers is null
pub unsafe fn try_from_raw(
Expand All @@ -781,11 +813,16 @@ impl ArrowArray {
.to_string(),
));
};
let ffi_array = (*array).clone();
let ffi_schema = (*schema).clone();

let array_mut = array as *mut FFI_ArrowArray;
let schema_mut = schema as *mut FFI_ArrowSchema;

let array_data = std::ptr::replace(array_mut, FFI_ArrowArray::empty());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, after thinking more about this, it seems this won't address the original problem either. It basically just calls drop on FFI_ArrowArray (which is empty), but doesn't free the memory pointed to by array and schema.

  +-------+
  | array |
  +-------+             +----------------------------+
     |                  |                            |
     +----------------->|      FFI_ArrowArray        | <- memory leaked
                        |                            |
                        +----------------------------+

For instance, if array and schema are from Arc::into_raw, then the memory allocated for the Arc will become dangling after this, and thus memory leak.

I'm thinking whether we'll need two APIs, one where we are able to take the ownership of the memory allocated for the array and schema (e.g., exported by Arc::into_raw from Rust itself), and one where we cannot take the ownership (e.g., memory was allocated by other languages such as Java), and thus requires the exporter to free the memory by itself later.

For the latter, we can clone the content for FFI_ArrowArray and FFI_ArrowSchema, and set the content of the original array and schema to be FFI_ArrowArray::empty() and FFI_ArrowSchema::empty() so that the exporter can just safely free the memory later.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For instance, if array and schema are from Arc::into_raw, then the memory allocated for the Arc will become dangling after this, and thus memory leak.

Currently, if users export using into_raw and don't import using from_raw (which we can assume is the normal case, since they export data to be used by others and don't need to import it again), they might have a memory leak.

After checking the C++ import implementation, I think this change is fine. We can even remove the two drop_in_place calls, as they seem unnecessary.

What we need is to redesign the ArrowArray::into_raw(), we can't use Arc::into_raw in the implementation

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea the drop_in_place here seems unnecessary.

I'm not sure if it's possible to redesign ArrowArray::into_raw though, since after exporting the array, we need to free up the memory allocated for FFI_ArrowArray. However this can only be done after the exported array is imported via FFI_ArrowArray::try_from_raw, which we don't know when.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is correct that with one single API, we cannot deal with both cases: raw pointers from Arc and not from Arc.

I'm not sure having two separate APIs is a good idea. With a single API, we can ask users to take care of releasing the raw pointers (whether they come from an Arc or not) themselves.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However this can only be done after the exported array is imported via FFI_ArrowArray::try_from_raw, which we don't know when.

  1. It may not be imported in rust via FFI_ArrowArray::try_from_raw, it can be imported by other language sdk
  2. We don't need to know, the user should import it somewhere or free them if needed, that's why we can't use Arc::into_raw because we don't know how user might use them. This API should be fired and done, shouldn't expect user always do something like try_from_raw

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

another one allowing importer to allocate memory for exporter

If Rust side is importer, we already have it as we can do it now by creating empty structs, passing raw pointers to exporter.

If Java side is importer, we may need an export API which takes raw pointers from Java and replaces its content.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, agreed. Do you plan to add the export API in this PR, or separately?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be straightforward to add, let me add it here. Thanks.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a new API export_into_raw. Please check it. Thanks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm trying to catch up with these discussions since I'll soon need to create Buffers from foreign memory. The cast to *mut followed by std::ptr::replace here doesn't look safe to me. When the pointer comes from an Arc, that seems to violate Rust's unique ownership rules.

let schema_data = std::ptr::replace(schema_mut, FFI_ArrowSchema::empty());

Ok(Self {
array: Arc::new(ffi_array),
schema: Arc::new(ffi_schema),
array: Arc::new(array_data),
schema: Arc::new(schema_data),
})
}

Expand Down Expand Up @@ -822,10 +859,10 @@ impl<'a> ArrowArrayChild<'a> {
mod tests {
use super::*;
use crate::array::{
make_array, Array, ArrayData, BinaryOffsetSizeTrait, BooleanArray, DecimalArray,
DictionaryArray, GenericBinaryArray, GenericListArray, GenericStringArray,
Int32Array, OffsetSizeTrait, StringOffsetSizeTrait, Time32MillisecondArray,
TimestampMillisecondArray,
export_array_into_raw, make_array, Array, ArrayData, BinaryOffsetSizeTrait,
BooleanArray, DecimalArray, DictionaryArray, GenericBinaryArray,
GenericListArray, GenericStringArray, Int32Array, OffsetSizeTrait,
StringOffsetSizeTrait, Time32MillisecondArray, TimestampMillisecondArray,
};
use crate::compute::kernels;
use crate::datatypes::{Field, Int8Type};
Expand Down Expand Up @@ -1164,4 +1201,37 @@ mod tests {
// (drop/release)
Ok(())
}

/// Round-trips an `Int32Array` through consumer-provided FFI structs:
/// export with `export_array_into_raw`, import with `ArrowArray::try_from_raw`,
/// then check that the imported data is usable by a compute kernel.
#[test]
fn test_export_array_into_raw() -> Result<()> {
    let array = make_array(Int32Array::from(vec![1, 2, 3]).data().clone());

    // Assume two raw pointers provided by the consumer: the consumer allocates
    // empty FFI structs and hands out pointers to them.
    let out_array_ptr = Box::into_raw(Box::new(FFI_ArrowArray::empty()));
    let out_schema_ptr = Box::into_raw(Box::new(FFI_ArrowSchema::empty()));

    // SAFETY: both pointers were just produced by `Box::into_raw`, so they are
    // non-null and point to live, writable structs of the right type.
    unsafe {
        export_array_into_raw(array, out_array_ptr, out_schema_ptr)?;
    }

    // (simulate consumer) import it
    // SAFETY: the pointers are still live and now hold the exported structs.
    let imported = unsafe { ArrowArray::try_from_raw(out_array_ptr, out_schema_ptr)? };
    let data = ArrayData::try_from(imported)?;
    let array = make_array(data);

    // perform some operation
    let array = array.as_any().downcast_ref::<Int32Array>().unwrap();
    let array = kernels::arithmetic::add(array, array)?;

    // verify
    assert_eq!(array, Int32Array::from(vec![2, 4, 6]));

    // Reclaim the consumer-side allocations. `Box::from_raw` is `#[must_use]`,
    // so drop the boxes explicitly instead of discarding the expression value.
    // SAFETY: each pointer came from `Box::into_raw` above and is reclaimed
    // exactly once.
    unsafe {
        drop(Box::from_raw(out_array_ptr));
        drop(Box::from_raw(out_schema_ptr));
    }
    Ok(())
}
}