diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 502afc73766..ece00c286ea 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -173,13 +173,15 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
 class NumPyConverter {
  public:
   NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
-                 const std::shared_ptr<DataType>& type, bool from_pandas)
+                 const std::shared_ptr<DataType>& type, bool from_pandas,
+                 const compute::CastOptions& cast_options = compute::CastOptions())
       : pool_(pool),
         type_(type),
         arr_(reinterpret_cast<PyArrayObject*>(arr)),
         dtype_(PyArray_DESCR(arr_)),
         mask_(nullptr),
         from_pandas_(from_pandas),
+        cast_options_(cast_options),
         null_bitmap_data_(nullptr),
         null_count_(0) {
     if (mo != nullptr && mo != Py_None) {
@@ -289,6 +291,7 @@ class NumPyConverter {
   int itemsize_;
 
   bool from_pandas_;
+  compute::CastOptions cast_options_;
 
   // Used in visitor pattern
   ArrayVector out_arrays_;
@@ -319,7 +322,8 @@ namespace {
 Status CastBuffer(const std::shared_ptr<DataType>& in_type,
                   const std::shared_ptr<Buffer>& input, const int64_t length,
                   const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
-                  const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
+                  const std::shared_ptr<DataType>& out_type,
+                  const compute::CastOptions& cast_options, MemoryPool* pool,
                   std::shared_ptr<Buffer>* out) {
   // Must cast
   auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
@@ -328,9 +332,6 @@ Status CastBuffer(const std::shared_ptr<DataType>& in_type,
   std::shared_ptr<Array> casted_array;
 
   compute::FunctionContext context(pool);
-  compute::CastOptions cast_options;
-  cast_options.allow_int_overflow = false;
-  cast_options.allow_time_truncate = false;
 
   RETURN_NOT_OK(
       compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array));
@@ -412,7 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
   RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
 
   if (!input_type->Equals(*type_)) {
-    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_,
+                             pool_, data));
   }
 
   return Status::OK();
@@ -465,14 +467,14 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* d
       if (!input_type->Equals(*type_)) {
         // The null bitmap was already computed in VisitNative()
         RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
-                                 type_, pool_, data));
+                                 type_, cast_options_, pool_, data));
       }
     }
   } else {
     RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
     if (!input_type->Equals(*type_)) {
-      RETURN_NOT_OK(
-          CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
+                               cast_options_, pool_, data));
     }
   }
@@ -512,14 +514,14 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* d
       if (!input_type->Equals(*type_)) {
         // The null bitmap was already computed in VisitNative()
         RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
-                                 type_, pool_, data));
+                                 type_, cast_options_, pool_, data));
       }
     }
   } else {
     RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
     if (!input_type->Equals(*type_)) {
-      RETURN_NOT_OK(
-          CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
+                               cast_options_, pool_, data));
     }
   }
@@ -770,6 +772,7 @@ Status NumPyConverter::Visit(const StructType& type) {
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                       const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
                       std::shared_ptr<ChunkedArray>* out) {
   if (!PyArray_Check(ao)) {
     return Status::Invalid("Input object was not a NumPy array");
   }
@@ -784,7 +787,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
     return ConvertPySequence(ao, mo, py_options, out);
   }
 
-  NumPyConverter converter(pool, ao, mo, type, from_pandas);
+  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
   RETURN_NOT_OK(converter.Convert());
   const auto& output_arrays = converter.result();
   DCHECK_GT(output_arrays.size(), 0);
@@ -792,5 +795,11 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
   return Status::OK();
 }
 
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out) {
+  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
+}
+
 }  // namespace py
 }  // namespace arrow
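The options threaded through CastBuffer above are the ordinary compute::Cast options, so the two flags are easiest to see through the existing Array.cast binding, which already exposes them as a safe flag. A rough Python sketch of what they govern; the values are illustrative and not taken from this patch:

    import pyarrow as pa

    arr = pa.array([0, 1, 2**31])  # inferred as int64
    # Default options (allow_int_overflow=False): the lossy cast is rejected,
    # so arr.cast(pa.int32()) raises an error.
    # safe=False maps to allow_int_overflow=True and the out-of-range value
    # wraps around instead:
    wrapped = arr.cast(pa.int32(), safe=False)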
diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h
index bbdd5764fd0..5e1c088264a 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.h
+++ b/cpp/src/arrow/python/numpy_to_arrow.h
@@ -24,6 +24,7 @@
 
 #include <memory>
 
+#include "arrow/compute/kernels/cast.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -45,6 +46,23 @@ namespace py {
 /// \param[in] from_pandas If true, use pandas's null sentinels to determine
 ///  whether values are null
 /// \param[in] type a specific type to cast to, may be null
+/// \param[in] cast_options casting options
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
+                      std::shared_ptr<ChunkedArray>* out);
+
+/// Safely convert NumPy arrays to Arrow. If target data type is not known,
+/// pass a type with null.
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+///  whether values are null
+/// \param[in] type a specific type to cast to, may be null
 /// \param[out] out a ChunkedArray, to accommodate chunked output
 ARROW_EXPORT
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 76a639abfcb..f9a16a334c5 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -17,8 +17,7 @@
 
 
 cdef _sequence_to_array(object sequence, object mask, object size,
-                        DataType type,
-                        CMemoryPool* pool, c_bool from_pandas):
+                        DataType type, CMemoryPool* pool, c_bool from_pandas):
     cdef int64_t c_size
     cdef PyConversionOptions options
 
@@ -50,10 +49,14 @@ cdef _is_array_like(obj):
 
 
 cdef _ndarray_to_array(object values, object mask, DataType type,
-                       c_bool from_pandas,
-                       CMemoryPool* pool):
-    cdef shared_ptr[CChunkedArray] chunked_out
-    cdef shared_ptr[CDataType] c_type
+                       c_bool from_pandas, c_bool safe, CMemoryPool* pool):
+    cdef:
+        shared_ptr[CChunkedArray] chunked_out
+        shared_ptr[CDataType] c_type
+        CCastOptions cast_options
+
+    cast_options.allow_int_overflow = not safe
+    cast_options.allow_time_truncate = not safe
 
     dtype = values.dtype
 
@@ -66,7 +69,7 @@
 
     with nogil:
         check_status(NdarrayToArrow(pool, values, mask, from_pandas,
-                                    c_type, &chunked_out))
+                                    c_type, cast_options, &chunked_out))
 
     if chunked_out.get().num_chunks() > 1:
         return pyarrow_wrap_chunked_array(chunked_out)
@@ -83,9 +86,8 @@ cdef inline DataType _ensure_type(object type):
         return type
 
 
-def array(object obj, type=None, mask=None,
-          MemoryPool memory_pool=None, size=None,
-          from_pandas=False):
+def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
+          bint safe=True, MemoryPool memory_pool=None):
     """
     Create pyarrow.Array instance from a Python object
 
@@ -94,14 +96,11 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
     obj : sequence, iterable, ndarray or Series
         If both type and size are specified, may be a single-use iterable. If
         not strongly-typed, Arrow type will be inferred for resulting array
-    mask : array (boolean), optional
-        Indicate which values are null (True) or not null (False).
     type : pyarrow.DataType
         Explicit type to attempt to coerce to, otherwise will be inferred
         from the data
-    memory_pool : pyarrow.MemoryPool, optional
-        If not passed, will allocate memory from the currently-set default
-        memory pool
+    mask : array (boolean), optional
+        Indicate which values are null (True) or not null (False).
     size : int64, optional
         Size of the elements. If the input is larger than size, bail at this
         length. For iterators, if size is larger than the input iterator this
@@ -113,6 +112,11 @@
         data. If passed, the mask takes precedence, but a value that is
         unmasked (not-null) yet null according to pandas semantics is
         still treated as null
+    safe : boolean, default True
+        Check for overflows or other unsafe conversions
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the currently-set default
+        memory pool
 
     Notes
     -----
@@ -158,13 +162,15 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
             return DictionaryArray.from_arrays(
                 values.codes, values.categories.values,
                 mask=mask, ordered=values.ordered,
-                from_pandas=from_pandas,
+                from_pandas=from_pandas, safe=safe,
                 memory_pool=memory_pool)
         else:
             values, type = pdcompat.get_datetimetz_type(values, obj.dtype,
                                                         type)
-            return _ndarray_to_array(values, mask, type, from_pandas, pool)
+            return _ndarray_to_array(values, mask, type, from_pandas, safe,
+                                     pool)
     else:
+        # ConvertPySequence does strict conversion if type is explicitly passed
         return _sequence_to_array(obj, mask, size, type, pool, from_pandas)
@@ -352,7 +358,7 @@ cdef class Array:
         with nogil:
             check_status(DebugPrint(deref(self.ap), 0))
 
-    def cast(self, object target_type, safe=True):
+    def cast(self, object target_type, bint safe=True):
         """
         Cast array values to another data type.
 
@@ -439,7 +445,8 @@ cdef class Array:
         return wrap_datum(out)
 
     @staticmethod
-    def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None):
+    def from_pandas(obj, mask=None, type=None, bint safe=True,
+                    MemoryPool memory_pool=None):
         """
         Convert pandas.Series to an Arrow Array, using pandas's semantics
         about what values indicate nulls. See pyarrow.array for more general
@@ -453,6 +460,8 @@ cdef class Array:
         type : pyarrow.DataType
             Explicit type to attempt to coerce to, otherwise will be inferred
             from the data
+        safe : boolean, default True
+            Check for overflows or other unsafe conversions
         memory_pool : pyarrow.MemoryPool, optional
             If not passed, will allocate memory from the currently-set default
             memory pool
@@ -468,8 +477,8 @@ cdef class Array:
         array : pyarrow.Array or pyarrow.ChunkedArray (if object data
         overflows binary buffer)
         """
-        return array(obj, mask=mask, type=type, memory_pool=memory_pool,
-                     from_pandas=True)
+        return array(obj, mask=mask, type=type, safe=safe, from_pandas=True,
+                     memory_pool=memory_pool)
 
     def __reduce__(self):
         return _restore_array, \
@@ -597,9 +606,8 @@ cdef class Array:
 
         return pyarrow_wrap_array(result)
 
-    def to_pandas(self, c_bool strings_to_categorical=False,
-                  c_bool zero_copy_only=False,
-                  c_bool integer_object_nulls=False):
+    def to_pandas(self, bint strings_to_categorical=False,
+                  bint zero_copy_only=False, bint integer_object_nulls=False):
         """
         Convert to a NumPy array object suitable for use in pandas.
 
@@ -1051,8 +1059,8 @@ cdef class DictionaryArray(Array):
         return self._indices
 
     @staticmethod
-    def from_arrays(indices, dictionary, mask=None, ordered=False,
-                    from_pandas=False, safe=True,
+    def from_arrays(indices, dictionary, mask=None, bint ordered=False,
+                    bint from_pandas=False, bint safe=True,
                     MemoryPool memory_pool=None):
        """
        Construct Arrow DictionaryArray from array of indices (must be
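A note on the Python-level behaviour above: the new safe keyword only affects input that reaches NdarrayToArrow, i.e. ndarray and pandas values; plain Python sequences keep the strict ConvertPySequence path, per the comment added in array(). A minimal sketch, with illustrative values and assuming a build that includes this patch:

    import numpy as np
    import pyarrow as pa

    ints = np.array([1, 2, 1000], dtype='int64')
    # ndarray input honours `safe`; the default rejects the lossy conversion,
    # so pa.array(ints, type=pa.int8()) raises.
    unsafe = pa.array(ints, type=pa.int8(), safe=False)  # overflow permitted

    # A list with an explicit type is converted strictly on the sequence
    # path regardless of `safe`.
    strict = pa.array([1, 2, 3], type=pa.int8())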
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 7fe354dd894..8bbbfcfd661 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -968,6 +968,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
                            const shared_ptr[CDataType]& type,
                            shared_ptr[CChunkedArray]* out)
 
+    CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
+                           c_bool from_pandas,
+                           const shared_ptr[CDataType]& type,
+                           const CCastOptions& cast_options,
+                           shared_ptr[CChunkedArray]* out)
+
     CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
                             shared_ptr[CTensor]* out)
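The declaration above is what the Cython layer calls into; the tests below exercise the whole path end to end. In short, mirroring the timestamp case from ARROW-1949 and assuming a build that includes this patch:

    import pandas as pd
    import pyarrow as pa

    series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)])

    # Casting ns -> us would truncate, so the safe default raises ValueError:
    #     pa.Array.from_pandas(series, type=pa.timestamp('us'))

    # safe=False enables allow_time_truncate; the sub-microsecond part is
    # dropped:
    arr = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)
    assert arr.equals(pa.array([0, 0, 1], type=pa.timestamp('us')))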
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 4bebe3139a9..d4b582e06c4 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -479,13 +479,18 @@ def test_string_from_buffers():
 
 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case
+    expected = pa.array(out_data, type=out_type)
 
+    # check casting an already created array
     in_arr = pa.array(in_data, type=in_type)
-
     casted = in_arr.cast(out_type, safe=safe)
-    expected = pa.array(out_data, type=out_type)
     assert casted.equals(expected)
 
+    # constructing an array with the out type, which optionally involves
+    # casting; for more see ARROW-1949
+    in_arr = pa.array(in_data, type=out_type, safe=safe)
+    assert in_arr.equals(expected)
+
 
 def test_cast_integers_safe():
     safe_cases = [
@@ -573,6 +578,22 @@ def test_cast_timestamp_unit():
     result = arr.cast(target, safe=False)
     assert result.equals(expected)
 
+    # ARROW-1949
+    series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)])
+    expected = pa.array([0, 0, 1], type=pa.timestamp('us'))
+
+    with pytest.raises(ValueError):
+        pa.array(series, type=pa.timestamp('us'))
+
+    with pytest.raises(ValueError):
+        pa.Array.from_pandas(series, type=pa.timestamp('us'))
+
+    result = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)
+    assert result.equals(expected)
+
+    result = pa.array(series, type=pa.timestamp('us'), safe=False)
+    assert result.equals(expected)
+
 
 def test_cast_signed_to_unsigned():
     safe_cases = [