From dd8871e8fba02e2154c04eb56ee57e8c20f52ef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 29 Aug 2018 19:52:26 +0200 Subject: [PATCH 1/6] wire CastOptions through the API --- cpp/src/arrow/python/numpy_to_arrow.cc | 35 ++++++++++++++------- cpp/src/arrow/python/numpy_to_arrow.h | 18 +++++++++++ python/pyarrow/array.pxi | 43 +++++++++++++++++--------- python/pyarrow/includes/libarrow.pxd | 6 ++++ python/pyarrow/tests/test_array.py | 7 +++-- 5 files changed, 81 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 502afc73766..853bbe3d522 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -173,13 +173,15 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { class NumPyConverter { public: NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo, - const std::shared_ptr& type, bool from_pandas) + const std::shared_ptr& type, bool from_pandas, + const compute::CastOptions& cast_options=compute::CastOptions()) : pool_(pool), type_(type), arr_(reinterpret_cast(arr)), dtype_(PyArray_DESCR(arr_)), mask_(nullptr), from_pandas_(from_pandas), + cast_options_(cast_options), null_bitmap_data_(nullptr), null_count_(0) { if (mo != nullptr && mo != Py_None) { @@ -289,6 +291,7 @@ class NumPyConverter { int itemsize_; bool from_pandas_; + compute::CastOptions cast_options_; // Used in visitor pattern ArrayVector out_arrays_; @@ -319,8 +322,9 @@ namespace { Status CastBuffer(const std::shared_ptr& in_type, const std::shared_ptr& input, const int64_t length, const std::shared_ptr& valid_bitmap, const int64_t null_count, - const std::shared_ptr& out_type, MemoryPool* pool, - std::shared_ptr* out) { + const std::shared_ptr& out_type, + const compute::CastOptions& cast_options, + MemoryPool* pool, std::shared_ptr* out) { // Must cast auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count); @@ -328,9 +332,9 @@ Status CastBuffer(const std::shared_ptr& in_type, std::shared_ptr casted_array; compute::FunctionContext context(pool); - compute::CastOptions cast_options; - cast_options.allow_int_overflow = false; - cast_options.allow_time_truncate = false; + // compute::CastOptions cast_options; + // cast_options.allow_int_overflow = false; + // cast_options.allow_time_truncate = false; RETURN_NOT_OK( compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array)); @@ -412,7 +416,7 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, pool_, data)); } return Status::OK(); @@ -465,14 +469,14 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d if (!input_type->Equals(*type_)) { // The null bitmap was already computed in VisitNative() RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, - type_, pool_, data)); + type_, cast_options_, pool_, data)); } } } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK( - CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data)); + CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, pool_, data)); } } @@ -512,14 +516,14 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d if (!input_type->Equals(*type_)) { // The null bitmap was already computed in VisitNative() RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, - type_, pool_, data)); + type_, cast_options_, pool_, data)); } } } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK( - CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data)); + CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, pool_, data)); } } @@ -770,6 +774,7 @@ Status NumPyConverter::Visit(const StructType& type) { Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, const std::shared_ptr& type, + const compute::CastOptions& cast_options, std::shared_ptr* out) { if (!PyArray_Check(ao)) { return Status::Invalid("Input object was not a NumPy array"); @@ -784,7 +789,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa return ConvertPySequence(ao, mo, py_options, out); } - NumPyConverter converter(pool, ao, mo, type, from_pandas); + NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options); RETURN_NOT_OK(converter.Convert()); const auto& output_arrays = converter.result(); DCHECK_GT(output_arrays.size(), 0); @@ -792,5 +797,11 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa return Status::OK(); } +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, + const std::shared_ptr& type, + std::shared_ptr* out) { + return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out); +} + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index bbdd5764fd0..7f00c16b669 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -25,6 +25,7 @@ #include #include "arrow/util/visibility.h" +#include "arrow/compute/kernels/cast.h" namespace arrow { @@ -36,6 +37,23 @@ class Status; namespace py { +/// Convert NumPy arrays to Arrow. If target data type is not known, pass a +/// type with null +/// +/// \param[in] pool Memory pool for any memory allocations +/// \param[in] ao an ndarray with the array data +/// \param[in] mo an ndarray with a null mask (True is null), optional +/// \param[in] from_pandas If true, use pandas's null sentinels to determine +/// whether values are null +/// \param[in] type a specific type to cast to, may be null +/// \param[in] cast_options casting options +/// \param[out] out a ChunkedArray, to accommodate chunked output +ARROW_EXPORT +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, + const std::shared_ptr& type, + const compute::CastOptions& cast_options, + std::shared_ptr* out); + /// Convert NumPy arrays to Arrow. If target data type is not known, pass a /// type with null /// diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 76a639abfcb..74cf4a9971b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -17,8 +17,8 @@ cdef _sequence_to_array(object sequence, object mask, object size, - DataType type, - CMemoryPool* pool, c_bool from_pandas): + DataType type, CMemoryPool* pool, c_bool from_pandas, + c_bool safe): cdef int64_t c_size cdef PyConversionOptions options @@ -50,10 +50,14 @@ cdef _is_array_like(obj): cdef _ndarray_to_array(object values, object mask, DataType type, - c_bool from_pandas, - CMemoryPool* pool): - cdef shared_ptr[CChunkedArray] chunked_out - cdef shared_ptr[CDataType] c_type + c_bool from_pandas, c_bool safe, CMemoryPool* pool): + cdef: + shared_ptr[CChunkedArray] chunked_out + shared_ptr[CDataType] c_type + CCastOptions cast_options + + cast_options.allow_int_overflow = not safe + cast_options.allow_time_truncate = not safe dtype = values.dtype @@ -66,7 +70,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type, with nogil: check_status(NdarrayToArrow(pool, values, mask, from_pandas, - c_type, &chunked_out)) + c_type, cast_options, &chunked_out)) if chunked_out.get().num_chunks() > 1: return pyarrow_wrap_chunked_array(chunked_out) @@ -85,7 +89,7 @@ cdef inline DataType _ensure_type(object type): def array(object obj, type=None, mask=None, MemoryPool memory_pool=None, size=None, - from_pandas=False): + from_pandas=False, safe=True): """ Create pyarrow.Array instance from a Python object @@ -113,6 +117,8 @@ def array(object obj, type=None, mask=None, data. If passed, the mask tasks precendence, but if a value is unmasked (not-null), but still null according to pandas semantics, then it is null + safe : boolean, default True + Check for overflows or other unsafe conversions Notes ----- @@ -158,14 +164,16 @@ def array(object obj, type=None, mask=None, return DictionaryArray.from_arrays( values.codes, values.categories.values, mask=mask, ordered=values.ordered, - from_pandas=from_pandas, + from_pandas=from_pandas, safe=safe, memory_pool=memory_pool) else: values, type = pdcompat.get_datetimetz_type(values, obj.dtype, type) - return _ndarray_to_array(values, mask, type, from_pandas, pool) + return _ndarray_to_array(values, mask, type, from_pandas, safe, + pool) else: - return _sequence_to_array(obj, mask, size, type, pool, from_pandas) + return _sequence_to_array(obj, mask, size, type, pool, from_pandas, + safe) def asarray(values, type=None): @@ -339,6 +347,10 @@ cdef class Array: "the `pyarrow.Array.from_*` functions instead." .format(self.__class__.__name__)) + # from_ptr + # from_ndarray or from_numpy + # from_sequence + cdef void init(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() @@ -439,7 +451,8 @@ cdef class Array: return wrap_datum(out) @staticmethod - def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None): + def from_pandas(obj, mask=None, type=None, safe=True, + MemoryPool memory_pool=None): """ Convert pandas.Series to an Arrow Array, using pandas's semantics about what values indicate nulls. See pyarrow.array for more general @@ -453,6 +466,8 @@ cdef class Array: type : pyarrow.DataType Explicit type to attempt to coerce to, otherwise will be inferred from the data + safe : boolean, default True + Check for overflows or other unsafe conversions memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the currently-set default memory pool @@ -468,8 +483,8 @@ cdef class Array: array : pyarrow.Array or pyarrow.ChunkedArray (if object data overflows binary buffer) """ - return array(obj, mask=mask, type=type, memory_pool=memory_pool, - from_pandas=True) + return array(obj, mask=mask, type=type, safe=safe, from_pandas=True, + memory_pool=memory_pool) def __reduce__(self): return _restore_array, \ diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 7fe354dd894..8bbbfcfd661 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -968,6 +968,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: const shared_ptr[CDataType]& type, shared_ptr[CChunkedArray]* out) + CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo, + c_bool from_pandas, + const shared_ptr[CDataType]& type, + const CCastOptions& cast_options, + shared_ptr[CChunkedArray]* out) + CStatus NdarrayToTensor(CMemoryPool* pool, object ao, shared_ptr[CTensor]* out) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 4bebe3139a9..ddc9e6c21f0 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -479,13 +479,16 @@ def test_string_from_buffers(): def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case + expected = pa.array(out_data, type=out_type) in_arr = pa.array(in_data, type=in_type) - casted = in_arr.cast(out_type, safe=safe) - expected = pa.array(out_data, type=out_type) assert casted.equals(expected) + # ARROW-1949 + in_arr = pa.array(in_data, type=out_type, safe=safe) + assert in_arr.equals(expected) + def test_cast_integers_safe(): safe_cases = [ From 92ac3a92d8d2eba8dea93b6f634c6cd5affc3536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 1 Sep 2018 16:07:40 +0200 Subject: [PATCH 2/6] tests for timestamp casts --- cpp/src/arrow/python/numpy_to_arrow.cc | 3 --- cpp/src/arrow/python/numpy_to_arrow.h | 4 ++-- python/pyarrow/array.pxi | 4 ---- python/pyarrow/tests/test_array.py | 20 +++++++++++++++++++- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 853bbe3d522..3e6c4a1b512 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -332,9 +332,6 @@ Status CastBuffer(const std::shared_ptr& in_type, std::shared_ptr casted_array; compute::FunctionContext context(pool); - // compute::CastOptions cast_options; - // cast_options.allow_int_overflow = false; - // cast_options.allow_time_truncate = false; RETURN_NOT_OK( compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array)); diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index 7f00c16b669..658dda6b14f 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -54,8 +54,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa const compute::CastOptions& cast_options, std::shared_ptr* out); -/// Convert NumPy arrays to Arrow. If target data type is not known, pass a -/// type with null +/// Safely convert NumPy arrays to Arrow. If target data type is not known, +/// pass a type with null. /// /// \param[in] pool Memory pool for any memory allocations /// \param[in] ao an ndarray with the array data diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 74cf4a9971b..b6c0b2900b2 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -347,10 +347,6 @@ cdef class Array: "the `pyarrow.Array.from_*` functions instead." .format(self.__class__.__name__)) - # from_ptr - # from_ndarray or from_numpy - # from_sequence - cdef void init(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ddc9e6c21f0..d4b582e06c4 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -481,11 +481,13 @@ def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case expected = pa.array(out_data, type=out_type) + # check casting an already created array in_arr = pa.array(in_data, type=in_type) casted = in_arr.cast(out_type, safe=safe) assert casted.equals(expected) - # ARROW-1949 + # constructing an array with out type which optionally involves casting + # for more see ARROW-1949 in_arr = pa.array(in_data, type=out_type, safe=safe) assert in_arr.equals(expected) @@ -576,6 +578,22 @@ def test_cast_timestamp_unit(): result = arr.cast(target, safe=False) assert result.equals(expected) + # ARROW-1949 + series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)]) + expected = pa.array([0, 0, 1], type=pa.timestamp('us')) + + with pytest.raises(ValueError): + pa.array(series, type=pa.timestamp('us')) + + with pytest.raises(ValueError): + pa.Array.from_pandas(series, type=pa.timestamp('us')) + + result = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False) + assert result.equals(expected) + + result = pa.array(series, type=pa.timestamp('us'), safe=False) + assert result.equals(expected) + def test_cast_signed_to_unsigned(): safe_cases = [ From fff89aaa1aad8298a5ffce79bb452b38a425424d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 1 Sep 2018 16:11:00 +0200 Subject: [PATCH 3/6] lint --- cpp/src/arrow/python/numpy_to_arrow.cc | 11 +++++++---- cpp/src/arrow/python/numpy_to_arrow.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 3e6c4a1b512..6ffedd85335 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -174,7 +174,7 @@ class NumPyConverter { public: NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo, const std::shared_ptr& type, bool from_pandas, - const compute::CastOptions& cast_options=compute::CastOptions()) + const compute::CastOptions& cast_options = compute::CastOptions()) : pool_(pool), type_(type), arr_(reinterpret_cast(arr)), @@ -413,7 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, + cast_options_, pool_, data)); } return Status::OK(); @@ -473,7 +474,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK( - CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, pool_, data)); + CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, + pool_, data)); } } @@ -520,7 +522,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK( - CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, pool_, data)); + CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, + pool_, data)); } } diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index 658dda6b14f..c9a0a8ea8d9 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -55,7 +55,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa std::shared_ptr* out); /// Safely convert NumPy arrays to Arrow. If target data type is not known, -/// pass a type with null. +/// pass a type with null. /// /// \param[in] pool Memory pool for any memory allocations /// \param[in] ao an ndarray with the array data From e838a14db7ffbbe218be9e3f39d88e46886ac4be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 1 Sep 2018 16:47:46 +0200 Subject: [PATCH 4/6] check-format --- cpp/src/arrow/python/numpy_to_arrow.cc | 18 ++++++++---------- cpp/src/arrow/python/numpy_to_arrow.h | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 6ffedd85335..ece00c286ea 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -323,8 +323,8 @@ Status CastBuffer(const std::shared_ptr& in_type, const std::shared_ptr& input, const int64_t length, const std::shared_ptr& valid_bitmap, const int64_t null_count, const std::shared_ptr& out_type, - const compute::CastOptions& cast_options, - MemoryPool* pool, std::shared_ptr* out) { + const compute::CastOptions& cast_options, MemoryPool* pool, + std::shared_ptr* out) { // Must cast auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count); @@ -413,8 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, + pool_, data)); } return Status::OK(); @@ -473,9 +473,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK( - CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, - pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, + cast_options_, pool_, data)); } } @@ -521,9 +520,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK( - CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, - pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, + cast_options_, pool_, data)); } } diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index c9a0a8ea8d9..5e1c088264a 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -24,8 +24,8 @@ #include -#include "arrow/util/visibility.h" #include "arrow/compute/kernels/cast.h" +#include "arrow/util/visibility.h" namespace arrow { From 70d6cae280f32884d2bd85cd6b456291b5b974e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 1 Sep 2018 17:18:32 +0200 Subject: [PATCH 5/6] annotate boolean arguments as bint --- python/pyarrow/array.pxi | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b6c0b2900b2..bb562731c6b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -87,9 +87,8 @@ cdef inline DataType _ensure_type(object type): return type -def array(object obj, type=None, mask=None, - MemoryPool memory_pool=None, size=None, - from_pandas=False, safe=True): +def array(object obj, type=None, mask=None, size=None, bint from_pandas=False, + bint safe=True, MemoryPool memory_pool=None): """ Create pyarrow.Array instance from a Python object @@ -98,14 +97,11 @@ def array(object obj, type=None, mask=None, obj : sequence, iterable, ndarray or Series If both type and size are specified may be a single use iterable. If not strongly-typed, Arrow type will be inferred for resulting array - mask : array (boolean), optional - Indicate which values are null (True) or not null (False). type : pyarrow.DataType Explicit type to attempt to coerce to, otherwise will be inferred from the data - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool + mask : array (boolean), optional + Indicate which values are null (True) or not null (False). size : int64, optional Size of the elements. If the imput is larger than size bail at this length. For iterators, if size is larger than the input iterator this @@ -119,6 +115,9 @@ def array(object obj, type=None, mask=None, null safe : boolean, default True Check for overflows or other unsafe conversions + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool Notes ----- @@ -360,7 +359,7 @@ cdef class Array: with nogil: check_status(DebugPrint(deref(self.ap), 0)) - def cast(self, object target_type, safe=True): + def cast(self, object target_type, bint safe=True): """ Cast array values to another data type. @@ -447,7 +446,7 @@ cdef class Array: return wrap_datum(out) @staticmethod - def from_pandas(obj, mask=None, type=None, safe=True, + def from_pandas(obj, mask=None, type=None, bint safe=True, MemoryPool memory_pool=None): """ Convert pandas.Series to an Arrow Array, using pandas's semantics about @@ -608,9 +607,8 @@ cdef class Array: return pyarrow_wrap_array(result) - def to_pandas(self, c_bool strings_to_categorical=False, - c_bool zero_copy_only=False, - c_bool integer_object_nulls=False): + def to_pandas(self, bint strings_to_categorical=False, + bint zero_copy_only=False, bint integer_object_nulls=False): """ Convert to a NumPy array object suitable for use in pandas. @@ -1062,8 +1060,8 @@ cdef class DictionaryArray(Array): return self._indices @staticmethod - def from_arrays(indices, dictionary, mask=None, ordered=False, - from_pandas=False, safe=True, + def from_arrays(indices, dictionary, mask=None, bint ordered=False, + bint from_pandas=False, bint safe=True, MemoryPool memory_pool=None): """ Construct Arrow DictionaryArray from array of indices (must be From f352c477a1617724818a527eb644b4c2404aaeee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 02:37:52 +0200 Subject: [PATCH 6/6] remove safe flag from _sequence_to_array --- python/pyarrow/array.pxi | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index bb562731c6b..f9a16a334c5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -17,8 +17,7 @@ cdef _sequence_to_array(object sequence, object mask, object size, - DataType type, CMemoryPool* pool, c_bool from_pandas, - c_bool safe): + DataType type, CMemoryPool* pool, c_bool from_pandas): cdef int64_t c_size cdef PyConversionOptions options @@ -171,8 +170,8 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False, return _ndarray_to_array(values, mask, type, from_pandas, safe, pool) else: - return _sequence_to_array(obj, mask, size, type, pool, from_pandas, - safe) + # ConvertPySequence does strict conversion if type is explicitly passed + return _sequence_to_array(obj, mask, size, type, pool, from_pandas) def asarray(values, type=None):