35 changes: 22 additions & 13 deletions cpp/src/arrow/python/numpy_to_arrow.cc
@@ -173,13 +173,15 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
class NumPyConverter {
public:
NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
const std::shared_ptr<DataType>& type, bool from_pandas)
const std::shared_ptr<DataType>& type, bool from_pandas,
const compute::CastOptions& cast_options = compute::CastOptions())
: pool_(pool),
type_(type),
arr_(reinterpret_cast<PyArrayObject*>(arr)),
dtype_(PyArray_DESCR(arr_)),
mask_(nullptr),
from_pandas_(from_pandas),
cast_options_(cast_options),
null_bitmap_data_(nullptr),
null_count_(0) {
if (mo != nullptr && mo != Py_None) {
@@ -289,6 +291,7 @@ class NumPyConverter {
int itemsize_;

bool from_pandas_;
compute::CastOptions cast_options_;

// Used in visitor pattern
ArrayVector out_arrays_;
@@ -319,7 +322,8 @@ namespace {
Status CastBuffer(const std::shared_ptr<DataType>& in_type,
const std::shared_ptr<Buffer>& input, const int64_t length,
const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
const std::shared_ptr<DataType>& out_type,
const compute::CastOptions& cast_options, MemoryPool* pool,
std::shared_ptr<Buffer>* out) {
// Must cast
auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
@@ -328,9 +332,6 @@ Status CastBuffer(const std::shared_ptr<DataType>& in_type,
std::shared_ptr<Array> casted_array;

compute::FunctionContext context(pool);
compute::CastOptions cast_options;
cast_options.allow_int_overflow = false;
cast_options.allow_time_truncate = false;

RETURN_NOT_OK(
compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array));
@@ -412,7 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));

if (!input_type->Equals(*type_)) {
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_,
pool_, data));
}

return Status::OK();
@@ -465,14 +467,14 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
if (!input_type->Equals(*type_)) {
// The null bitmap was already computed in VisitNative()
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
type_, pool_, data));
type_, cast_options_, pool_, data));
}
}
} else {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
if (!input_type->Equals(*type_)) {
RETURN_NOT_OK(
CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
cast_options_, pool_, data));
}
}

@@ -512,14 +514,14 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
if (!input_type->Equals(*type_)) {
// The null bitmap was already computed in VisitNative()
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
type_, pool_, data));
type_, cast_options_, pool_, data));
}
}
} else {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
if (!input_type->Equals(*type_)) {
RETURN_NOT_OK(
CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
cast_options_, pool_, data));
}
}

@@ -770,6 +772,7 @@ Status NumPyConverter::Visit(const StructType& type) {

Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
const compute::CastOptions& cast_options,
std::shared_ptr<ChunkedArray>* out) {
if (!PyArray_Check(ao)) {
return Status::Invalid("Input object was not a NumPy array");
@@ -784,13 +787,19 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
return ConvertPySequence(ao, mo, py_options, out);
}

NumPyConverter converter(pool, ao, mo, type, from_pandas);
NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
RETURN_NOT_OK(converter.Convert());
const auto& output_arrays = converter.result();
DCHECK_GT(output_arrays.size(), 0);
*out = std::make_shared<ChunkedArray>(output_arrays);
return Status::OK();
}

Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
std::shared_ptr<ChunkedArray>* out) {
return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
}

} // namespace py
} // namespace arrow
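
The practical effect of threading `CastOptions` through `CastBuffer` is visible from Python: an implicit cast during array construction can now be made unsafe instead of always rejecting overflow. A minimal sketch of the intended behavior, assuming a pyarrow build that includes this change (values are illustrative):

```python
import numpy as np
import pyarrow as pa

values = np.array([1, 2, 70000], dtype=np.int64)

# Default (safe=True): the default CastOptions forbids integer overflow,
# so coercing 70000 into int16 raises (pa.ArrowInvalid subclasses ValueError).
try:
    pa.array(values, type=pa.int16())
except ValueError:
    pass

# safe=False sets allow_int_overflow, so the cast wraps around instead.
unsafe = pa.array(values, type=pa.int16(), safe=False)
```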
18 changes: 18 additions & 0 deletions cpp/src/arrow/python/numpy_to_arrow.h
@@ -24,6 +24,7 @@

#include <memory>

#include "arrow/compute/kernels/cast.h"
#include "arrow/util/visibility.h"

namespace arrow {
@@ -45,6 +46,23 @@ namespace py {
/// \param[in] from_pandas If true, use pandas's null sentinels to determine
/// whether values are null
/// \param[in] type a specific type to cast to, may be null
/// \param[in] cast_options casting options
/// \param[out] out a ChunkedArray, to accommodate chunked output
ARROW_EXPORT
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
const compute::CastOptions& cast_options,
std::shared_ptr<ChunkedArray>* out);

/// Safely convert NumPy arrays to Arrow. If the target data type is not
/// known, pass a null type.
///
/// \param[in] pool Memory pool for any memory allocations
/// \param[in] ao an ndarray with the array data
/// \param[in] mo an ndarray with a null mask (True is null), optional
/// \param[in] from_pandas If true, use pandas's null sentinels to determine
/// whether values are null
/// \param[in] type a specific type to cast to, may be null
/// \param[out] out a ChunkedArray, to accommodate chunked output
ARROW_EXPORT
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
60 changes: 34 additions & 26 deletions python/pyarrow/array.pxi
@@ -17,8 +17,7 @@


cdef _sequence_to_array(object sequence, object mask, object size,
DataType type,
CMemoryPool* pool, c_bool from_pandas):
DataType type, CMemoryPool* pool, c_bool from_pandas):
cdef int64_t c_size
cdef PyConversionOptions options

@@ -50,10 +49,14 @@ cdef _is_array_like(obj):


cdef _ndarray_to_array(object values, object mask, DataType type,
c_bool from_pandas,
CMemoryPool* pool):
cdef shared_ptr[CChunkedArray] chunked_out
cdef shared_ptr[CDataType] c_type
c_bool from_pandas, c_bool safe, CMemoryPool* pool):
cdef:
shared_ptr[CChunkedArray] chunked_out
shared_ptr[CDataType] c_type
CCastOptions cast_options

cast_options.allow_int_overflow = not safe
cast_options.allow_time_truncate = not safe

dtype = values.dtype

@@ -66,7 +69,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type,

with nogil:
check_status(NdarrayToArrow(pool, values, mask, from_pandas,
c_type, &chunked_out))
c_type, cast_options, &chunked_out))

if chunked_out.get().num_chunks() > 1:
return pyarrow_wrap_chunked_array(chunked_out)
@@ -83,9 +86,8 @@ cdef inline DataType _ensure_type(object type):
return type


def array(object obj, type=None, mask=None,
MemoryPool memory_pool=None, size=None,
from_pandas=False):
def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
bint safe=True, MemoryPool memory_pool=None):
"""
Create pyarrow.Array instance from a Python object

@@ -94,14 +96,11 @@ def array(object obj, type=None, mask=None,
obj : sequence, iterable, ndarray or Series
If both type and size are specified, may be a single-use iterable. If
not strongly typed, the Arrow type will be inferred for the resulting array
mask : array (boolean), optional
Indicate which values are null (True) or not null (False).
type : pyarrow.DataType
Explicit type to attempt to coerce to, otherwise will be inferred from
the data
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool
mask : array (boolean), optional
Indicate which values are null (True) or not null (False).
size : int64, optional
Size of the elements. If the input is larger than size, bail at this
length. For iterators, if size is larger than the input iterator this
@@ -113,6 +112,11 @@
data. If passed, the mask takes precedence, but if a value is unmasked
(not null) yet still null according to pandas semantics, then it is
null
safe : boolean, default True
Check for overflows or other unsafe conversions
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool

Notes
-----
@@ -158,13 +162,15 @@
return DictionaryArray.from_arrays(
values.codes, values.categories.values,
mask=mask, ordered=values.ordered,
from_pandas=from_pandas,
from_pandas=from_pandas, safe=safe,
memory_pool=memory_pool)
else:
values, type = pdcompat.get_datetimetz_type(values, obj.dtype,
type)
return _ndarray_to_array(values, mask, type, from_pandas, pool)
return _ndarray_to_array(values, mask, type, from_pandas, safe,
pool)
else:
# ConvertPySequence does strict conversion if type is explicitly passed
return _sequence_to_array(obj, mask, size, type, pool, from_pandas)


@@ -352,7 +358,7 @@ cdef class Array:
with nogil:
check_status(DebugPrint(deref(self.ap), 0))

def cast(self, object target_type, safe=True):
def cast(self, object target_type, bint safe=True):
"""
Cast array values to another data type.

@@ -439,7 +445,8 @@ cdef class Array:
return wrap_datum(out)

@staticmethod
def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None):
def from_pandas(obj, mask=None, type=None, bint safe=True,
MemoryPool memory_pool=None):
"""
Convert pandas.Series to an Arrow Array, using pandas's semantics about
what values indicate nulls. See pyarrow.array for more general
@@ -453,6 +460,8 @@
type : pyarrow.DataType
Explicit type to attempt to coerce to, otherwise will be inferred
from the data
safe : boolean, default True
Check for overflows or other unsafe conversions
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool
@@ -468,8 +477,8 @@
array : pyarrow.Array or pyarrow.ChunkedArray (if object data
overflows binary buffer)
"""
return array(obj, mask=mask, type=type, memory_pool=memory_pool,
from_pandas=True)
return array(obj, mask=mask, type=type, safe=safe, from_pandas=True,
memory_pool=memory_pool)

def __reduce__(self):
return _restore_array, \
@@ -597,9 +606,8 @@ cdef class Array:

return pyarrow_wrap_array(result)

def to_pandas(self, c_bool strings_to_categorical=False,
c_bool zero_copy_only=False,
c_bool integer_object_nulls=False):
def to_pandas(self, bint strings_to_categorical=False,
bint zero_copy_only=False, bint integer_object_nulls=False):
"""
Convert to a NumPy array object suitable for use in pandas.

@@ -1051,8 +1059,8 @@ cdef class DictionaryArray(Array):
return self._indices

@staticmethod
def from_arrays(indices, dictionary, mask=None, ordered=False,
from_pandas=False, safe=True,
def from_arrays(indices, dictionary, mask=None, bint ordered=False,
bint from_pandas=False, bint safe=True,
MemoryPool memory_pool=None):
"""
Construct Arrow DictionaryArray from array of indices (must be
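
At the user level, the new `safe` keyword on `array` and `Array.from_pandas` maps directly onto the two `CastOptions` flags set in `_ndarray_to_array` above. A minimal sketch of the intended behavior, mirroring the ARROW-1949 test further down (nanosecond timestamps cannot be represented in microseconds without truncation):

```python
import pandas as pd
import pyarrow as pa

# pandas Timestamps carry nanosecond resolution
series = pd.Series([pd.Timestamp(1), pd.Timestamp(1000)])

# safe=True (default): truncating 1 ns to microseconds loses data, so raise
try:
    pa.Array.from_pandas(series, type=pa.timestamp('us'))
except ValueError:
    pass

# safe=False sets allow_time_truncate: 1 ns -> 0 us, 1000 ns -> 1 us
arr = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)
```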
6 changes: 6 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
@@ -968,6 +968,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
const shared_ptr[CDataType]& type,
shared_ptr[CChunkedArray]* out)

CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
c_bool from_pandas,
const shared_ptr[CDataType]& type,
const CCastOptions& cast_options,
shared_ptr[CChunkedArray]* out)

CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
shared_ptr[CTensor]* out)

25 changes: 23 additions & 2 deletions python/pyarrow/tests/test_array.py
@@ -479,13 +479,18 @@ def test_string_from_buffers():

def _check_cast_case(case, safe=True):
in_data, in_type, out_data, out_type = case
expected = pa.array(out_data, type=out_type)

# check casting an already created array
in_arr = pa.array(in_data, type=in_type)

casted = in_arr.cast(out_type, safe=safe)
expected = pa.array(out_data, type=out_type)
assert casted.equals(expected)

# constructing an array with the out type, which optionally involves casting;
# for more see ARROW-1949
in_arr = pa.array(in_data, type=out_type, safe=safe)
assert in_arr.equals(expected)


def test_cast_integers_safe():
safe_cases = [
@@ -573,6 +578,22 @@ def test_cast_timestamp_unit():
result = arr.cast(target, safe=False)
assert result.equals(expected)

# ARROW-1949
series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)])
expected = pa.array([0, 0, 1], type=pa.timestamp('us'))

with pytest.raises(ValueError):
pa.array(series, type=pa.timestamp('us'))

with pytest.raises(ValueError):
pa.Array.from_pandas(series, type=pa.timestamp('us'))

result = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)
assert result.equals(expected)

result = pa.array(series, type=pa.timestamp('us'), safe=False)
assert result.equals(expected)


def test_cast_signed_to_unsigned():
safe_cases = [
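
As a companion to the signed-to-unsigned cases being set up above, a hedged sketch of the pattern these tests exercise (the concrete values are illustrative, not taken from the test body):

```python
import pyarrow as pa

arr = pa.array([-1], type=pa.int8())

# safe cast: -1 has no uint8 representation, so this raises
try:
    arr.cast(pa.uint8())
except ValueError:
    pass

# unsafe cast reinterprets the value instead of rejecting it
unsafe = arr.cast(pa.uint8(), safe=False)
# expected: [255], assuming the unsafe cast wraps like a C static_cast
```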