
Commit a712165

kszucs authored and xhochy committed
ARROW-1949: [Python/C++] Add option to Array.from_pandas and pyarrow.array to perform unsafe casts
Author: Krisztián Szűcs <szucs.krisztian@gmail.com>

Closes #2497 from kszucs/ARROW-1949 and squashes the following commits:

f352c47 <Krisztián Szűcs> remove safe flag from _sequence_to_array
70d6cae <Krisztián Szűcs> annotate boolean arguments as bint
e838a14 <Krisztián Szűcs> check-format
fff89aa <Krisztián Szűcs> lint
92ac3a9 <Krisztián Szűcs> tests for timestamp casts
dd8871e <Krisztián Szűcs> wire CastOptions through the API
1 parent 20c0405 commit a712165
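
In short, pyarrow.array and Array.from_pandas gain a safe= keyword that relaxes the internal cast applied when NumPy/pandas data is converted to an explicitly requested Arrow type. A minimal sketch of the new behaviour, drawn from the test added in this commit (test_cast_timestamp_unit) and assuming pandas and pyarrow imported as pd and pa:

    import pandas as pd
    import pyarrow as pa

    # Nanosecond timestamps do not fit losslessly into timestamp('us'),
    # so the default safe=True conversion raises.
    series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)])
    # pa.array(series, type=pa.timestamp('us'))  # raises ValueError

    # safe=False allows the truncating cast; values become [0, 0, 1] microseconds.
    arr = pa.array(series, type=pa.timestamp('us'), safe=False)
    arr2 = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)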

File tree

5 files changed, +103 -41 lines changed


cpp/src/arrow/python/numpy_to_arrow.cc

Lines changed: 22 additions & 13 deletions
@@ -173,13 +173,15 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
 class NumPyConverter {
  public:
   NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
-                 const std::shared_ptr<DataType>& type, bool from_pandas)
+                 const std::shared_ptr<DataType>& type, bool from_pandas,
+                 const compute::CastOptions& cast_options = compute::CastOptions())
       : pool_(pool),
         type_(type),
         arr_(reinterpret_cast<PyArrayObject*>(arr)),
         dtype_(PyArray_DESCR(arr_)),
         mask_(nullptr),
         from_pandas_(from_pandas),
+        cast_options_(cast_options),
         null_bitmap_data_(nullptr),
         null_count_(0) {
     if (mo != nullptr && mo != Py_None) {
@@ -289,6 +291,7 @@ class NumPyConverter {
   int itemsize_;
 
   bool from_pandas_;
+  compute::CastOptions cast_options_;
 
   // Used in visitor pattern
   ArrayVector out_arrays_;
@@ -319,7 +322,8 @@ namespace {
 Status CastBuffer(const std::shared_ptr<DataType>& in_type,
                   const std::shared_ptr<Buffer>& input, const int64_t length,
                   const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
-                  const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
+                  const std::shared_ptr<DataType>& out_type,
+                  const compute::CastOptions& cast_options, MemoryPool* pool,
                   std::shared_ptr<Buffer>* out) {
   // Must cast
   auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
@@ -328,9 +332,6 @@ Status CastBuffer(const std::shared_ptr<DataType>& in_type,
   std::shared_ptr<Array> casted_array;
 
   compute::FunctionContext context(pool);
-  compute::CastOptions cast_options;
-  cast_options.allow_int_overflow = false;
-  cast_options.allow_time_truncate = false;
 
   RETURN_NOT_OK(
       compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array));
@@ -412,7 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
   RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
 
   if (!input_type->Equals(*type_)) {
-    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_,
+                             pool_, data));
   }
 
   return Status::OK();
@@ -465,14 +467,14 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
       if (!input_type->Equals(*type_)) {
         // The null bitmap was already computed in VisitNative()
         RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
-                                 type_, pool_, data));
+                                 type_, cast_options_, pool_, data));
       }
     }
   } else {
     RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
     if (!input_type->Equals(*type_)) {
-      RETURN_NOT_OK(
-          CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
+                               cast_options_, pool_, data));
     }
   }
 
@@ -512,14 +514,14 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
       if (!input_type->Equals(*type_)) {
         // The null bitmap was already computed in VisitNative()
         RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
-                                 type_, pool_, data));
+                                 type_, cast_options_, pool_, data));
       }
     }
   } else {
     RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
     if (!input_type->Equals(*type_)) {
-      RETURN_NOT_OK(
-          CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
+                               cast_options_, pool_, data));
    }
  }
 
@@ -770,6 +772,7 @@ Status NumPyConverter::Visit(const StructType& type) {
 
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                       const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
                       std::shared_ptr<ChunkedArray>* out) {
   if (!PyArray_Check(ao)) {
     return Status::Invalid("Input object was not a NumPy array");
@@ -784,13 +787,19 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
     return ConvertPySequence(ao, mo, py_options, out);
   }
 
-  NumPyConverter converter(pool, ao, mo, type, from_pandas);
+  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
   RETURN_NOT_OK(converter.Convert());
   const auto& output_arrays = converter.result();
   DCHECK_GT(output_arrays.size(), 0);
   *out = std::make_shared<ChunkedArray>(output_arrays);
   return Status::OK();
 }
 
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out) {
+  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
+}
+
 }  // namespace py
 }  // namespace arrow

cpp/src/arrow/python/numpy_to_arrow.h

Lines changed: 18 additions & 0 deletions
@@ -24,6 +24,7 @@
 
 #include <memory>
 
+#include "arrow/compute/kernels/cast.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -45,6 +46,23 @@ namespace py {
 /// \param[in] from_pandas If true, use pandas's null sentinels to determine
 /// whether values are null
 /// \param[in] type a specific type to cast to, may be null
+/// \param[in] cast_options casting options
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
+                      std::shared_ptr<ChunkedArray>* out);
+
+/// Safely convert NumPy arrays to Arrow. If target data type is not known,
+/// pass a type with null.
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
 /// \param[out] out a ChunkedArray, to accommodate chunked output
 ARROW_EXPORT
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,

python/pyarrow/array.pxi

Lines changed: 34 additions & 26 deletions
@@ -17,8 +17,7 @@
 
 
 cdef _sequence_to_array(object sequence, object mask, object size,
-                        DataType type,
-                        CMemoryPool* pool, c_bool from_pandas):
+                        DataType type, CMemoryPool* pool, c_bool from_pandas):
     cdef int64_t c_size
     cdef PyConversionOptions options
 
@@ -50,10 +49,14 @@ cdef _is_array_like(obj):
 
 
 cdef _ndarray_to_array(object values, object mask, DataType type,
-                       c_bool from_pandas,
-                       CMemoryPool* pool):
-    cdef shared_ptr[CChunkedArray] chunked_out
-    cdef shared_ptr[CDataType] c_type
+                       c_bool from_pandas, c_bool safe, CMemoryPool* pool):
+    cdef:
+        shared_ptr[CChunkedArray] chunked_out
+        shared_ptr[CDataType] c_type
+        CCastOptions cast_options
+
+    cast_options.allow_int_overflow = not safe
+    cast_options.allow_time_truncate = not safe
 
     dtype = values.dtype
 
@@ -66,7 +69,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
 
     with nogil:
         check_status(NdarrayToArrow(pool, values, mask, from_pandas,
-                                    c_type, &chunked_out))
+                                    c_type, cast_options, &chunked_out))
 
     if chunked_out.get().num_chunks() > 1:
         return pyarrow_wrap_chunked_array(chunked_out)
@@ -83,9 +86,8 @@ cdef inline DataType _ensure_type(object type):
         return type
 
 
-def array(object obj, type=None, mask=None,
-          MemoryPool memory_pool=None, size=None,
-          from_pandas=False):
+def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
+          bint safe=True, MemoryPool memory_pool=None):
     """
     Create pyarrow.Array instance from a Python object
 
@@ -94,14 +96,11 @@ def array(object obj, type=None, mask=None,
     obj : sequence, iterable, ndarray or Series
         If both type and size are specified may be a single use iterable. If
         not strongly-typed, Arrow type will be inferred for resulting array
-    mask : array (boolean), optional
-        Indicate which values are null (True) or not null (False).
     type : pyarrow.DataType
         Explicit type to attempt to coerce to, otherwise will be inferred from
        the data
-    memory_pool : pyarrow.MemoryPool, optional
-        If not passed, will allocate memory from the currently-set default
-        memory pool
+    mask : array (boolean), optional
+        Indicate which values are null (True) or not null (False).
     size : int64, optional
        Size of the elements. If the imput is larger than size bail at this
        length. For iterators, if size is larger than the input iterator this
@@ -113,6 +112,11 @@ def array(object obj, type=None, mask=None,
        data. If passed, the mask tasks precendence, but if a value is unmasked
        (not-null), but still null according to pandas semantics, then it is
        null
+    safe : boolean, default True
+        Check for overflows or other unsafe conversions
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the currently-set default
+        memory pool
 
     Notes
     -----
@@ -158,13 +162,15 @@ def array(object obj, type=None, mask=None,
             return DictionaryArray.from_arrays(
                 values.codes, values.categories.values,
                 mask=mask, ordered=values.ordered,
-                from_pandas=from_pandas,
+                from_pandas=from_pandas, safe=safe,
                 memory_pool=memory_pool)
         else:
             values, type = pdcompat.get_datetimetz_type(values, obj.dtype,
                                                         type)
-            return _ndarray_to_array(values, mask, type, from_pandas, pool)
+            return _ndarray_to_array(values, mask, type, from_pandas, safe,
+                                     pool)
    else:
+        # ConvertPySequence does strict conversion if type is explicitly passed
        return _sequence_to_array(obj, mask, size, type, pool, from_pandas)
 
 
@@ -352,7 +358,7 @@ cdef class Array:
         with nogil:
             check_status(DebugPrint(deref(self.ap), 0))
 
-    def cast(self, object target_type, safe=True):
+    def cast(self, object target_type, bint safe=True):
         """
         Cast array values to another data type.
 
@@ -439,7 +445,8 @@ cdef class Array:
         return wrap_datum(out)
 
     @staticmethod
-    def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None):
+    def from_pandas(obj, mask=None, type=None, bint safe=True,
+                    MemoryPool memory_pool=None):
         """
         Convert pandas.Series to an Arrow Array, using pandas's semantics about
         what values indicate nulls. See pyarrow.array for more general
@@ -453,6 +460,8 @@ cdef class Array:
         type : pyarrow.DataType
             Explicit type to attempt to coerce to, otherwise will be inferred
             from the data
+        safe : boolean, default True
+            Check for overflows or other unsafe conversions
         memory_pool : pyarrow.MemoryPool, optional
             If not passed, will allocate memory from the currently-set default
             memory pool
@@ -468,8 +477,8 @@ cdef class Array:
         array : pyarrow.Array or pyarrow.ChunkedArray (if object data
             overflows binary buffer)
         """
-        return array(obj, mask=mask, type=type, memory_pool=memory_pool,
-                     from_pandas=True)
+        return array(obj, mask=mask, type=type, safe=safe, from_pandas=True,
+                     memory_pool=memory_pool)
 
     def __reduce__(self):
         return _restore_array, \
@@ -597,9 +606,8 @@ cdef class Array:
 
         return pyarrow_wrap_array(result)
 
-    def to_pandas(self, c_bool strings_to_categorical=False,
-                  c_bool zero_copy_only=False,
-                  c_bool integer_object_nulls=False):
+    def to_pandas(self, bint strings_to_categorical=False,
+                  bint zero_copy_only=False, bint integer_object_nulls=False):
         """
         Convert to a NumPy array object suitable for use in pandas.
 
@@ -1051,8 +1059,8 @@ cdef class DictionaryArray(Array):
         return self._indices
 
     @staticmethod
-    def from_arrays(indices, dictionary, mask=None, ordered=False,
-                    from_pandas=False, safe=True,
+    def from_arrays(indices, dictionary, mask=None, bint ordered=False,
+                    bint from_pandas=False, bint safe=True,
                     MemoryPool memory_pool=None):
         """
         Construct Arrow DictionaryArray from array of indices (must be
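
As the _ndarray_to_array hunk above shows, safe=True keeps allow_int_overflow and allow_time_truncate disabled while safe=False enables them, and the flag only takes effect on the ndarray/Series path; plain Python sequences still go through ConvertPySequence, which does strict conversion when a type is given. A small sketch of that distinction (illustrative values, assuming NumPy is available):

    import numpy as np
    import pyarrow as pa

    # ndarray input: conversion goes through NdarrayToArrow, so safe=False
    # relaxes the overflow/truncation checks of the internal cast.
    arr = pa.array(np.array([1, 2, 3], dtype='int64'), type=pa.int32(), safe=False)

    # list input: handled by ConvertPySequence, so safe has no effect here.
    arr2 = pa.array([1, 2, 3], type=pa.int32(), safe=False)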

python/pyarrow/includes/libarrow.pxd

Lines changed: 6 additions & 0 deletions
@@ -968,6 +968,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
                            const shared_ptr[CDataType]& type,
                            shared_ptr[CChunkedArray]* out)
 
+    CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
+                           c_bool from_pandas,
+                           const shared_ptr[CDataType]& type,
+                           const CCastOptions& cast_options,
+                           shared_ptr[CChunkedArray]* out)
+
     CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
                             shared_ptr[CTensor]* out)

python/pyarrow/tests/test_array.py

Lines changed: 23 additions & 2 deletions
@@ -479,13 +479,18 @@ def test_string_from_buffers():
 
 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case
+    expected = pa.array(out_data, type=out_type)
 
+    # check casting an already created array
     in_arr = pa.array(in_data, type=in_type)
-
     casted = in_arr.cast(out_type, safe=safe)
-    expected = pa.array(out_data, type=out_type)
     assert casted.equals(expected)
 
+    # constructing an array with out type which optionally involves casting
+    # for more see ARROW-1949
+    in_arr = pa.array(in_data, type=out_type, safe=safe)
+    assert in_arr.equals(expected)
+
 
 def test_cast_integers_safe():
     safe_cases = [
@@ -573,6 +578,22 @@ def test_cast_timestamp_unit():
     result = arr.cast(target, safe=False)
     assert result.equals(expected)
 
+    # ARROW-1949
+    series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)])
+    expected = pa.array([0, 0, 1], type=pa.timestamp('us'))
+
+    with pytest.raises(ValueError):
+        pa.array(series, type=pa.timestamp('us'))
+
+    with pytest.raises(ValueError):
+        pa.Array.from_pandas(series, type=pa.timestamp('us'))
+
+    result = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)
+    assert result.equals(expected)
+
+    result = pa.array(series, type=pa.timestamp('us'), safe=False)
+    assert result.equals(expected)
+
 
 
 def test_cast_signed_to_unsigned():
