Skip to content

Commit 2b361fb

Browse files
BryanCutlerwesm
authored andcommitted
ARROW-3428: [Python] Fix from_pandas conversion from float to bool
When `from_pandas` converts data to boolean, the values are read into a `uint8_t` and then checked. When the values are floating point numbers, not all bits are checked which can cause incorrect results. Author: Bryan Cutler <cutlerb@gmail.com> Closes #2698 from BryanCutler/python-from_pandas-float-to-bool-ARROW-3428 and squashes the following commits: f3d4726 <Bryan Cutler> added test with fix that passes, but fails other tests
1 parent 3b61349 commit 2b361fb

File tree

4 files changed

+81
-44
lines changed

4 files changed

+81
-44
lines changed

cpp/src/arrow/compute/kernels/cast-test.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,25 @@ TEST_F(TestCast, SameTypeZeroCopy) {
138138
AssertBufferSame(*arr, *result, 1);
139139
}
140140

141+
TEST_F(TestCast, FromBoolean) {
142+
CastOptions options;
143+
144+
vector<bool> is_valid(20, true);
145+
is_valid[3] = false;
146+
147+
vector<bool> v1(is_valid.size(), true);
148+
vector<int32_t> e1(is_valid.size(), 1);
149+
for (size_t i = 0; i < v1.size(); ++i) {
150+
if (i % 3 == 1) {
151+
v1[i] = false;
152+
e1[i] = 0;
153+
}
154+
}
155+
156+
CheckCase<BooleanType, bool, Int32Type, int32_t>(boolean(), v1, is_valid, int32(), e1,
157+
options);
158+
}
159+
141160
TEST_F(TestCast, ToBoolean) {
142161
CastOptions options;
143162
for (auto type : kNumericTypes) {

cpp/src/arrow/python/numpy_to_arrow.cc

Lines changed: 29 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ namespace arrow {
6363

6464
using internal::checked_cast;
6565
using internal::CopyBitmap;
66+
using internal::GenerateBitsUnrolled;
6667

6768
namespace py {
6869

@@ -246,6 +247,11 @@ class NumPyConverter {
246247
return Status::OK();
247248
}
248249

250+
// Called before ConvertData to ensure Numpy input buffer is in expected
251+
// Arrow layout
252+
template <typename ArrowType>
253+
Status PrepareInputData(std::shared_ptr<Buffer>* data);
254+
249255
// ----------------------------------------------------------------------
250256
// Traditional visitor conversion for non-object arrays
251257

@@ -407,14 +413,32 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po
407413
} // namespace
408414

409415
template <typename ArrowType>
410-
inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
416+
inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
411417
if (is_strided()) {
412418
RETURN_NOT_OK(CopyStridedArray<ArrowType>(arr_, length_, pool_, data));
419+
} else if (dtype_->type_num == NPY_BOOL) {
420+
int64_t nbytes = BitUtil::BytesForBits(length_);
421+
std::shared_ptr<Buffer> buffer;
422+
RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer));
423+
424+
Ndarray1DIndexer<uint8_t> values(arr_);
425+
int64_t i = 0;
426+
const auto generate = [&values, &i]() -> bool { return values[i++] > 0; };
427+
GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);
428+
429+
*data = buffer;
413430
} else {
414431
// Can zero-copy
415432
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
416433
}
417434

435+
return Status::OK();
436+
}
437+
438+
template <typename ArrowType>
439+
inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
440+
RETURN_NOT_OK(PrepareInputData<ArrowType>(data));
441+
418442
std::shared_ptr<DataType> input_type;
419443
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
420444

@@ -426,38 +450,12 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
426450
return Status::OK();
427451
}
428452

429-
template <>
430-
inline Status NumPyConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
431-
int64_t nbytes = BitUtil::BytesForBits(length_);
432-
std::shared_ptr<Buffer> buffer;
433-
RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer));
434-
435-
Ndarray1DIndexer<uint8_t> values(arr_);
436-
437-
uint8_t* bitmap = buffer->mutable_data();
438-
439-
memset(bitmap, 0, nbytes);
440-
for (int i = 0; i < length_; ++i) {
441-
if (values[i] > 0) {
442-
BitUtil::SetBit(bitmap, i);
443-
}
444-
}
445-
446-
*data = buffer;
447-
return Status::OK();
448-
}
449-
450453
template <>
451454
inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
452-
if (is_strided()) {
453-
RETURN_NOT_OK(CopyStridedArray<Date32Type>(arr_, length_, pool_, data));
454-
} else {
455-
// Can zero-copy
456-
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
457-
}
458-
459455
std::shared_ptr<DataType> input_type;
460456

457+
RETURN_NOT_OK(PrepareInputData<Date32Type>(data));
458+
461459
auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
462460
if (dtype_->type_num == NPY_DATETIME) {
463461
// If we have inbound datetime64[D] data, this needs to be downcasted
@@ -489,17 +487,11 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
489487

490488
template <>
491489
inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
492-
if (is_strided()) {
493-
RETURN_NOT_OK(CopyStridedArray<Date64Type>(arr_, length_, pool_, data));
494-
} else {
495-
// Can zero-copy
496-
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
497-
}
498-
499490
constexpr int64_t kMillisecondsInDay = 86400000;
500-
501491
std::shared_ptr<DataType> input_type;
502492

493+
RETURN_NOT_OK(PrepareInputData<Date64Type>(data));
494+
503495
auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
504496
if (dtype_->type_num == NPY_DATETIME) {
505497
// If we have inbound datetime64[D] data, this needs to be downcasted

cpp/src/arrow/python/type_traits.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ template <>
149149
struct arrow_traits<Type::BOOL> {
150150
static constexpr int npy_type = NPY_BOOL;
151151
static constexpr bool supports_nulls = false;
152+
typedef typename npy_traits<NPY_BOOL>::value_type T;
152153
};
153154

154155
#define INT_DECL(TYPE) \

python/pyarrow/tests/test_convert_pandas.py

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,13 @@ def _check_array_roundtrip(values, expected=None, mask=None,
113113
else:
114114
assert arr.null_count == (mask | values_nulls).sum()
115115

116-
if mask is None:
117-
tm.assert_series_equal(pd.Series(result), pd.Series(values),
118-
check_names=False)
119-
else:
120-
expected = pd.Series(np.ma.masked_array(values, mask=mask))
121-
tm.assert_series_equal(pd.Series(result), expected,
122-
check_names=False)
116+
if expected is None:
117+
if mask is None:
118+
expected = pd.Series(values)
119+
else:
120+
expected = pd.Series(np.ma.masked_array(values, mask=mask))
121+
122+
tm.assert_series_equal(pd.Series(result), expected, check_names=False)
123123

124124

125125
def _check_array_from_pandas_roundtrip(np_array, type=None):
@@ -559,6 +559,11 @@ def test_float_nulls_to_ints(self):
559559
assert table[0].to_pylist() == [1, 2, None]
560560
tm.assert_frame_equal(df, table.to_pandas())
561561

562+
def test_float_nulls_to_boolean(self):
563+
s = pd.Series([0.0, 1.0, 2.0, None, -3.0])
564+
expected = pd.Series([False, True, True, None, True])
565+
_check_array_roundtrip(s, expected=expected, type=pa.bool_())
566+
562567
def test_integer_no_nulls(self):
563568
data = OrderedDict()
564569
fields = []
@@ -672,6 +677,26 @@ def test_boolean_nulls(self):
672677

673678
tm.assert_frame_equal(result, ex_frame)
674679

680+
def test_boolean_to_int(self):
681+
# test from dtype=bool
682+
s = pd.Series([True, True, False, True, True] * 2)
683+
expected = pd.Series([1, 1, 0, 1, 1] * 2)
684+
_check_array_roundtrip(s, expected=expected, type=pa.int64())
685+
686+
def test_boolean_objects_to_int(self):
687+
# test from dtype=object
688+
s = pd.Series([True, True, False, True, True] * 2, dtype=object)
689+
expected = pd.Series([1, 1, 0, 1, 1] * 2)
690+
expected_msg = 'Expected integer, got bool'
691+
with pytest.raises(pa.ArrowTypeError, match=expected_msg):
692+
_check_array_roundtrip(s, expected=expected, type=pa.int64())
693+
694+
def test_boolean_nulls_to_float(self):
695+
# test from dtype=object
696+
s = pd.Series([True, True, False, None, True] * 2)
697+
expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2)
698+
_check_array_roundtrip(s, expected=expected, type=pa.float64())
699+
675700
def test_float_object_nulls(self):
676701
arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
677702
df = pd.DataFrame({'floats': arr})

0 commit comments

Comments
 (0)