Skip to content

Commit

Permalink
Remove skip when numpy on test_convert_builtin.py and add individual test skips
Browse files Browse the repository at this point in the history
  • Loading branch information
raulcd committed Aug 1, 2024
1 parent 3584d5e commit ef97ad3
Showing 1 changed file with 92 additions and 55 deletions.
147 changes: 92 additions & 55 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,25 @@
try:
import numpy as np
except ImportError:
pytest.skip(reason="Failures on test collection due to numpy NOT enabled",
allow_module_level=True)
np = None

from pyarrow.pandas_compat import _pandas_api # noqa
import pyarrow as pa
import pyarrow.tests.strategies as past


int_type_pairs = [
(np.int8, pa.int8()),
(np.int16, pa.int16()),
(np.int32, pa.int32()),
(np.int64, pa.int64()),
(np.uint8, pa.uint8()),
(np.uint16, pa.uint16()),
(np.uint32, pa.uint32()),
(np.uint64, pa.uint64())]
("int8", pa.int8()),
("int16", pa.int16()),
("int32", pa.int32()),
("int64", pa.int64()),
("uint8", pa.uint8()),
("uint16", pa.uint16()),
("uint32", pa.uint32()),
("uint64", pa.uint64())]


np_int_types, pa_int_types = zip(*int_type_pairs)
np_str_int_types, pa_int_types = zip(*int_type_pairs)


class StrangeIterable:
Expand Down Expand Up @@ -178,7 +177,9 @@ def _as_set(xs):
return set(xs)


SEQUENCE_TYPES = [_as_list, _as_tuple, _as_numpy_array]
SEQUENCE_TYPES = [_as_list, _as_tuple]
if np is not None:
SEQUENCE_TYPES.append(_as_numpy_array)
ITERABLE_TYPES = [_as_set, _as_dict_values] + SEQUENCE_TYPES
COLLECTIONS_TYPES = [_as_deque] + ITERABLE_TYPES

Expand Down Expand Up @@ -221,6 +222,7 @@ def test_sequence_boolean(seq):
assert arr.to_pylist() == expected


@pytest.mark.numpy
@parametrize_with_sequence_types
def test_sequence_numpy_boolean(seq):
expected = [np.bool_(True), None, np.bool_(False), None]
Expand All @@ -229,6 +231,7 @@ def test_sequence_numpy_boolean(seq):
assert arr.to_pylist() == [True, None, False, None]


@pytest.mark.numpy
@parametrize_with_sequence_types
def test_sequence_mixed_numpy_python_bools(seq):
values = np.array([True, False])
Expand Down Expand Up @@ -285,15 +288,16 @@ def test_list_with_non_list(seq):
@pytest.mark.parametrize("factory", [
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
def test_nested_arrays(seq, factory):
arr = pa.array(seq([np.array([], dtype=np.int64),
np.array([1, 2], dtype=np.int64), None]),
arr = pa.array(seq([pa.array([], type=pa.int64()),
pa.array([1, 2], type=pa.int64()), None]),
type=factory(pa.int64()))
assert len(arr) == 3
assert arr.null_count == 1
assert arr.type == factory(pa.int64())
assert arr.to_pylist() == [[], [1, 2], None]


@pytest.mark.numpy
@parametrize_with_sequence_types
def test_nested_fixed_size_list(seq):
# sequence of lists
Expand Down Expand Up @@ -338,10 +342,12 @@ def test_sequence_all_none(seq):
assert arr.to_pylist() == [None, None]


@pytest.mark.numpy
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer(seq, np_scalar_pa_type):
np_scalar, pa_type = np_scalar_pa_type
np_str_scalar, pa_type = np_scalar_pa_type
np_scalar = getattr(np, np_str_scalar)
expected = [1, None, 3, None,
np.iinfo(np_scalar).min, np.iinfo(np_scalar).max]
arr = pa.array(seq(expected), type=pa_type)
Expand All @@ -351,12 +357,12 @@ def test_sequence_integer(seq, np_scalar_pa_type):
assert arr.to_pylist() == expected


@pytest.mark.numpy
@parametrize_with_collections_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
@pytest.mark.parametrize("pa_type", pa_int_types)
def test_sequence_integer_np_nan(seq, pa_type):
# ARROW-2806: numpy.nan is a double value and thus should produce
# a double array.
_, pa_type = np_scalar_pa_type
with pytest.raises(ValueError):
pa.array(seq([np.nan]), type=pa_type, from_pandas=False)

Expand All @@ -368,12 +374,12 @@ def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
assert arr.to_pylist() == expected


@pytest.mark.numpy
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
@pytest.mark.parametrize("pa_type", pa_int_types)
def test_sequence_integer_nested_np_nan(seq, pa_type):
# ARROW-2806: numpy.nan is a double value and thus should produce
# a double array.
_, pa_type = np_scalar_pa_type
with pytest.raises(ValueError):
pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)

Expand All @@ -395,10 +401,12 @@ def test_sequence_integer_inferred(seq):
assert arr.to_pylist() == expected


@pytest.mark.numpy
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer(seq, np_scalar_pa_type):
np_scalar, pa_type = np_scalar_pa_type
np_str_scalar, pa_type = np_scalar_pa_type
np_scalar = getattr(np, np_str_scalar)
expected = [np_scalar(1), None, np_scalar(3), None,
np_scalar(np.iinfo(np_scalar).min),
np_scalar(np.iinfo(np_scalar).max)]
Expand All @@ -409,10 +417,12 @@ def test_sequence_numpy_integer(seq, np_scalar_pa_type):
assert arr.to_pylist() == expected


@pytest.mark.numpy
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
np_scalar, pa_type = np_scalar_pa_type
np_str_scalar, pa_type = np_scalar_pa_type
np_scalar = getattr(np, np_str_scalar)
expected = [np_scalar(1), None, np_scalar(3), None]
expected += [np_scalar(np.iinfo(np_scalar).min),
np_scalar(np.iinfo(np_scalar).max)]
Expand All @@ -438,6 +448,7 @@ def test_broken_integers(seq):
pa.array(seq(data), type=pa.int64())


@pytest.mark.numpy
def test_numpy_scalars_mixed_type():
# ARROW-4324
data = [np.int32(10), np.float32(0.5)]
Expand All @@ -452,6 +463,7 @@ def test_numpy_scalars_mixed_type():
assert arr.equals(expected)


@pytest.mark.numpy
@pytest.mark.xfail(reason="Type inference for uint64 not implemented",
raises=OverflowError)
def test_uint64_max_convert():
Expand Down Expand Up @@ -495,7 +507,7 @@ def test_integer_from_string_error(seq, typ):

def test_convert_with_mask():
data = [1, 2, 3, 4, 5]
mask = np.array([False, True, False, False, True])
mask = [False, True, False, False, True]

result = pa.array(data, mask=mask)
expected = pa.array([1, None, 3, 4, None])
Expand Down Expand Up @@ -563,6 +575,7 @@ def test_double_integer_coerce_representable_range():
pa.array(invalid_values2)


@pytest.mark.numpy
def test_float32_integer_coerce_representable_range():
f32 = np.float32
valid_values = [f32(1.5), 1 << 24, -(1 << 24)]
Expand Down Expand Up @@ -591,14 +604,16 @@ def test_mixed_sequence_errors():
pa.array([1.5, 'foo'])


@pytest.mark.numpy
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar,pa_type", [
(np.float16, pa.float16()),
(np.float32, pa.float32()),
(np.float64, pa.float64())
@pytest.mark.parametrize("np_str_scalar,pa_type", [
("float16", pa.float16()),
("float32", pa.float32()),
("float64", pa.float64())
])
@pytest.mark.parametrize("from_pandas", [True, False])
def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
def test_sequence_numpy_double(seq, np_str_scalar, pa_type, from_pandas):
np_scalar = getattr(np, np_str_scalar)
data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
arr = pa.array(seq(data), from_pandas=from_pandas)
assert len(arr) == 6
Expand All @@ -620,27 +635,29 @@ def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
assert np.isnan(arr.to_pylist()[5])


@pytest.mark.numpy
@pytest.mark.parametrize("from_pandas", [True, False])
@pytest.mark.parametrize("inner_seq", [np.array, list])
def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
def test_ndarray_nested_numpy_double(from_pandas):
# ARROW-2806
data = np.array([
inner_seq([1., 2.]),
inner_seq([1., 2., 3.]),
inner_seq([np.nan]),
None
], dtype=object)
arr = pa.array(data, from_pandas=from_pandas)
assert len(arr) == 4
assert arr.null_count == 1
assert arr.type == pa.list_(pa.float64())
if from_pandas:
assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
else:
np.testing.assert_equal(arr.to_pylist(),
[[1., 2.], [1., 2., 3.], [np.nan], None])
for inner_seq in (np.array, list):
data = np.array([
inner_seq([1., 2.]),
inner_seq([1., 2., 3.]),
inner_seq([np.nan]),
None
], dtype=object)
arr = pa.array(data, from_pandas=from_pandas)
assert len(arr) == 4
assert arr.null_count == 1
assert arr.type == pa.list_(pa.float64())
if from_pandas:
assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
else:
np.testing.assert_equal(arr.to_pylist(),
[[1., 2.], [1., 2., 3.], [np.nan], None])


@pytest.mark.numpy
def test_nested_ndarray_in_object_array():
# ARROW-4350
arr = np.empty(2, dtype=object)
Expand Down Expand Up @@ -668,6 +685,7 @@ def test_nested_ndarray_in_object_array():
assert result.to_pylist() == [[[1], [2]], [[1], [2]]]


@pytest.mark.numpy
@pytest.mark.xfail(reason=("Type inference for multidimensional ndarray "
"not yet implemented"),
raises=AssertionError)
Expand All @@ -686,6 +704,7 @@ def test_multidimensional_ndarray_as_nested_list():
assert result.equals(expected)


@pytest.mark.numpy
@pytest.mark.parametrize(('data', 'value_type'), [
([True, False], pa.bool_()),
([None, None], pa.null()),
Expand Down Expand Up @@ -715,6 +734,7 @@ def test_list_array_from_object_ndarray(data, value_type):
assert arr.to_pylist() == [data]


@pytest.mark.numpy
@pytest.mark.parametrize(('data', 'value_type'), [
([[1, 2], [3]], pa.list_(pa.int64())),
([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)),
Expand All @@ -734,13 +754,14 @@ def test_array_ignore_nan_from_pandas():
# See ARROW-4324, this reverts logic that was introduced in
# ARROW-2240
with pytest.raises(ValueError):
pa.array([np.nan, 'str'])
pa.array([float("nan"), 'str'])

arr = pa.array([np.nan, 'str'], from_pandas=True)
arr = pa.array([float("nan"), 'str'], from_pandas=True)
expected = pa.array([None, 'str'])
assert arr.equals(expected)


@pytest.mark.numpy
def test_nested_ndarray_different_dtypes():
data = [
np.array([1, 2, 3], dtype='int64'),
Expand Down Expand Up @@ -1242,6 +1263,7 @@ def test_sequence_timestamp_out_of_bounds_nanosecond():
assert arr.to_pylist()[0] == datetime.datetime(2262, 4, 12)


@pytest.mark.numpy
def test_sequence_numpy_timestamp():
data = [
np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
Expand Down Expand Up @@ -1411,14 +1433,25 @@ class CustomClass():
pa.array([1, CustomClass()], type=ty)


@pytest.mark.parametrize('np_scalar', [True, False])
def test_sequence_duration(np_scalar):
def test_sequence_duration():
td1 = datetime.timedelta(2, 3601, 1)
td2 = datetime.timedelta(1, 100, 1000)
if np_scalar:
data = [np.timedelta64(td1), None, np.timedelta64(td2)]
else:
data = [td1, None, td2]
data = [td1, None, td2]

arr = pa.array(data)
assert len(arr) == 3
assert arr.type == pa.duration('us')
assert arr.null_count == 1
assert arr[0].as_py() == td1
assert arr[1].as_py() is None
assert arr[2].as_py() == td2


@pytest.mark.numpy
def test_sequence_duration_np_scalar():
td1 = datetime.timedelta(2, 3601, 1)
td2 = datetime.timedelta(1, 100, 1000)
data = [np.timedelta64(td1), None, np.timedelta64(td2)]

arr = pa.array(data)
assert len(arr) == 3
Expand Down Expand Up @@ -1484,6 +1517,7 @@ def test_sequence_duration_nested_lists_with_explicit_type(factory):
assert arr.to_pylist() == data


@pytest.mark.numpy
def test_sequence_duration_nested_lists_numpy():
td1 = datetime.timedelta(1, 1, 1000)
td2 = datetime.timedelta(1, 100)
Expand Down Expand Up @@ -1773,6 +1807,7 @@ def test_struct_from_dicts_bytes_keys():
]


@pytest.mark.numpy
def test_struct_from_tuples():
ty = pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.string()),
Expand Down Expand Up @@ -1919,6 +1954,7 @@ def test_struct_from_mixed_sequence():
pa.array(data, type=ty)


@pytest.mark.numpy
def test_struct_from_dicts_inference():
expected_type = pa.struct([pa.field('a', pa.int64()),
pa.field('b', pa.string()),
Expand Down Expand Up @@ -1996,7 +2032,7 @@ def test_structarray_from_arrays_coerce():


def test_decimal_array_with_none_and_nan():
values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]
values = [decimal.Decimal('1.234'), None, float("nan"), decimal.Decimal('nan')]

with pytest.raises(TypeError):
# ARROW-6227: Without from_pandas=True, NaN is considered a float
Expand Down Expand Up @@ -2502,6 +2538,7 @@ def test_array_accepts_pyarrow_scalar(seq, data, scalar_data, value_type):
assert expect.equals(result)


@pytest.mark.numpy
@parametrize_with_collections_types
def test_array_accepts_pyarrow_scalar_errors(seq):
sequence = seq([pa.scalar(1), pa.scalar("a"), pa.scalar(3.0)])
Expand Down

0 comments on commit ef97ad3

Please sign in to comment.