diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index f1f3d79eed61e..77568f3bcb244 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -25,8 +25,7 @@ numbers. Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` -implemented within pandas. It is not the default dtype for integers, and will not be inferred; -you must explicitly pass the dtype into :meth:`array` or :class:`Series`: +implemented within pandas. .. ipython:: python @@ -50,17 +49,34 @@ NumPy array. You can also pass the list-like object to the :class:`Series` constructor with the dtype. -.. ipython:: python +.. warning:: - s = pd.Series([1, 2, np.nan], dtype="Int64") - s + Currently :meth:`pandas.array` and :class:`pandas.Series` use different + rules for dtype inference. :meth:`pandas.array` will infer a + nullable-integer dtype: -By default (if you don't specify ``dtype``), NumPy is used, and you'll end -up with a ``float64`` dtype Series: + .. ipython:: python -.. ipython:: python + pd.array([1, None]) + pd.array([1, 2]) + + For backwards-compatibility, :class:`Series` infers these as either + integer or float dtype: + + .. ipython:: python + + pd.Series([1, None]) + pd.Series([1, 2]) - pd.Series([1, 2, np.nan]) + We recommend explicitly providing the dtype to avoid confusion. + + .. ipython:: python + + pd.array([1, None], dtype="Int64") + pd.Series([1, None], dtype="Int64") + + In the future, we may provide an option for :class:`Series` to infer a + nullable-integer dtype. Operations involving an integer array will behave similar to NumPy arrays. Missing values will be propagated, and the data will be coerced to another @@ -68,6 +84,8 @@ dtype if needed. .. 
ipython:: python + s = pd.Series([1, 2, None], dtype="Int64") + # arithmetic s + 1 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b45bec37e84eb..470209a7f4a33 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -303,6 +303,38 @@ The following methods now also correctly output values for unobserved categories df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() +:meth:`pandas.array` inference changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`pandas.array` now infers pandas' new extension types in several cases (:issue:`29791`): + +1. String data (including missing values) now returns a :class:`arrays.StringArray`. +2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`. +3. Boolean data (including missing values) now returns the new :class:`arrays.BooleanArray`. + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.array(["a", None]) + <PandasArray> + ['a', None] + Length: 2, dtype: object + + >>> pd.array([1, None]) + <PandasArray> + [1, None] + Length: 2, dtype: object + + +*pandas 1.0.0* + +.. ipython:: python + + pd.array(["a", None]) + pd.array([1, None]) + +As a reminder, you can specify the ``dtype`` to disable all inference. By default :meth:`Categorical.min` now returns the minimum instead of np.nan ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -408,7 +440,6 @@ Other API changes - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter. Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) -- .. 
_whatsnew_1000.api.documentation: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 41c15ab4de5e1..eb08a22b8c34f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1313,7 +1313,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: elif isinstance(val, str): if is_string_array(values, skipna=skipna): - return 'string' + return "string" elif isinstance(val, bytes): if is_bytes_array(values, skipna=skipna): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c0b08beead0ca..dc537d50b3419 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -94,10 +94,19 @@ def array( :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` + :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`str` :class:`pandas.arrays.StringArray` + :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== For all other cases, NumPy's usual inference rules will be used. + .. versionchanged:: 1.0.0 + + Pandas infers nullable-integer dtype for integer data, + string dtype for string data, and nullable-boolean dtype + for boolean data. + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -154,14 +163,6 @@ def array( ['a', 'b'] Length: 2, dtype: str32 - Or use the dedicated constructor for the array you're expecting, and - wrap that in a PandasArray - - >>> pd.array(np.array(['a', 'b'], dtype='<U1')) - <PandasArray> - ['a', 'b'] - Length: 2, dtype: str32 - Finally, Pandas has arrays that mostly overlap with NumPy * :class:`arrays.DatetimeArray` @@ -184,20 +185,28 @@ def array( Examples -------- - If a dtype is not specified, `data` is passed through to - :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. 
+ If a dtype is not specified, pandas will infer the best dtype from the values. + See the description of `dtype` for the types pandas infers. >>> pd.array([1, 2]) - + [1, 2] - Length: 2, dtype: int64 + Length: 2, dtype: Int64 - Or the NumPy dtype can be specified + >>> pd.array([1, 2, np.nan]) + + [1, 2, NaN] + Length: 3, dtype: Int64 - >>> pd.array([1, 2], dtype=np.dtype("int32")) - - [1, 2] - Length: 2, dtype: int32 + >>> pd.array(["a", None, "c"]) + + ['a', nan, 'c'] + Length: 3, dtype: string + + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + + ['2000-01-01', '2000-01-01'] + Length: 2, dtype: period[D] You can use the string alias for `dtype` @@ -212,29 +221,24 @@ def array( [a, b, a] Categories (3, object): [a < b < c] - Because omitting the `dtype` passes the data through to NumPy, - a mixture of valid integers and NA will return a floating-point - NumPy array. + If pandas does not infer a dedicated extension type, a + :class:`arrays.PandasArray` is returned. - >>> pd.array([1, 2, np.nan]) + >>> pd.array([1.1, 2.2]) - [1.0, 2.0, nan] - Length: 3, dtype: float64 - - To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify - the dtype: + [1.1, 2.2] + Length: 2, dtype: float64 - >>> pd.array([1, 2, np.nan], dtype='Int64') - - [1, 2, NaN] - Length: 3, dtype: Int64 + As mentioned in the "Notes" section, new extension types may be added + in the future (by pandas or 3rd party libraries), causing the return + value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype` + as a NumPy dtype if you need to ensure there's no future change in + behavior. - Pandas will infer an ExtensionArray for some types of data: - - >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) - - ['2000-01-01', '2000-01-01'] - Length: 2, dtype: period[D] + >>> pd.array([1, 2], dtype=np.dtype("int32")) + + [1, 2] + Length: 2, dtype: int32 `data` must be 1-dimensional. 
A ValueError is raised when the input has the wrong dimensionality. @@ -246,21 +250,26 @@ def array( """ from pandas.core.arrays import ( period_array, + BooleanArray, + IntegerArray, IntervalArray, PandasArray, DatetimeArray, TimedeltaArray, + StringArray, ) if lib.is_scalar(data): msg = "Cannot pass scalar '{}' to 'pandas.array'." raise ValueError(msg.format(data)) - data = extract_array(data, extract_numpy=True) - - if dtype is None and isinstance(data, ABCExtensionArray): + if dtype is None and isinstance( + data, (ABCSeries, ABCIndexClass, ABCExtensionArray) + ): dtype = data.dtype + data = extract_array(data, extract_numpy=True) + # this returns None for not-found dtypes. if isinstance(dtype, str): dtype = registry.find(dtype) or dtype @@ -270,7 +279,7 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=False) + inferred_dtype = lib.infer_dtype(data, skipna=True) if inferred_dtype == "period": try: return period_array(data, copy=copy) @@ -298,7 +307,14 @@ def array( # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) - # TODO(BooleanArray): handle this type + elif inferred_dtype == "string": + return StringArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "integer": + return IntegerArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "boolean": + return BooleanArray._from_sequence(data, copy=copy) # Pandas overrides NumPy for # 1. datetime64[ns] diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 6f443f1841dcc..479f8dbad0418 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -19,14 +19,18 @@ "data, dtype, expected", [ # Basic NumPy defaults. 
- ([1, 2], None, PandasArray(np.array([1, 2]))), + ([1, 2], None, pd.arrays.IntegerArray._from_sequence([1, 2])), ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ( [1, 2], np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), + ( + np.array([1, 2], dtype="int64"), + None, + pd.arrays.IntegerArray._from_sequence([1, 2]), + ), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias @@ -113,6 +117,20 @@ # IntegerNA ([1, None], "Int16", integer_array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), + # String + (["a", None], "string", pd.arrays.StringArray._from_sequence(["a", None])), + ( + ["a", None], + pd.StringDtype(), + pd.arrays.StringArray._from_sequence(["a", None]), + ), + # Boolean + ([True, None], "boolean", pd.arrays.BooleanArray._from_sequence([True, None])), + ( + [True, None], + pd.BooleanDtype(), + pd.arrays.BooleanArray._from_sequence([True, None]), + ), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -139,15 +157,15 @@ def test_array(data, dtype, expected): def test_array_copy(): a = np.array([1, 2]) # default is to copy - b = pd.array(a) + b = pd.array(a, dtype=a.dtype) assert np.shares_memory(a, b._ndarray) is False # copy=True - b = pd.array(a, copy=True) + b = pd.array(a, dtype=a.dtype, copy=True) assert np.shares_memory(a, b._ndarray) is False # copy=False - b = pd.array(a, copy=False) + b = pd.array(a, dtype=a.dtype, copy=False) assert np.shares_memory(a, b._ndarray) is True @@ -211,6 +229,15 @@ def test_array_copy(): np.array([1, 2], dtype="m8[us]"), pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), ), + # integer + ([1, 2], pd.arrays.IntegerArray._from_sequence([1, 2])), + ([1, None], pd.arrays.IntegerArray._from_sequence([1, None])), + # 
string + (["a", "b"], pd.arrays.StringArray._from_sequence(["a", "b"])), + (["a", None], pd.arrays.StringArray._from_sequence(["a", None])), + # Boolean + ([True, False], pd.arrays.BooleanArray._from_sequence([True, False])), + ([True, None], pd.arrays.BooleanArray._from_sequence([True, None])), ], ) def test_array_inference(data, expected): @@ -241,7 +268,7 @@ def test_array_inference_fails(data): @pytest.mark.parametrize("data", [np.array([[1, 2], [3, 4]]), [[1, 2], [3, 4]]]) def test_nd_raises(data): with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"): - pd.array(data) + pd.array(data, dtype="int64") def test_scalar_raises(): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 53e979d12a56d..75e86a2ee7ecc 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -732,12 +732,17 @@ def test_string(self): def test_unicode(self): arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=False) + # This currently returns "mixed", but it's not clear that's optimal. 
+ # This could also return "string" or "mixed-string" assert result == "mixed" arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) - expected = "string" - assert result == expected + assert result == "string" + + arr = ["a", "c"] + result = lib.infer_dtype(arr, skipna=False) + assert result == "string" @pytest.mark.parametrize( "dtype, missing, skipna, expected", diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index d491e9f25c897..b27e7c217c4c2 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -615,12 +615,12 @@ def test_constructor_no_pandas_array(self): def test_add_column_with_pandas_array(self): # GH 26390 df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - df["c"] = pd.array([1, 2, None, 3]) + df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) df2 = pd.DataFrame( { "a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], - "c": pd.array([1, 2, None, 3]), + "c": pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)), } ) assert type(df["c"]._data.blocks[0]) == ObjectBlock diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index abe2ddf955ad8..551782d0b363a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1268,7 +1268,7 @@ def test_block_shape(): def test_make_block_no_pandas_array(): # https://github.com/pandas-dev/pandas/pull/24866 - arr = pd.array([1, 2]) + arr = pd.arrays.PandasArray(np.array([1, 2])) # PandasArray, no dtype result = make_block(arr, slice(len(arr))) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 977e7ded1f1a7..92d72706f3dec 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -251,7 +251,7 @@ def __add__(self, other): @pytest.mark.parametrize( "values", [ - pd.array([1, 3, 2]), + pd.array([1, 3, 2], 
dtype="int64"), pd.array([1, 10, 0], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"),