From 1c6b7868f65042cdfe5cbc36b4021030772fc959 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Fri, 3 Jan 2020 01:30:20 +0000 Subject: [PATCH 01/31] Merge master Co-authored-by: Luca Ionescu --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/json/_json.py | 4 ++++ pandas/tests/io/json/test_pandas.py | 11 +++++++++++ 3 files changed, 16 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 788cb3db51d8a..09058efc386a6 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -545,6 +545,7 @@ Deprecations - :func:`pandas.json_normalize` is now exposed in the top-level namespace. Usage of ``json_normalize`` as ``pandas.io.json.json_normalize`` is now deprecated and it is recommended to use ``json_normalize`` as :func:`pandas.json_normalize` instead (:issue:`27586`). +- The ``numpy`` argument of :meth:`pandas.read_json` is deprecated (:issue:`28512`). - :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_feather`, and :meth:`DataFrame.to_parquet` argument "fname" is deprecated, use "path" instead (:issue:`23574`) - The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index f5008f0c311ad..d85d75453dbde 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -11,6 +11,7 @@ from pandas._libs.tslibs import iNaT from pandas._typing import JSONSerializable from pandas.errors import AbstractMethodError +from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import ensure_str, is_period_dtype @@ -346,6 +347,7 @@ def _write( return serialized +@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None) def read_json( path_or_buf=None, orient=None, @@ -459,6 +461,8 @@ def read_json( non-numeric column and index labels are supported. Note also that the JSON ordering MUST be the same for each term if numpy=True. + .. deprecated:: 1.0.0 + precise_float : bool, default False Set to enable usage of higher precision (strtod) function when decoding string to double values. 
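For context, the user-facing effect of this deprecation looks roughly like the following sketch (assuming a pandas build that already contains this patch; the round-trip mirrors the test added below, and the warning comes from the ``deprecate_kwarg`` decorator)::

    import warnings

    import pandas as pd

    df = pd.DataFrame([1, 2, 3])

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Passing numpy=True still works, but now emits a FutureWarning.
        result = pd.read_json(df.to_json(), numpy=True)

    assert any(issubclass(w.category, FutureWarning) for w in caught)
    pd.testing.assert_frame_equal(result, df)
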
Default (False) is to use fast but diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 532d5215be902..aec66b9f06518 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -3,6 +3,7 @@ from io import StringIO import json import os +from warnings import catch_warnings, filterwarnings import numpy as np import pytest @@ -1606,3 +1607,13 @@ def test_emca_262_nan_inf_support(self): ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] ) tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") + def test_deprecate_numpy_argument_read_json(self): + # https://github.com/pandas-dev/pandas/issues/28512 + expected = DataFrame([1, 2, 3]) + with tm.assert_produces_warning(None): + with catch_warnings(): + filterwarnings("ignore", category=FutureWarning) + result = read_json(expected.to_json(), numpy=True) + tm.assert_frame_equal(result, expected) From 42a46d79ccc2fff2f38428fb8eee9309ddecaeb2 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 4 Jan 2020 16:46:26 +0000 Subject: [PATCH 02/31] Fix test failures ignore FutureWarning --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 96af78c77feb8..5bab4ae8e4806 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,6 +66,7 @@ xfail_strict = True filterwarnings = error:Sparse:FutureWarning error:The SparseArray:FutureWarning + ignore: the 'numpy' keyword is deprecated:FutureWarning [coverage:run] branch = False From 8331d065c270308893d2ca79c71d47b659f2d400 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 4 Jan 2020 16:47:12 +0000 Subject: [PATCH 03/31] Filter warning correctly --- pandas/tests/io/json/test_pandas.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index aec66b9f06518..72f00df2e7b55 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1608,12 +1608,10 @@ def test_emca_262_nan_inf_support(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") + @pytest.mark.filterwarnings("ignore:the 'numpy' keyword:FutureWarning") def test_deprecate_numpy_argument_read_json(self): - # https://github.com/pandas-dev/pandas/issues/28512 + # GH 28512 expected = DataFrame([1, 2, 3]) - with tm.assert_produces_warning(None): - with catch_warnings(): - filterwarnings("ignore", category=FutureWarning) - result = read_json(expected.to_json(), numpy=True) - tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning): + result = read_json(expected.to_json(), numpy=True) + tm.assert_frame_equal(result, expected) From 3ba4169d370a6a152b6d4f7a3ac0fa1d00609a86 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 4 Jan 2020 17:18:39 +0000 Subject: [PATCH 04/31] Fix imports --- pandas/tests/io/json/test_pandas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 72f00df2e7b55..00394e626b408 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -3,7 +3,6 @@ from io import StringIO import json import os -from warnings import catch_warnings, filterwarnings import numpy as np import pytest From 5068771a1c68bc6bb862422906620415fc05a234 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 4 Jan 2020 19:10:45 +0000 Subject: [PATCH 05/31] Add warning 
annotation --- pandas/tests/io/json/test_pandas.py | 1 + setup.cfg | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 8e8e361dea891..6076ee3c13d9f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -39,6 +39,7 @@ def assert_json_roundtrip_equal(result, expected, orient): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: @pytest.fixture(scope="function", autouse=True) def setup(self, datapath): diff --git a/setup.cfg b/setup.cfg index 5b93864ed92e8..f813d1296b047 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,7 +66,6 @@ xfail_strict = True filterwarnings = error:Sparse:FutureWarning error:The SparseArray:FutureWarning - ignore: the 'numpy' keyword is deprecated:FutureWarning [coverage:run] branch = False From 8d65aa7e163f6dddebab2ef021a491ae0b3b8d24 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 4 Jan 2020 19:12:32 +0000 Subject: [PATCH 06/31] Remove unrequired annotation --- pandas/tests/io/json/test_pandas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6076ee3c13d9f..e909a4952948c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1608,7 +1608,6 @@ def test_emca_262_nan_inf_support(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.filterwarnings("ignore:the 'numpy' keyword:FutureWarning") def test_deprecate_numpy_argument_read_json(self): # GH 28512 expected = DataFrame([1, 2, 3]) From cb74fe351d0881cea36033551ccace4826b764f5 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sun, 5 Jan 2020 01:11:15 +0000 Subject: [PATCH 07/31] Update docs --- doc/source/user_guide/io.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 82e01b62efbb9..9f99f36b6007d 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2066,6 +2066,8 @@ The Numpy parameter +++++++++++++++++++ .. note:: + This param has been deprecated as of version 1.0.0 and will raise a ``FutureWarning``. + This supports numeric data only. Index and columns labels may be non-numeric, e.g. strings, dates etc. If ``numpy=True`` is passed to ``read_json`` an attempt will be made to sniff @@ -2088,6 +2090,7 @@ data: %timeit pd.read_json(jsonfloats) .. ipython:: python + :okwarning: %timeit pd.read_json(jsonfloats, numpy=True) @@ -2102,6 +2105,7 @@ The speedup is less noticeable for smaller datasets: %timeit pd.read_json(jsonfloats) .. 
ipython:: python + :okwarning: %timeit pd.read_json(jsonfloats, numpy=True) From b29404e4316504325939f8f15e7fce09f6ce3ccc Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Thu, 16 Jan 2020 00:34:08 +0000 Subject: [PATCH 08/31] Create deepsource.toml --- deepsource.toml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 deepsource.toml diff --git a/deepsource.toml b/deepsource.toml new file mode 100644 index 0000000000000..25bc3d76d21fe --- /dev/null +++ b/deepsource.toml @@ -0,0 +1,8 @@ +version = 1 + +[[analyzers]] +name = "python" +enabled = true + + [analyzers.meta] + runtime_version = "3.x.x" From f983f4f3de60a147a50a64a6100066b1ef37aef3 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 16 Sep 2020 23:05:14 +0100 Subject: [PATCH 09/31] Commit Complex handling --- pandas/core/algorithms.py | 9 +-- pandas/tests/test_complex.py | 125 +++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 7 deletions(-) create mode 100644 pandas/tests/test_complex.py diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f297c7165208f..4d1592fc07fc3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -117,13 +117,8 @@ def _ensure_data( elif is_float_dtype(values) or is_float_dtype(dtype): return ensure_float64(values), np.dtype("float64") elif is_complex_dtype(values) or is_complex_dtype(dtype): - - # ignore the fact that we are casting to float - # which discards complex parts - with catch_warnings(): - simplefilter("ignore", np.ComplexWarning) - values = ensure_float64(values) - return values, np.dtype("float64") + # Complex dtype is not supported coerce to object + return ensure_object(values), np.dtype("complex64") except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype diff --git a/pandas/tests/test_complex.py b/pandas/tests/test_complex.py new file mode 100644 index 0000000000000..7618025489d58 --- /dev/null +++ b/pandas/tests/test_complex.py @@ -0,0 +1,125 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series +import pandas.util.testing as tm + + +class TestBasicComplexSupport: + @pytest.mark.parametrize( + "array,expected", + [ ( + [1 + 1j, 0, 1, 1j, 1 + 2j], + Series([1, 1, 1, 1, 1], index=[1 + 2j, 1 + 1j, 1j, 1, 0]), + ), + ( + [1 + 2j, 0, 1j, 1, 1j, 1 + 1j], + # index is sorted by value counts in descending order by default + Series([2, 1, 1, 1, 1], index=[1j, 1 + 2j, 1 + 1j, 1, 0]), + ), + ], + ) + def test_value_counts(self, array, expected): + result = pd.value_counts(array) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "array,expected", + [ + ( + [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j], + np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)]), + ) + ], + ) + def test_unique(self, array, expected): + result = pd.unique(array) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "array,expected", + [ + ( + [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], + Series([False, False, True, False, False, False, True], dtype=bool), + ) + ], + ) + def test_duplicated(self, array, expected): + result = Series(array, dtype=np.complex64).duplicated() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "array,expected", + [ + ( + [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], + Series([False, True, True, False, True, True, True], dtype=bool), + ) + ], + ) + def test_isin(self, array, expected): + result = Series(array).isin([1j, 1 + 1j, 1 + 2j]) + tm.assert_series_equal(result, expected) + + def 
test_factorize(self): + array = [1, 2, 2 + 1j] + labels, uniques = pd.factorize(array) + + expected_labels = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(labels, expected_labels) + + expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=np.complex64) + tm.assert_numpy_array_equal(uniques, expected_uniques) + + @pytest.mark.parametrize( + "frame,expected", + [ + ( + DataFrame([{"a": 1, "b": 1 + 1j}, {"a": 1, "b": 1 + 2j}]), + DataFrame( + np.array([1, 1], dtype=np.int64), + index=Index([(1 + 1j), (1 + 2j)], dtype="object", name="b"), + columns=Index(["a"], dtype="object"), + ), + ) + ], + ) + def test_groupby(self, frame, expected): + result = frame.groupby("b", sort=False).count() + tm.assert_frame_equal(result, expected) + + # sorting of the index should fail since complex numbers are unordered + with pytest.raises(TypeError, match="'<' not supported between instances of 'complex' and 'complex'"): + frame.groupby("b", sort=True).count() + + @pytest.mark.parametrize( + "array,expected", + [ + ([0, 1j, 1, 1, 1 + 1j, 1 + 2j], Series([1], dtype=np.complex128)), + ([1 + 1j, 2j, 1 + 1j], Series([1 + 1j], dtype=np.complex128)), + ], + ) + def test_unimode(self, array, expected): + result = Series(array).mode() + tm.assert_series_equal(result, expected) + + # mode tries to sort multimodal series. + # A warning will be raised since complex numbers + # are not ordered. + @pytest.mark.parametrize( + "array,expected", + [ + ( + # no modes + [0, 1j, 1, 1 + 1j, 1 + 2j], + Series([0, 1, 1j, 1 + 1j, 1 + 2j], dtype=np.complex128), + ), + ([1 + 1j, 2j, 1 + 1j, 2j, 3], Series([1 + 1j, 2j], dtype=np.complex128)), + ], + ) + def test_multimode(self, array, expected): + with tm.assert_produces_warning(UserWarning): + result = Series(array).mode() + tm.assert_series_equal(result, expected) From c2e4e821d167a034583f6a8d04bd819b7c3d5aca Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 19 Sep 2020 19:43:02 +0100 Subject: [PATCH 10/31] run black --- pandas/tests/test_complex.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_complex.py b/pandas/tests/test_complex.py index 7618025489d58..1f0d9d0cd7b4f 100644 --- a/pandas/tests/test_complex.py +++ b/pandas/tests/test_complex.py @@ -9,7 +9,8 @@ class TestBasicComplexSupport: @pytest.mark.parametrize( "array,expected", - [ ( + [ + ( [1 + 1j, 0, 1, 1j, 1 + 2j], Series([1, 1, 1, 1, 1], index=[1 + 2j, 1 + 1j, 1j, 1, 0]), ), @@ -91,7 +92,10 @@ def test_groupby(self, frame, expected): tm.assert_frame_equal(result, expected) # sorting of the index should fail since complex numbers are unordered - with pytest.raises(TypeError, match="'<' not supported between instances of 'complex' and 'complex'"): + with pytest.raises( + TypeError, + match="'<' not supported between instances of 'complex' and 'complex'", + ): frame.groupby("b", sort=True).count() @pytest.mark.parametrize( From 7c424957b6caa7ef61f2784eae67aefe03c9b157 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 19 Sep 2020 19:52:04 +0100 Subject: [PATCH 11/31] Use pandas.testing --- pandas/tests/test_complex.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/test_complex.py b/pandas/tests/test_complex.py index 1f0d9d0cd7b4f..a16d4db50c072 100644 --- a/pandas/tests/test_complex.py +++ b/pandas/tests/test_complex.py @@ -3,8 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, Series -import pandas.util.testing as tm - +import pandas.testing as tm class TestBasicComplexSupport: 
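The behaviour pinned down by these tests can be summarised in a short sketch (assuming a build that includes the ``_ensure_data`` change above, so complex input is routed through object dtype instead of being silently truncated to float64)::

    import pandas as pd

    # Counting keeps the complex values intact in the index; previously
    # the imaginary part was discarded under a silenced ComplexWarning.
    counts = pd.value_counts([1 + 2j, 0, 1j, 1, 1j, 1 + 1j])
    # 1j appears twice, every other value once.

    uniques = pd.unique([1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j])
    # expected (per the tests): array([1.+1.j, 0.+0.j, 1.+0.j, 0.+1.j, 1.+2.j])
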
@pytest.mark.parametrize( @@ -36,7 +35,7 @@ def test_value_counts(self, array, expected): ) def test_unique(self, array, expected): result = pd.unique(array) - tm.assert_numpy_array_equal(result, expected) + np.testing.assert_array_equal(result, expected) @pytest.mark.parametrize( "array,expected", @@ -69,10 +68,10 @@ def test_factorize(self): labels, uniques = pd.factorize(array) expected_labels = np.array([0, 1, 2], dtype=np.intp) - tm.assert_numpy_array_equal(labels, expected_labels) + np.testing.assert_array_equal(labels, expected_labels) expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=np.complex64) - tm.assert_numpy_array_equal(uniques, expected_uniques) + np.testing.assert_array_equal(uniques, expected_uniques) @pytest.mark.parametrize( "frame,expected", @@ -124,6 +123,6 @@ def test_unimode(self, array, expected): ], ) def test_multimode(self, array, expected): - with tm.assert_produces_warning(UserWarning): + with pytest.warns(UserWarning): result = Series(array).mode() tm.assert_series_equal(result, expected) From 41b1faf4418c59b181dcbeae7fb2f2c824e8fb63 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 19 Sep 2020 19:59:32 +0100 Subject: [PATCH 12/31] Use pandas.testing --- pandas/tests/test_complex.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_complex.py b/pandas/tests/test_complex.py index a16d4db50c072..0861e2a2412b7 100644 --- a/pandas/tests/test_complex.py +++ b/pandas/tests/test_complex.py @@ -5,6 +5,7 @@ from pandas import DataFrame, Index, Series import pandas.testing as tm + class TestBasicComplexSupport: @pytest.mark.parametrize( "array,expected", From da53f38584679662f4e135a9a1e69bf43aa2df73 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 19 Sep 2020 20:32:06 +0100 Subject: [PATCH 13/31] Clean ups --- pandas/core/algorithms.py | 2 +- pandas/tests/test_complex.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4d1592fc07fc3..722a2e338f0fd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -7,7 +7,7 @@ import operator from textwrap import dedent from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast -from warnings import catch_warnings, simplefilter, warn +from warnings import warn import numpy as np diff --git a/pandas/tests/test_complex.py b/pandas/tests/test_complex.py index 0861e2a2412b7..a5fee4a532e8a 100644 --- a/pandas/tests/test_complex.py +++ b/pandas/tests/test_complex.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, Series -import pandas.testing as tm +import pandas._testing as tm class TestBasicComplexSupport: @@ -36,7 +36,7 @@ def test_value_counts(self, array, expected): ) def test_unique(self, array, expected): result = pd.unique(array) - np.testing.assert_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( "array,expected", @@ -69,10 +69,10 @@ def test_factorize(self): labels, uniques = pd.factorize(array) expected_labels = np.array([0, 1, 2], dtype=np.intp) - np.testing.assert_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(labels, expected_labels) expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=np.complex64) - np.testing.assert_array_equal(uniques, expected_uniques) + tm.assert_numpy_array_equal(uniques, expected_uniques) @pytest.mark.parametrize( "frame,expected", @@ -124,6 +124,6 @@ def test_unimode(self, array, expected): ], ) def test_multimode(self, array, 
expected): - with pytest.warns(UserWarning): + with tm.assert_produces_warning(UserWarning): result = Series(array).mode() tm.assert_series_equal(result, expected) From f4932d9c644a3d21fd3c7418b697bee1e34040e8 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Thu, 26 Nov 2020 20:25:15 +0000 Subject: [PATCH 14/31] Move test to sep files --- pandas/tests/groupby/test_groupby.py | 25 ++++ pandas/tests/indexes/multi/test_duplicates.py | 16 ++- pandas/tests/indexes/period/test_factorize.py | 12 +- pandas/tests/reductions/test_reductions.py | 11 ++ pandas/tests/series/methods/test_isin.py | 14 ++ .../tests/series/methods/test_value_counts.py | 18 +++ pandas/tests/test_algos.py | 13 ++ pandas/tests/test_complex.py | 129 ------------------ 8 files changed, 107 insertions(+), 131 deletions(-) delete mode 100644 pandas/tests/test_complex.py diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cd1fc67772849..184ca24519341 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2156,3 +2156,28 @@ def test_groupby_series_with_tuple_name(): expected = Series([2, 4], index=[1, 2], name=("a", "a")) expected.index.name = ("b", "b") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "frame,expected", + [ + ( + DataFrame([{"a": 1, "b": 1 + 1j}, {"a": 1, "b": 1 + 2j}]), + DataFrame( + np.array([1, 1], dtype=np.int64), + index=Index([(1 + 1j), (1 + 2j)], dtype="object", name="b"), + columns=Index(["a"], dtype="object"), + ), + ) + ], +) +def test_groupby(frame, expected): + result = frame.groupby("b", sort=False).count() + tm.assert_frame_equal(result, expected) + + # sorting of the index should fail since complex numbers are unordered + with pytest.raises( + TypeError, + match="'<' not supported between instances of 'complex' and 'complex'", + ): + frame.groupby("b", sort=True).count() \ No newline at end of file diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index aa2f37dad152c..f1e44625f4b22 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -5,7 +5,7 @@ from pandas._libs import hashtable -from pandas import DatetimeIndex, MultiIndex +from pandas import DatetimeIndex, MultiIndex, Series import pandas._testing as tm @@ -303,3 +303,17 @@ def test_duplicated_drop_duplicates(): assert duplicated.dtype == bool expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2])) tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) + + +@pytest.mark.parametrize( + "array,expected", + [ + ( + [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], + Series([False, False, True, False, False, False, True], dtype=bool), + ) + ], + ) +def test_duplicated_series_complex_numbers(array, expected): + result = Series(array, dtype=np.complex64).duplicated() + tm.assert_series_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/indexes/period/test_factorize.py b/pandas/tests/indexes/period/test_factorize.py index 7c9367a1011a2..a6980156ae19b 100644 --- a/pandas/tests/indexes/period/test_factorize.py +++ b/pandas/tests/indexes/period/test_factorize.py @@ -1,6 +1,6 @@ import numpy as np -from pandas import PeriodIndex +from pandas import PeriodIndex, factorize import pandas._testing as tm @@ -35,3 +35,13 @@ def test_factorize(self): arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + + def test_factorize_complex(self): + array = 
[1, 2, 2 + 1j] + labels, uniques = factorize(array) + + expected_labels = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(labels, expected_labels) + + expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=np.complex64) + tm.assert_numpy_array_equal(uniques, expected_uniques) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 1e84ba1dbffd9..42050aa29a4a0 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1342,3 +1342,14 @@ def test_mode_sortwarning(self): result = result.sort_values().reset_index(drop=True) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "array,expected", + [ + ([0, 1j, 1, 1, 1 + 1j, 1 + 2j], Series([1], dtype=np.complex128)), + ([1 + 1j, 2j, 1 + 1j], Series([1 + 1j], dtype=np.complex128)), + ], + ) + def test_unimode(self, array, expected): + result = Series(array).mode() + tm.assert_series_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 86ea2b2f02a4d..6b629f195f67a 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -99,3 +99,17 @@ def test_isin_large_series_mixed_dtypes_and_nan(): result = ser.isin({"foo", "bar"}) expected = Series([False] * 3 * 1_000_000) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "array,expected", + [ + ( + [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], + Series([False, True, True, False, True, True, True], dtype=bool), + ) + ], + ) +def test_isin_complex_numbers(self, array, expected): + result = Series(array).isin([1j, 1 + 1j, 1 + 2j]) + tm.assert_series_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index f22b1be672190..7061fcac3a1f8 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -203,3 +203,21 @@ def test_value_counts_bool_with_nan(self, ser, dropna, exp): # GH32146 out = ser.value_counts(dropna=dropna) tm.assert_series_equal(out, exp) + + @pytest.mark.parametrize( + "input_array,expected", + [ + ( + [1 + 1j, 0, 1, 1j, 1 + 2j], + Series([1, 1, 1, 1, 1], index=[1 + 2j, 1 + 1j, 1j, 1, 0]), + ), + ( + [1 + 2j, 0, 1j, 1, 1j, 1 + 1j], + # index is sorted by value counts in descending order by default + Series([2, 1, 1, 1, 1], index=[1j, 1 + 2j, 1 + 1j, 1, 0]), + ), + ], + ) + def test_value_counts_complex_numbers(self, input_array, expected): + result = pd.value_counts(input_array) + tm.assert_series_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 34b7d0e73e914..7d33732e7307e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1448,6 +1448,19 @@ def test_unique_tuples(self, arr, unique): result = pd.unique(arr) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "array,expected", + [ + ( + [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j], + np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)]), + ) + ], + ) + def test_unique_complex_numbers(self, array, expected): + result = pd.unique(array) + tm.assert_numpy_array_equal(result, expected) + class TestHashTable: def test_string_hashtable_set_item_signature(self): diff --git a/pandas/tests/test_complex.py b/pandas/tests/test_complex.py deleted file mode 100644 index 
a5fee4a532e8a..0000000000000 --- a/pandas/tests/test_complex.py +++ /dev/null @@ -1,129 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas import DataFrame, Index, Series -import pandas._testing as tm - - -class TestBasicComplexSupport: - @pytest.mark.parametrize( - "array,expected", - [ - ( - [1 + 1j, 0, 1, 1j, 1 + 2j], - Series([1, 1, 1, 1, 1], index=[1 + 2j, 1 + 1j, 1j, 1, 0]), - ), - ( - [1 + 2j, 0, 1j, 1, 1j, 1 + 1j], - # index is sorted by value counts in descending order by default - Series([2, 1, 1, 1, 1], index=[1j, 1 + 2j, 1 + 1j, 1, 0]), - ), - ], - ) - def test_value_counts(self, array, expected): - result = pd.value_counts(array) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "array,expected", - [ - ( - [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j], - np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)]), - ) - ], - ) - def test_unique(self, array, expected): - result = pd.unique(array) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize( - "array,expected", - [ - ( - [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], - Series([False, False, True, False, False, False, True], dtype=bool), - ) - ], - ) - def test_duplicated(self, array, expected): - result = Series(array, dtype=np.complex64).duplicated() - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "array,expected", - [ - ( - [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], - Series([False, True, True, False, True, True, True], dtype=bool), - ) - ], - ) - def test_isin(self, array, expected): - result = Series(array).isin([1j, 1 + 1j, 1 + 2j]) - tm.assert_series_equal(result, expected) - - def test_factorize(self): - array = [1, 2, 2 + 1j] - labels, uniques = pd.factorize(array) - - expected_labels = np.array([0, 1, 2], dtype=np.intp) - tm.assert_numpy_array_equal(labels, expected_labels) - - expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=np.complex64) - tm.assert_numpy_array_equal(uniques, expected_uniques) - - @pytest.mark.parametrize( - "frame,expected", - [ - ( - DataFrame([{"a": 1, "b": 1 + 1j}, {"a": 1, "b": 1 + 2j}]), - DataFrame( - np.array([1, 1], dtype=np.int64), - index=Index([(1 + 1j), (1 + 2j)], dtype="object", name="b"), - columns=Index(["a"], dtype="object"), - ), - ) - ], - ) - def test_groupby(self, frame, expected): - result = frame.groupby("b", sort=False).count() - tm.assert_frame_equal(result, expected) - - # sorting of the index should fail since complex numbers are unordered - with pytest.raises( - TypeError, - match="'<' not supported between instances of 'complex' and 'complex'", - ): - frame.groupby("b", sort=True).count() - - @pytest.mark.parametrize( - "array,expected", - [ - ([0, 1j, 1, 1, 1 + 1j, 1 + 2j], Series([1], dtype=np.complex128)), - ([1 + 1j, 2j, 1 + 1j], Series([1 + 1j], dtype=np.complex128)), - ], - ) - def test_unimode(self, array, expected): - result = Series(array).mode() - tm.assert_series_equal(result, expected) - - # mode tries to sort multimodal series. - # A warning will be raised since complex numbers - # are not ordered. 
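Because complex numbers have no total ordering, the operations covered here either warn or raise whenever they need to sort, which is what the tests being relocated in this patch encode. A condensed sketch of both cases (expected behaviour per those tests)::

    import pandas as pd
    import pytest

    # A multimodal mode() result has to be sorted, which is not possible
    # for complex values, so a UserWarning is emitted.
    with pytest.warns(UserWarning):
        pd.Series([1 + 1j, 2j, 1 + 1j, 2j, 3]).mode()

    df = pd.DataFrame([{"a": 1, "b": 1 + 1j}, {"a": 1, "b": 1 + 2j}])
    df.groupby("b", sort=False).count()        # fine: no sorting needed
    with pytest.raises(TypeError):
        df.groupby("b", sort=True).count()     # complex keys cannot be sorted
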
- @pytest.mark.parametrize( - "array,expected", - [ - ( - # no modes - [0, 1j, 1, 1 + 1j, 1 + 2j], - Series([0, 1, 1j, 1 + 1j, 1 + 2j], dtype=np.complex128), - ), - ([1 + 1j, 2j, 1 + 1j, 2j, 3], Series([1 + 1j, 2j], dtype=np.complex128)), - ], - ) - def test_multimode(self, array, expected): - with tm.assert_produces_warning(UserWarning): - result = Series(array).mode() - tm.assert_series_equal(result, expected) From 328e242e9f443b2633751c815b1126b9f0ba06e9 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 28 Nov 2020 00:42:34 +0000 Subject: [PATCH 15/31] Refactor Tests --- .gitignore | 1 + .pre-commit-config.yaml | 2 +- Dockerfile | 2 +- Makefile | 2 +- README.md | 2 +- asv_bench/benchmarks/algorithms.py | 12 + asv_bench/benchmarks/categoricals.py | 43 + asv_bench/benchmarks/groupby.py | 2 +- asv_bench/benchmarks/join_merge.py | 6 + asv_bench/benchmarks/reshape.py | 5 +- asv_bench/benchmarks/rolling.py | 13 + asv_bench/benchmarks/series_methods.py | 73 +- azure-pipelines.yml | 2 +- ci/azure/windows.yml | 2 +- ci/code_checks.sh | 2 +- ci/deps/azure-39.yaml | 5 + ci/deps/travis-37-locale.yaml | 2 +- ci/run_tests.sh | 2 +- ci/setup_env.sh | 8 +- doc/source/development/contributing.rst | 51 +- doc/source/development/policies.rst | 2 +- doc/source/ecosystem.rst | 10 + doc/source/getting_started/install.rst | 2 +- doc/source/reference/index.rst | 1 - doc/source/reference/panel.rst | 10 - doc/source/reference/style.rst | 1 + doc/source/user_guide/computation.rst | 7 + doc/source/user_guide/dsintro.rst | 2 +- doc/source/user_guide/groupby.rst | 9 + doc/source/user_guide/indexing.rst | 70 +- doc/source/user_guide/integer_na.rst | 2 +- doc/source/user_guide/merging.rst | 9 +- doc/source/user_guide/style.ipynb | 34 +- doc/source/user_guide/timeseries.rst | 21 +- doc/source/user_guide/window.rst | 2 +- doc/source/whatsnew/v0.12.0.rst | 6 +- doc/source/whatsnew/v0.14.0.rst | 2 +- doc/source/whatsnew/v0.15.2.rst | 2 +- doc/source/whatsnew/v0.16.1.rst | 4 +- doc/source/whatsnew/v0.16.2.rst | 2 +- doc/source/whatsnew/v0.18.0.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 8 +- doc/source/whatsnew/v0.21.0.rst | 2 +- doc/source/whatsnew/v0.24.0.rst | 4 +- doc/source/whatsnew/v0.6.0.rst | 2 +- doc/source/whatsnew/v0.8.0.rst | 2 +- doc/source/whatsnew/v1.1.5.rst | 22 +- doc/source/whatsnew/v1.2.0.rst | 380 +++++--- environment.yml | 3 + pandas/__init__.py | 19 +- pandas/_libs/groupby.pyx | 26 +- pandas/_libs/hashtable.pxd | 56 ++ pandas/_libs/hashtable.pyx | 44 +- pandas/_libs/hashtable_class_helper.pxi.in | 98 +- pandas/_libs/hashtable_func_helper.pxi.in | 18 +- pandas/_libs/index_class_helper.pxi.in | 30 +- pandas/_libs/interval.pyx | 3 +- pandas/_libs/khash.pxd | 83 +- .../_libs/khash_for_primitive_helper.pxi.in | 42 + pandas/_libs/lib.pyx | 14 +- pandas/_libs/reduction.pyx | 4 +- pandas/_libs/src/klib/khash.h | 103 ++- pandas/_libs/src/klib/khash_python.h | 90 +- pandas/_libs/tslibs/offsets.pyx | 26 + pandas/_libs/tslibs/timedeltas.pyx | 9 +- pandas/_libs/tslibs/tzconversion.pyx | 6 +- pandas/_libs/window/aggregations.pyx | 31 +- pandas/_testing.py | 39 +- pandas/_version.py | 301 +++--- pandas/compat/_optional.py | 2 +- pandas/conftest.py | 48 +- pandas/core/algorithms.py | 107 ++- pandas/core/apply.py | 33 +- pandas/core/arraylike.py | 144 ++- pandas/core/arrays/_mixins.py | 48 +- pandas/core/arrays/base.py | 26 +- pandas/core/arrays/categorical.py | 60 +- pandas/core/arrays/datetimelike.py | 66 +- pandas/core/arrays/datetimes.py | 12 +- pandas/core/arrays/floating.py | 2 +- 
pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/interval.py | 293 +++--- pandas/core/arrays/masked.py | 10 +- pandas/core/arrays/numpy_.py | 2 +- pandas/core/arrays/period.py | 1 + pandas/core/arrays/string_.py | 4 - pandas/core/arrays/string_arrow.py | 625 +++++++++++++ pandas/core/arrays/timedeltas.py | 4 +- pandas/core/base.py | 13 +- pandas/core/common.py | 41 +- pandas/core/computation/align.py | 14 +- pandas/core/computation/parsing.py | 8 +- pandas/core/computation/pytables.py | 4 + pandas/core/construction.py | 6 +- pandas/core/dtypes/base.py | 5 +- pandas/core/dtypes/cast.py | 10 +- pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/concat.py | 6 +- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/dtypes/generic.py | 22 +- pandas/core/frame.py | 194 ++-- pandas/core/generic.py | 181 ++-- pandas/core/groupby/base.py | 1 + pandas/core/groupby/generic.py | 5 +- pandas/core/groupby/groupby.py | 28 +- pandas/core/groupby/ops.py | 4 +- pandas/core/indexers.py | 2 +- pandas/core/indexes/base.py | 142 ++- pandas/core/indexes/category.py | 130 +-- pandas/core/indexes/datetimelike.py | 230 +++-- pandas/core/indexes/datetimes.py | 52 +- pandas/core/indexes/extension.py | 56 +- pandas/core/indexes/interval.py | 164 ++-- pandas/core/indexes/multi.py | 265 ++++-- pandas/core/indexes/numeric.py | 97 +- pandas/core/indexes/period.py | 82 +- pandas/core/indexes/range.py | 24 +- pandas/core/indexes/timedeltas.py | 31 +- pandas/core/indexing.py | 192 ++-- pandas/core/internals/blocks.py | 140 +-- pandas/core/internals/concat.py | 1 + pandas/core/internals/construction.py | 5 +- pandas/core/internals/managers.py | 40 +- pandas/core/nanops.py | 2 +- pandas/core/ops/array_ops.py | 8 +- pandas/core/resample.py | 4 +- pandas/core/reshape/concat.py | 28 +- pandas/core/reshape/merge.py | 74 +- pandas/core/reshape/reshape.py | 12 +- pandas/core/series.py | 92 +- pandas/core/shared_docs.py | 63 ++ pandas/core/sorting.py | 2 +- pandas/core/strings/accessor.py | 3 +- pandas/core/tools/numeric.py | 6 +- pandas/core/tools/timedeltas.py | 5 + pandas/core/window/__init__.py | 5 +- pandas/core/window/common.py | 4 + pandas/core/window/ewm.py | 156 +++- pandas/core/window/indexers.py | 15 + pandas/core/window/numba_.py | 89 ++ pandas/core/window/rolling.py | 46 +- pandas/io/common.py | 5 +- pandas/io/excel/_base.py | 57 +- pandas/io/excel/_odswriter.py | 2 +- pandas/io/feather_format.py | 20 +- pandas/io/formats/console.py | 2 +- pandas/io/formats/csvs.py | 2 +- pandas/io/formats/excel.py | 167 ++-- pandas/io/formats/format.py | 45 +- pandas/io/formats/info.py | 489 +++++----- pandas/io/formats/printing.py | 2 +- pandas/io/formats/style.py | 90 +- pandas/io/json/_json.py | 58 +- pandas/io/parquet.py | 162 ++-- pandas/io/parsers.py | 45 +- pandas/io/pickle.py | 29 +- pandas/io/pytables.py | 5 +- pandas/io/sas/sas7bdat.py | 4 +- pandas/io/sas/sas_xport.py | 5 - pandas/io/sql.py | 2 +- pandas/io/stata.py | 94 +- pandas/plotting/_matplotlib/boxplot.py | 4 +- pandas/plotting/_matplotlib/converter.py | 2 +- pandas/plotting/_matplotlib/core.py | 31 +- pandas/plotting/_matplotlib/tools.py | 10 +- pandas/tests/arithmetic/conftest.py | 11 +- pandas/tests/arithmetic/test_datetime64.py | 27 +- pandas/tests/arithmetic/test_interval.py | 2 +- pandas/tests/arithmetic/test_numeric.py | 102 ++- pandas/tests/arithmetic/test_period.py | 128 ++- pandas/tests/arithmetic/test_timedelta64.py | 2 +- .../arrays/categorical/test_analytics.py | 6 +- pandas/tests/arrays/categorical/test_api.py | 5 +- 
.../arrays/categorical/test_constructors.py | 12 +- .../tests/arrays/categorical/test_dtypes.py | 4 +- .../arrays/floating/test_construction.py | 2 +- .../tests/arrays/integer/test_construction.py | 2 +- pandas/tests/arrays/interval/test_astype.py | 23 + pandas/tests/arrays/sparse/test_array.py | 2 +- pandas/tests/arrays/sparse/test_dtype.py | 4 +- pandas/tests/arrays/string_/test_string.py | 383 ++++++-- .../tests/arrays/string_/test_string_arrow.py | 26 + pandas/tests/arrays/test_datetimelike.py | 65 +- pandas/tests/base/test_conversion.py | 4 +- pandas/tests/base/test_misc.py | 2 +- pandas/tests/dtypes/test_generic.py | 1 - pandas/tests/dtypes/test_inference.py | 19 +- pandas/tests/extension/test_external_block.py | 2 +- pandas/tests/extension/test_interval.py | 6 +- pandas/tests/extension/test_sparse.py | 2 +- pandas/tests/extension/test_string.py | 58 +- pandas/tests/frame/apply/test_frame_apply.py | 60 +- pandas/tests/frame/conftest.py | 5 + pandas/tests/frame/indexing/test_getitem.py | 23 + pandas/tests/frame/indexing/test_indexing.py | 23 +- pandas/tests/frame/indexing/test_setitem.py | 34 +- pandas/tests/frame/indexing/test_xs.py | 8 + pandas/tests/frame/methods/test_describe.py | 2 +- pandas/tests/frame/methods/test_dtypes.py | 18 +- pandas/tests/frame/methods/test_fillna.py | 15 + pandas/tests/frame/methods/test_reindex.py | 19 +- pandas/tests/frame/methods/test_replace.py | 10 +- .../tests/frame/methods/test_reset_index.py | 2 +- .../tests/frame/methods/test_select_dtypes.py | 26 +- pandas/tests/frame/methods/test_to_dict.py | 4 +- pandas/tests/frame/test_constructors.py | 74 +- pandas/tests/frame/test_reductions.py | 119 ++- pandas/tests/frame/test_repr_info.py | 8 + pandas/tests/frame/test_stack_unstack.py | 26 + pandas/tests/frame/test_ufunc.py | 111 +++ pandas/tests/generic/test_duplicate_labels.py | 8 +- pandas/tests/generic/test_finalize.py | 31 +- .../tests/groupby/aggregate/test_aggregate.py | 23 +- pandas/tests/groupby/test_allowlist.py | 1 + pandas/tests/groupby/test_categorical.py | 54 +- pandas/tests/groupby/test_groupby.py | 49 +- pandas/tests/groupby/test_missing.py | 10 + pandas/tests/groupby/test_timegrouper.py | 58 +- .../tests/indexes/base_class/test_formats.py | 134 +++ .../tests/indexes/base_class/test_setops.py | 110 ++- .../indexes/categorical/test_category.py | 247 ++--- .../tests/indexes/categorical/test_equals.py | 77 ++ .../tests/indexes/categorical/test_formats.py | 26 +- .../indexes/categorical/test_indexing.py | 54 +- pandas/tests/indexes/categorical/test_map.py | 12 +- pandas/tests/indexes/common.py | 3 + pandas/tests/indexes/conftest.py | 2 +- pandas/tests/indexes/datetimelike.py | 35 +- pandas/tests/indexes/datetimes/test_astype.py | 6 +- .../indexes/datetimes/test_constructors.py | 24 +- .../indexes/datetimes/test_date_range.py | 28 +- .../tests/indexes/datetimes/test_datetime.py | 26 +- .../tests/indexes/datetimes/test_indexing.py | 58 +- pandas/tests/indexes/datetimes/test_misc.py | 30 +- pandas/tests/indexes/datetimes/test_ops.py | 71 +- .../indexes/datetimes/test_partial_slicing.py | 13 +- pandas/tests/indexes/datetimes/test_setops.py | 24 +- pandas/tests/indexes/datetimes/test_shift.py | 4 +- .../tests/indexes/datetimes/test_timezones.py | 16 +- pandas/tests/indexes/interval/test_astype.py | 16 +- pandas/tests/indexes/interval/test_base.py | 54 +- .../indexes/interval/test_constructors.py | 20 +- pandas/tests/indexes/interval/test_equals.py | 33 + .../tests/indexes/interval/test_interval.py | 19 +- 
pandas/tests/indexes/interval/test_setops.py | 8 +- .../tests/indexes/multi/test_constructors.py | 22 +- pandas/tests/indexes/multi/test_drop.py | 29 + pandas/tests/indexes/multi/test_duplicates.py | 18 +- pandas/tests/indexes/multi/test_indexing.py | 12 +- pandas/tests/indexes/multi/test_sorting.py | 10 +- pandas/tests/indexes/numeric/test_setops.py | 139 +++ pandas/tests/indexes/period/test_astype.py | 12 +- pandas/tests/indexes/period/test_factorize.py | 2 +- pandas/tests/indexes/period/test_indexing.py | 6 +- pandas/tests/indexes/period/test_ops.py | 40 +- .../indexes/period/test_partial_slicing.py | 30 +- .../tests/indexes/ranges/test_constructors.py | 14 +- pandas/tests/indexes/ranges/test_indexing.py | 2 +- pandas/tests/indexes/ranges/test_range.py | 25 - pandas/tests/indexes/ranges/test_setops.py | 25 +- pandas/tests/indexes/test_any_index.py | 14 + pandas/tests/indexes/test_base.py | 339 +------ pandas/tests/indexes/test_common.py | 23 +- pandas/tests/indexes/test_datetimelike.py | 174 ++++ pandas/tests/indexes/test_numeric.py | 151 +--- pandas/tests/indexes/test_setops.py | 8 +- .../tests/indexes/timedeltas/test_astype.py | 4 +- .../indexes/timedeltas/test_constructors.py | 4 +- .../tests/indexes/timedeltas/test_indexing.py | 2 +- pandas/tests/indexes/timedeltas/test_ops.py | 43 +- .../indexes/timedeltas/test_scalar_compat.py | 3 +- .../tests/indexes/timedeltas/test_setops.py | 2 +- .../indexes/timedeltas/test_timedelta.py | 6 - pandas/tests/indexing/common.py | 2 +- .../tests/indexing/interval/test_interval.py | 10 +- .../indexing/interval/test_interval_new.py | 12 +- pandas/tests/indexing/multiindex/test_loc.py | 55 ++ .../tests/indexing/multiindex/test_partial.py | 48 +- .../tests/indexing/multiindex/test_setitem.py | 11 +- .../tests/indexing/multiindex/test_slice.py | 79 +- pandas/tests/indexing/test_at.py | 29 +- pandas/tests/indexing/test_categorical.py | 100 +- .../indexing/test_chaining_and_caching.py | 18 +- pandas/tests/indexing/test_coercion.py | 4 +- pandas/tests/indexing/test_datetime.py | 47 +- pandas/tests/indexing/test_floats.py | 121 +-- pandas/tests/indexing/test_iat.py | 15 +- pandas/tests/indexing/test_iloc.py | 86 +- pandas/tests/indexing/test_indexing.py | 143 ++- pandas/tests/indexing/test_loc.py | 114 ++- pandas/tests/indexing/test_partial.py | 10 +- pandas/tests/indexing/test_scalar.py | 32 +- pandas/tests/internals/test_internals.py | 18 +- pandas/tests/io/conftest.py | 2 +- pandas/tests/io/excel/test_writers.py | 15 +- pandas/tests/io/excel/test_xlrd.py | 2 +- .../data/html/various_dtypes_formatted.html | 36 + pandas/tests/io/formats/test_format.py | 18 +- pandas/tests/io/formats/test_info.py | 119 ++- pandas/tests/io/formats/test_style.py | 22 + pandas/tests/io/formats/test_to_html.py | 15 + pandas/tests/io/json/test_pandas.py | 14 +- pandas/tests/io/parser/test_compression.py | 19 +- pandas/tests/io/parser/test_read_fwf.py | 47 +- pandas/tests/io/pytables/test_store.py | 20 +- pandas/tests/io/pytables/test_timezones.py | 30 +- pandas/tests/io/test_clipboard.py | 2 +- pandas/tests/io/test_feather.py | 2 +- pandas/tests/io/test_fsspec.py | 5 + pandas/tests/io/test_parquet.py | 69 +- pandas/tests/io/test_sql.py | 22 +- pandas/tests/libs/test_hashtable.py | 336 +++++++ pandas/tests/plotting/frame/test_frame.py | 86 ++ pandas/tests/plotting/test_converter.py | 23 +- pandas/tests/plotting/test_datetimelike.py | 44 +- pandas/tests/plotting/test_groupby.py | 4 +- pandas/tests/plotting/test_series.py | 23 + pandas/tests/reductions/test_reductions.py | 33 +- 
pandas/tests/resample/test_datetime_index.py | 31 +- pandas/tests/resample/test_period_index.py | 10 +- pandas/tests/resample/test_resample_api.py | 4 +- pandas/tests/reshape/concat/test_concat.py | 26 +- pandas/tests/reshape/concat/test_dataframe.py | 11 + pandas/tests/reshape/concat/test_series.py | 4 +- pandas/tests/reshape/merge/test_join.py | 14 +- pandas/tests/reshape/merge/test_merge.py | 20 +- .../tests/reshape/merge/test_merge_cross.py | 95 ++ pandas/tests/reshape/test_get_dummies.py | 2 +- pandas/tests/reshape/test_pivot.py | 13 +- pandas/tests/scalar/period/test_period.py | 4 +- .../tests/scalar/timestamp/test_timestamp.py | 33 +- .../series/accessors/test_cat_accessor.py | 5 +- pandas/tests/series/indexing/test_datetime.py | 3 +- pandas/tests/series/indexing/test_getitem.py | 22 + pandas/tests/series/indexing/test_setitem.py | 25 + pandas/tests/series/methods/test_isin.py | 75 +- pandas/tests/series/methods/test_replace.py | 9 +- pandas/tests/series/methods/test_to_csv.py | 2 +- pandas/tests/series/methods/test_to_frame.py | 4 +- .../tests/series/methods/test_value_counts.py | 7 +- pandas/tests/series/test_arithmetic.py | 36 +- pandas/tests/series/test_constructors.py | 55 +- pandas/tests/series/test_dtypes.py | 4 +- pandas/tests/series/test_reductions.py | 2 +- pandas/tests/series/test_validate.py | 2 +- pandas/tests/test_algos.py | 72 +- pandas/tests/test_downstream.py | 1 - pandas/tests/test_multilevel.py | 39 +- pandas/tests/test_sorting.py | 16 +- pandas/tests/tools/test_to_datetime.py | 31 + pandas/tests/tools/test_to_timedelta.py | 17 + pandas/tests/tslibs/test_array_to_datetime.py | 4 +- pandas/tests/tslibs/test_parsing.py | 4 +- pandas/tests/util/test_assert_almost_equal.py | 2 +- pandas/tests/util/test_hashing.py | 17 +- pandas/tests/window/common.py | 147 --- pandas/tests/window/conftest.py | 83 +- pandas/tests/window/moments/conftest.py | 77 -- .../moments/test_moments_consistency_ewm.py | 459 +++++----- .../test_moments_consistency_expanding.py | 424 +++++---- .../test_moments_consistency_rolling.py | 550 +++++------ .../tests/window/moments/test_moments_ewm.py | 12 +- .../window/moments/test_moments_rolling.py | 5 +- pandas/tests/window/test_api.py | 73 +- pandas/tests/window/test_apply.py | 11 - pandas/tests/window/test_ewm.py | 4 +- pandas/tests/window/test_expanding.py | 41 +- .../{test_grouper.py => test_groupby.py} | 303 ++++--- pandas/tests/window/test_numba.py | 38 +- pandas/tests/window/test_rolling.py | 173 ++-- pandas/tests/window/test_timeseries_window.py | 19 +- .../{test_window.py => test_win_type.py} | 57 +- pandas/util/_doctools.py | 20 +- ...check_for_inconsistent_pandas_namespace.py | 49 +- scripts/generate_pip_deps_from_conda.py | 5 +- scripts/validate_rst_title_capitalization.py | 1 - setup.cfg | 6 +- setup.py | 15 +- test.bat | 3 - versioneer.py | 854 +++++++++++------- web/pandas/community/ecosystem.md | 2 +- 381 files changed, 11196 insertions(+), 6484 deletions(-) delete mode 100644 doc/source/reference/panel.rst create mode 100644 pandas/_libs/khash_for_primitive_helper.pxi.in create mode 100644 pandas/core/arrays/string_arrow.py create mode 100644 pandas/tests/arrays/interval/test_astype.py create mode 100644 pandas/tests/arrays/string_/test_string_arrow.py create mode 100644 pandas/tests/frame/test_ufunc.py create mode 100644 pandas/tests/indexes/base_class/test_formats.py create mode 100644 pandas/tests/indexes/categorical/test_equals.py create mode 100644 pandas/tests/indexes/interval/test_equals.py create mode 100644 
pandas/tests/indexes/numeric/test_setops.py create mode 100644 pandas/tests/indexes/test_datetimelike.py create mode 100644 pandas/tests/io/formats/data/html/various_dtypes_formatted.html create mode 100644 pandas/tests/libs/test_hashtable.py create mode 100644 pandas/tests/reshape/merge/test_merge_cross.py delete mode 100644 pandas/tests/window/common.py delete mode 100644 pandas/tests/window/moments/conftest.py rename pandas/tests/window/{test_grouper.py => test_groupby.py} (77%) rename pandas/tests/window/{test_window.py => test_win_type.py} (57%) delete mode 100644 test.bat diff --git a/.gitignore b/.gitignore index 6c3c275c48fb7..1661862a5d066 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ *.log *.swp *.pdb +*.zip .project .pydevproject .settings diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f9b396715664a..717334bfe1299 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ repos: name: isort (cython) types: [cython] - repo: https://github.com/asottile/pyupgrade - rev: v2.7.3 + rev: v2.7.4 hooks: - id: pyupgrade args: [--py37-plus] diff --git a/Dockerfile b/Dockerfile index b8aff5d671dcf..5d7a2b9e6b743 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,5 +43,5 @@ RUN conda env update -n base -f "$pandas_home/environment.yml" # Build C extensions and pandas RUN cd "$pandas_home" \ - && python setup.py build_ext --inplace -j 4 \ + && python setup.py build_ext -j 4 \ && python -m pip install -e . diff --git a/Makefile b/Makefile index 4f71df51de360..2c968234749f5 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ clean_pyc: -find . -name '*.py[co]' -exec rm {} \; build: clean_pyc - python setup.py build_ext --inplace + python setup.py build_ext lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 diff --git a/README.md b/README.md index a2f2f1c04442a..4072faffe3b3a 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Here are just a few of the things that pandas does well: and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] - [**Time series**][timeseries]-specific functionality: date range generation and frequency conversion, moving window statistics, - date shifting and lagging. 
+ date shifting and lagging [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..03480ae198345 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,6 +5,7 @@ from pandas._libs import lib import pandas as pd +from pandas.core.algorithms import make_duplicates_of_left_unique_in_right from .pandas_vb_common import tm @@ -174,4 +175,15 @@ def time_argsort(self, N): self.array.argsort() +class RemoveDuplicates: + def setup(self): + N = 10 ** 5 + na = np.arange(int(N / 2)) + self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]]) + self.right = np.concatenate([na, na]) + + def time_make_duplicates_of_left_unique_in_right(self): + make_duplicates_of_left_unique_in_right(self.left, self.right) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index a0b24342091ec..f3b005b704014 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,3 +1,5 @@ +import string +import sys import warnings import numpy as np @@ -67,6 +69,47 @@ def time_existing_series(self): pd.Categorical(self.series) +class AsType: + def setup(self): + N = 10 ** 5 + + random_pick = np.random.default_rng().choice + + categories = { + "str": list(string.ascii_letters), + "int": np.random.randint(2 ** 16, size=154), + "float": sys.maxsize * np.random.random((38,)), + "timestamp": [ + pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578) + ], + } + + self.df = pd.DataFrame( + {col: random_pick(cats, N) for col, cats in categories.items()} + ) + + for col in ("int", "float", "timestamp"): + self.df[col + "_as_str"] = self.df[col].astype(str) + + for col in self.df.columns: + self.df[col] = self.df[col].astype("category") + + def astype_str(self): + [self.df[col].astype("str") for col in "int float timestamp".split()] + + def astype_int(self): + [self.df[col].astype("int") for col in "int_as_str timestamp".split()] + + def astype_float(self): + [ + self.df[col].astype("float") + for col in "float_as_str int int_as_str timestamp".split() + ] + + def astype_datetime(self): + self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific")) + + class Concat: def setup(self): N = 10 ** 5 diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 22f002e6cb79a..6ce63ff8badca 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -486,7 +486,7 @@ def setup(self): tmp2 = (np.random.random(10000) * 10.0).astype(np.float32) tmp = np.concatenate((tmp1, tmp2)) arr = np.repeat(tmp, 10) - self.df = DataFrame(dict(a=arr, b=arr)) + self.df = DataFrame({"a": arr, "b": arr}) def time_sum(self): self.df.groupby(["a"])["b"].sum() diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 1333b3a0f0560..a572b8a70a680 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort): def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): self.df_shuf.join(self.df_key2, on="key2", sort=sort) + def time_join_dataframes_cross(self, sort): + self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort) + class JoinIndex: def setup(self): @@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, 
sort): def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, on="key1", sort=sort) + def time_merge_dataframes_cross(self, sort): + merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) + class I8Merge: diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 21081ee23a773..9cec8a5f7d318 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -103,7 +103,10 @@ def setup(self): nidvars = 20 N = 5000 self.letters = list("ABCD") - yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))] + yrvars = [ + letter + str(num) + for letter, num in product(self.letters, range(1, nyrs + 1)) + ] columns = [str(i) for i in range(nidvars)] + yrvars self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns) self.df["id"] = self.df.index diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 226b225b47591..79a33c437ea5c 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -225,4 +225,17 @@ def time_rolling_offset(self, method): getattr(self.groupby_roll_offset, method)() +class GroupbyEWM: + + params = ["cython", "numba"] + param_names = ["engine"] + + def setup(self, engine): + df = pd.DataFrame({"A": range(50), "B": range(50)}) + self.gb_ewm = df.groupby("A").ewm(com=1.0) + + def time_groupby_mean(self, engine): + self.gb_ewm.mean(engine=engine) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 258c29c145721..2db46abca119c 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -2,7 +2,7 @@ import numpy as np -from pandas import NaT, Series, date_range +from pandas import Categorical, NaT, Series, date_range from .pandas_vb_common import tm @@ -36,6 +36,28 @@ def time_isin(self, dtypes): self.s.isin(self.values) +class IsInDatetime64: + def setup(self): + dti = date_range( + start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s" + ) + self.ser = Series(dti) + self.subset = self.ser._values[::3] + self.cat_subset = Categorical(self.subset) + + def time_isin(self): + self.ser.isin(self.subset) + + def time_isin_cat_values(self): + self.ser.isin(self.cat_subset) + + def time_isin_mismatched_dtype(self): + self.ser.isin([1, 2]) + + def time_isin_empty(self): + self.ser.isin([]) + + class IsInFloat64: def setup(self): self.small = Series([1, 2], dtype=np.float64) @@ -90,6 +112,55 @@ def time_isin_long_series_long_values_floats(self): self.s_long_floats.isin(self.vals_long_floats) +class IsInLongSeriesLookUpDominates: + params = [ + ["int64", "int32", "float64", "float32", "object"], + [5, 1000], + ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], + ] + param_names = ["dtype", "MaxNumber", "series_type"] + + def setup(self, dtype, MaxNumber, series_type): + N = 10 ** 7 + if series_type == "random_hits": + np.random.seed(42) + array = np.random.randint(0, MaxNumber, N) + if series_type == "random_misses": + np.random.seed(42) + array = np.random.randint(0, MaxNumber, N) + MaxNumber + if series_type == "monotone_hits": + array = np.repeat(np.arange(MaxNumber), N // MaxNumber) + if series_type == "monotone_misses": + array = np.arange(N) + MaxNumber + self.series = Series(array).astype(dtype) + self.values = np.arange(MaxNumber).astype(dtype) + + def time_isin(self, dtypes, MaxNumber, series_type): + 
self.series.isin(self.values) + + +class IsInLongSeriesValuesDominate: + params = [ + ["int64", "int32", "float64", "float32", "object"], + ["random", "monotone"], + ] + param_names = ["dtype", "series_type"] + + def setup(self, dtype, series_type): + N = 10 ** 7 + if series_type == "random": + np.random.seed(42) + vals = np.random.randint(0, 10 * N, N) + if series_type == "monotone": + vals = np.arange(N) + self.values = vals.astype(dtype) + M = 10 ** 6 + 1 + self.series = Series(np.arange(M)).astype(dtype) + + def time_isin(self, dtypes, series_type): + self.series.isin(self.values) + + class NSort: params = ["first", "last", "all"] diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b1091ea7f60e4..c49742095e1d8 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -40,7 +40,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate && \ python -m pip install --no-deps -U pip wheel setuptools && \ pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis pytest-azurepipelines && \ - python setup.py build_ext -q -i -j2 && \ + python setup.py build_ext -q -j2 && \ python -m pip install --no-build-isolation -e . && \ pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml" displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 601a834d6306a..e510f4115b25f 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -34,7 +34,7 @@ jobs: - bash: | source activate pandas-dev conda list - python setup.py build_ext -q -i -j 4 + python setup.py build_ext -q -j 4 python -m pip install --no-build-isolation -e . displayName: 'Build' diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b5a6e32caa8e0..3eeee61f62a7e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -225,7 +225,7 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS02, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03)' ; echo $MSG + MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS01, SS02, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03)' ; echo $MSG $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS02,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03 RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/ci/deps/azure-39.yaml b/ci/deps/azure-39.yaml index 67edc83a9d738..c4c84e73fa684 100644 --- a/ci/deps/azure-39.yaml +++ b/ci/deps/azure-39.yaml @@ -15,3 +15,8 @@ dependencies: - numpy - python-dateutil - pytz + + # optional dependencies + - pytables + - scipy + - pyarrow=1.0 diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index e93a86910bf34..4e442b10482a7 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -34,7 +34,7 @@ dependencies: - pyarrow>=0.17 - pytables>=3.5.1 - scipy - - xarray=0.12.0 + - xarray=0.12.3 - xlrd - xlsxwriter - xlwt diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 9b553fbc81a03..78d24c814840a 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -25,7 +25,7 @@ PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then # GH#37455 windows py38 build appears to be running out of memory # skip collection of window tests - PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/" + PYTEST_CMD="$PYTEST_CMD 
--ignore=pandas/tests/window/ --ignore=pandas/tests/plotting/"
 fi
 
 echo $PYTEST_CMD
diff --git a/ci/setup_env.sh b/ci/setup_env.sh
index 8984fa2d9a9be..c36422884f2ec 100755
--- a/ci/setup_env.sh
+++ b/ci/setup_env.sh
@@ -108,6 +108,12 @@ fi
 echo "activate pandas-dev"
 source activate pandas-dev
 
+# Explicitly set an environment variable indicating that this is pandas' CI environment.
+#
+# This allows us to enable things like -Werror that shouldn't be activated in
+# downstream CI jobs that may also build pandas from source.
+export PANDAS_CI=1
+
 echo
 echo "remove any installed pandas package"
 echo "w/o removing anything else"
@@ -131,7 +137,7 @@ conda list pandas
 
 # Make sure any error below is reported as such
 echo "[Build extensions]"
-python setup.py build_ext -q -i -j2
+python setup.py build_ext -q -j2
 
 echo "[Updating pip]"
 python -m pip install --no-deps -U pip wheel setuptools
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 41b2b7405fcb5..3c5a88333be56 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -146,7 +146,7 @@ Creating a development environment
 ----------------------------------
 
 To test out code changes, you'll need to build pandas from source, which
-requires a C compiler and Python environment. If you're making documentation
+requires a C/C++ compiler and Python environment. If you're making documentation
 changes, you can skip to :ref:`contributing.documentation` but you won't be able
 to build the documentation locally before pushing your changes.
 
@@ -183,7 +183,7 @@ See https://www.jetbrains.com/help/pycharm/docker.html for details.
 Note that you might need to rebuild the C extensions if/when you merge with upstream/master using::
 
-    python setup.py build_ext --inplace -j 4
+    python setup.py build_ext -j 4
 
 .. _contributing.dev_c:
 
@@ -195,6 +195,13 @@ operations. To install pandas from source, you need to compile these C
 extensions, which means you need a C compiler. This process depends on which
 platform you're using.
 
+If you have set up your environment using ``conda``, the packages ``c-compiler``
+and ``cxx-compiler`` will install a fitting compiler for your platform that is
+compatible with the remaining conda packages. On Windows and macOS, you will
+also need to install the SDKs as they have to be distributed separately.
+These packages will be automatically installed by using ``pandas``'s
+``environment.yml``.
+
 **Windows**
 
 You will need `Build Tools for Visual Studio 2017
@@ -206,12 +213,33 @@ You will need `Build Tools for Visual Studio 2017
   scrolling down to "All downloads" -> "Tools for Visual Studio 2019".
   In the installer, select the "C++ build tools" workload.
 
+You can install the necessary components on the command line using
+`vs_buildtools.exe `_:
+
+.. code::
+
+    vs_buildtools.exe --quiet --wait --norestart --nocache ^
+        --installPath C:\BuildTools ^
+        --add "Microsoft.VisualStudio.Workload.VCTools;includeRecommended" ^
+        --add Microsoft.VisualStudio.Component.VC.v141 ^
+        --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 ^
+        --add Microsoft.VisualStudio.Component.Windows10SDK.17763
+
+To set up the right paths on the command line, call
+``"C:\BuildTools\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.16 10.0.17763.0``.
+
+**macOS**
 
-Information about compiler installation can be found here:
+To use the ``conda``-based compilers, you will need to install the
+Developer Tools using ``xcode-select --install``. 
Otherwise +information about compiler installation can be found here: https://devguide.python.org/setup/#macos -**Unix** +**Linux** + +For Linux-based ``conda`` installations, you won't have to install any +additional components outside of the conda environment. The instructions +below are only needed if your setup isn't based on conda environments. Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system:: @@ -243,11 +271,10 @@ Let us know if you have any difficulties by opening an issue or reaching out on Creating a Python environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Now that you have a C compiler, create an isolated pandas development -environment: +Now create an isolated pandas development environment: -* Install either `Anaconda `_ or `miniconda - `_ +* Install either `Anaconda `_, `miniconda + `_, or `miniforge `_ * Make sure your conda is up to date (``conda update conda``) * Make sure that you have :ref:`cloned the repository ` * ``cd`` to the pandas source directory @@ -268,7 +295,7 @@ We'll now kick off a three-step process: source activate pandas-dev # Build and install pandas - python setup.py build_ext --inplace -j 4 + python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 At this point you should be able to import pandas from your locally built version:: @@ -315,7 +342,7 @@ You'll need to have at least Python 3.6.1 installed on your system. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 4 + python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 **Unix**/**macOS with pyenv** @@ -339,7 +366,7 @@ Consult the docs for setting up pyenv `here `__. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 4 + python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 **Windows** @@ -365,7 +392,7 @@ should already exist. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 4 + python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 Creating a branch diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index ced5b686b8246..f8e6bda2085d8 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -35,7 +35,7 @@ We will not introduce new deprecations in patch releases. Deprecations will only be enforced in **major** releases. For example, if a behavior is deprecated in pandas 1.2.0, it will continue to work, with a warning, for all releases in the 1.x series. The behavior will change and the -deprecation removed in the next next major release (2.0.0). +deprecation removed in the next major release (2.0.0). .. note:: diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index be32c5c14fdfc..e88875a9f679c 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -178,6 +178,16 @@ D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle & Google Colab. Here are some demos of the `grid `__ and `chart-builder `__. +`hvplot `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews `__. +It can be loaded as a native pandas plotting backend via + +.. 
code:: python + + pd.set_option("plotting.backend", "hvplot") + .. _ecosystem.ide: IDE diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index df481e8c986f7..c823ad01f10bf 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -284,7 +284,7 @@ pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O s3fs 0.4.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) -xarray 0.12.0 pandas-like API for N-dimensional data +xarray 0.12.3 pandas-like API for N-dimensional data xclip Clipboard I/O on linux xlrd 1.2.0 Excel reading xlwt 1.3.0 Excel writing diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 9d5649c37e92f..f7c5eaf242b34 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -30,7 +30,6 @@ public functions related to data types in pandas. series frame arrays - panel indexing offset_frequency window diff --git a/doc/source/reference/panel.rst b/doc/source/reference/panel.rst deleted file mode 100644 index 37d48c2dadf2e..0000000000000 --- a/doc/source/reference/panel.rst +++ /dev/null @@ -1,10 +0,0 @@ -{{ header }} - -.. _api.panel: - -===== -Panel -===== -.. currentmodule:: pandas - -``Panel`` was removed in 0.25.0. For prior documentation, see the `0.24 documentation `_ diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 24a47336b0522..e80dc1b57ff80 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -36,6 +36,7 @@ Style application Styler.where Styler.format Styler.set_precision + Styler.set_td_classes Styler.set_table_styles Styler.set_table_attributes Styler.set_caption diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index f05eb9cc40402..17d1809638d61 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -205,3 +205,10 @@ parameter: - ``min`` : lowest rank in the group - ``max`` : highest rank in the group - ``first`` : ranks assigned in the order they appear in the array + +.. _computation.windowing: + +Windowing functions +~~~~~~~~~~~~~~~~~~~ + +See :ref:`the window operations user guide ` for an overview of windowing functions. diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 905877cca61db..f2bb99dd2ebc0 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -439,7 +439,7 @@ Data Classes as introduced in `PEP557 can be passed into the DataFrame constructor. Passing a list of dataclasses is equivalent to passing a list of dictionaries. -Please be aware, that that all values in the list should be dataclasses, mixing +Please be aware, that all values in the list should be dataclasses, mixing types in the list would result in a TypeError. .. ipython:: python diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index e19dace572e59..d6081155b58db 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -524,6 +524,15 @@ index are the group names and whose values are the sizes of each group. grouped.describe() +Another aggregation example is to compute the number of unique values of each group. This is similar to the ``value_counts`` function, except that it only counts unique values. + +.. 
ipython:: python
+
+    ll = [['foo', 1], ['foo', 2], ['foo', 2], ['bar', 1], ['bar', 1]]
+    df4 = pd.DataFrame(ll, columns=["A", "B"])
+    df4
+    df4.groupby("A")["B"].nunique()
+
 .. note::
 
     Aggregation functions **will not** return the groups that you are aggregating over
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index 2dd8f0cb212b1..817ea3445f995 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -584,48 +584,20 @@ without using a temporary variable.
    (bb.groupby(['year', 'team']).sum()
      .loc[lambda df: df['r'] > 100])
 
-.. _indexing.deprecate_ix:
 
-IX indexer is deprecated
-------------------------
-
-.. warning::
-
-   .. versionchanged:: 1.0.0
-
-      The ``.ix`` indexer was removed, in favor of the more strict ``.iloc`` and ``.loc`` indexers.
+.. _combining_positional_and_label_based_indexing:
 
-``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide
-to index *positionally* OR via *labels* depending on the data type of the index. This has caused quite a
-bit of user confusion over the years.
+Combining positional and label-based indexing
+---------------------------------------------
 
-The recommended methods of indexing are:
-
-* ``.loc`` if you want to *label* index.
-* ``.iloc`` if you want to *positionally* index.
+If you wish to get the 0th and the 2nd elements from the index in the 'A' column, you can do:
 
 .. ipython:: python
 
     dfd = pd.DataFrame({'A': [1, 2, 3],
                         'B': [4, 5, 6]},
                        index=list('abc'))
-    dfd
-
-Previous behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column.
-
-.. code-block:: ipython
-
-    In [3]: dfd.ix[[0, 2], 'A']
-    Out[3]:
-    a    1
-    c    3
-    Name: A, dtype: int64
-
-Using ``.loc``. Here we will select the appropriate indexes from the index, then use *label* indexing.
-
-.. ipython:: python
-
     dfd.loc[dfd.index[[0, 2]], 'A']
 
 This can also be expressed using ``.iloc``, by explicitly getting locations
 on the indexers, and using
@@ -1158,6 +1130,40 @@ Mask
 
    s.mask(s >= 0)
    df.mask(df >= 0)
 
+.. _indexing.np_where:
+
+Setting with enlargement conditionally using :func:`numpy`
+----------------------------------------------------------
+
+An alternative to :meth:`~pandas.DataFrame.where` is to use :func:`numpy.where`.
+Combined with setting a new column, you can use it to enlarge a dataframe where the
+values are determined conditionally.
+
+Consider you have two choices to choose from in the following dataframe, and you want to
+set a new column color to 'green' when the second column has 'Z'. You can do the
+following:
+
+.. ipython:: python
+
+    df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')})
+    df['color'] = np.where(df['col2'] == 'Z', 'green', 'red')
+    df
+
+If you have multiple conditions, you can use :func:`numpy.select` to achieve that. Say
+corresponding to three conditions there are three choices of colors, with a fourth color
+as a fallback, you can do the following.
+
+.. ipython:: python
+
+    conditions = [
+        (df['col2'] == 'Z') & (df['col1'] == 'A'),
+        (df['col2'] == 'Z') & (df['col1'] == 'B'),
+        (df['col1'] == 'B')
+    ]
+    choices = ['yellow', 'blue', 'purple']
+    df['color'] = np.select(conditions, choices, default='black')
+    df
+
 .. 
_indexing.query: The :meth:`~pandas.DataFrame.query` Method diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index be38736f493b5..2d5673fe53be3 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -117,7 +117,7 @@ dtype if needed. # coerce when needed s + 0.01 -These dtypes can operate as part of of ``DataFrame``. +These dtypes can operate as part of ``DataFrame``. .. ipython:: python diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index f1a28dc30dd68..d8998a9a0a6e1 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -194,7 +194,7 @@ behavior: }, index=[2, 3, 6, 7], ) - result = pd.concat([df1, df4], axis=1, sort=False) + result = pd.concat([df1, df4], axis=1) .. ipython:: python @@ -204,13 +204,6 @@ behavior: p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); plt.close("all"); -.. warning:: - - The default behavior with ``join='outer'`` is to sort the other axis - (columns in this case). In a future version of pandas, the default will - be to not sort. We specified ``sort=False`` to opt in to the new - behavior now. - Here is the same thing with ``join='inner'``: .. ipython:: python diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 12dd72f761408..24f344488d1ca 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -793,7 +793,8 @@ "source": [ "The next option you have are \"table styles\".\n", "These are styles that apply to the table as a whole, but don't look at the data.\n", - "Certain stylings, including pseudo-selectors like `:hover` can only be used this way." + "Certain stylings, including pseudo-selectors like `:hover` can only be used this way.\n", + "These can also be used to set specific row or column based class selectors, as will be shown." ] }, { @@ -831,9 +832,32 @@ "The value for `props` should be a list of tuples of `('attribute', 'value')`.\n", "\n", "`table_styles` are extremely flexible, but not as fun to type out by hand.\n", - "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here." + "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here.\n", + "\n", + "`table_styles` can be used to add column and row based class descriptors. For large tables this can increase performance by avoiding repetitive individual css for each cell, and it can also simplify style construction in some cases.\n", + "If `table_styles` is given as a dictionary each key should be a specified column or index value and this will map to specific class CSS selectors of the given column or row.\n", + "\n", + "Note that `Styler.set_table_styles` will overwrite existing styles but can be chained by setting the `overwrite` argument to `False`." 
] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "html = html.set_table_styles({\n", + " 'B': [dict(selector='', props=[('color', 'green')])],\n", + " 'C': [dict(selector='td', props=[('color', 'red')])], \n", + " }, overwrite=False)\n", + "html" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, { "cell_type": "markdown", "metadata": {}, @@ -922,10 +946,12 @@ "- DataFrame only `(use Series.to_frame().style)`\n", "- The index and columns must be unique\n", "- No large repr, and performance isn't great; this is intended for summary DataFrames\n", - "- You can only style the *values*, not the index or columns\n", + "- You can only style the *values*, not the index or columns (except with `table_styles` above)\n", "- You can only apply styles, you can't insert new HTML entities\n", "\n", - "Some of these will be addressed in the future.\n" + "Some of these will be addressed in the future.\n", + "Performance can suffer when adding styles to each cell in a large DataFrame.\n", + "It is recommended to apply table or column based styles where possible to limit overall HTML length, as well as setting a shorter UUID to avoid unnecessary repeated data transmission. \n" ] }, { diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 169c0cfbbb87e..354c510b843dd 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -588,10 +588,12 @@ would include matching times on an included date: .. warning:: - Indexing ``DataFrame`` rows with strings is deprecated in pandas 1.2.0 and will be removed in a future version. Use ``frame.loc[dtstring]`` instead. + Indexing ``DataFrame`` rows with a *single* string with getitem (e.g. ``frame[dtstring]``) + is deprecated starting with pandas 1.2.0 (given the ambiguity whether it is indexing + the rows or selecting a column) and will be removed in a future version. The equivalent + with ``.loc`` (e.g. ``frame.loc[dtstring]``) is still supported. .. ipython:: python - :okwarning: dft = pd.DataFrame( np.random.randn(100000, 1), @@ -599,34 +601,30 @@ would include matching times on an included date: index=pd.date_range("20130101", periods=100000, freq="T"), ) dft - dft["2013"] + dft.loc["2013"] This starts on the very first time in the month, and includes the last date and time for the month: .. ipython:: python - :okwarning: dft["2013-1":"2013-2"] This specifies a stop time **that includes all of the times on the last day**: .. ipython:: python - :okwarning: dft["2013-1":"2013-2-28"] This specifies an **exact** stop time (and is not the same as the above): .. ipython:: python - :okwarning: dft["2013-1":"2013-2-28 00:00:00"] We are stopping on the included end-point as it is part of the index: .. ipython:: python - :okwarning: dft["2013-1-15":"2013-1-15 12:30:00"] @@ -652,7 +650,6 @@ We are stopping on the included end-point as it is part of the index: Slicing with string indexing also honors UTC offset. .. ipython:: python - :okwarning: df = pd.DataFrame([0], index=pd.DatetimeIndex(["2019-01-01"], tz="US/Pacific")) df @@ -704,15 +701,14 @@ If index resolution is second, then the minute-accurate timestamp gives a series_second.index.resolution series_second["2011-12-31 23:59"] -If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``[]`` as well. +If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``.loc[]`` as well. .. 
ipython:: python - :okwarning: dft_minute = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index ) - dft_minute["2011-12-31 23"] + dft_minute.loc["2011-12-31 23"] .. warning:: @@ -2080,7 +2076,6 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI Passing a string representing a lower frequency than ``PeriodIndex`` returns partial sliced data. .. ipython:: python - :okwarning: ps["2011"] @@ -2090,7 +2085,7 @@ Passing a string representing a lower frequency than ``PeriodIndex`` returns par index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"), ) dfp - dfp["2013-01-01 10H"] + dfp.loc["2013-01-01 10H"] As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59. diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index 47ef1e9c8c4d7..05f8be091fa25 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -43,7 +43,7 @@ Concept Method Returned Object Rolling window ``rolling`` ``Rolling`` Yes Yes Weighted window ``rolling`` ``Window`` No No Expanding window ``expanding`` ``Expanding`` No Yes -Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No No +Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2) ============================= ================= =========================== =========================== ======================== As noted above, some operations support specifying a window based on a time offset: diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 4de76510c6bc1..c12adb2f1334f 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -419,7 +419,7 @@ Bug fixes ~~~~~~~~~ - Plotting functions now raise a ``TypeError`` before trying to plot anything - if the associated objects have have a dtype of ``object`` (:issue:`1818`, + if the associated objects have a dtype of ``object`` (:issue:`1818`, :issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to numeric arrays if possible so that you can still plot, for example, an object array with floats. This happens before any drawing takes place which @@ -430,8 +430,8 @@ Bug fixes - ``Series.str`` now supports iteration (:issue:`3638`). You can iterate over the individual elements of each string in the ``Series``. Each iteration yields - yields a ``Series`` with either a single character at each index of the - original ``Series`` or ``NaN``. For example, + a ``Series`` with either a single character at each index of the original + ``Series`` or ``NaN``. For example, .. 
ipython:: python :okwarning: diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 5b279a4973963..b59938a9b9c9b 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -923,7 +923,7 @@ Bug fixes - ``HDFStore.select_as_multiple`` handles start and stop the same way as ``select`` (:issue:`6177`) - ``HDFStore.select_as_coordinates`` and ``select_column`` works with a ``where`` clause that results in filters (:issue:`6177`) - Regression in join of non_unique_indexes (:issue:`6329`) -- Issue with groupby ``agg`` with a single function and a a mixed-type frame (:issue:`6337`) +- Issue with groupby ``agg`` with a single function and a mixed-type frame (:issue:`6337`) - Bug in ``DataFrame.replace()`` when passing a non- ``bool`` ``to_replace`` argument (:issue:`6332`) - Raise when trying to align on different levels of a MultiIndex assignment (:issue:`3738`) diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index 95ca925f18692..b5b25796fea73 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -136,7 +136,7 @@ Enhancements - Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here ` for limitations of categorical variables exported to Stata data files. - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files. -- Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. +- Added ability to export Categorical data to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. - Added support for ``searchsorted()`` on ``Categorical`` class (:issue:`8420`). Other enhancements: diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index 39767684c01d0..269854111373f 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -6,7 +6,7 @@ Version 0.16.1 (May 11, 2015) {{ header }} -This is a minor bug-fix release from 0.16.0 and includes a a large number of +This is a minor bug-fix release from 0.16.0 and includes a large number of bug fixes along several new features, enhancements, and performance improvements. We recommend that all users upgrade to this version. @@ -72,7 +72,7 @@ setting the index of a ``DataFrame/Series`` with a ``category`` dtype would conv Out[4]: Index(['c', 'a', 'b'], dtype='object') -setting the index, will create create a ``CategoricalIndex`` +setting the index, will create a ``CategoricalIndex`` .. 
code-block:: ipython diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index 194bb61f2c1c8..37e8c64ea9ced 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -6,7 +6,7 @@ Version 0.16.2 (June 12, 2015) {{ header }} -This is a minor bug-fix release from 0.16.1 and includes a a large number of +This is a minor bug-fix release from 0.16.1 and includes a large number of bug fixes along some new features (:meth:`~DataFrame.pipe` method), enhancements, and performance improvements. We recommend that all users upgrade to this version. diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 636414cdab8d8..829c04dac9f2d 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -610,7 +610,7 @@ Subtraction by ``Timedelta`` in a ``Series`` by a ``Timestamp`` works (:issue:`1 pd.Timestamp('2012-01-01') - ser -``NaT.isoformat()`` now returns ``'NaT'``. This change allows allows +``NaT.isoformat()`` now returns ``'NaT'``. This change allows ``pd.Timestamp`` to rehydrate any timestamp like object from its isoformat (:issue:`12300`). diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 8ae5ea5726fe9..2cb8e13e9a18a 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -1167,7 +1167,7 @@ Other API changes - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`) - ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) -- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`) +- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`) - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) @@ -1315,7 +1315,7 @@ The recommended methods of indexing are: - ``.loc`` if you want to *label* index - ``.iloc`` if you want to *positionally* index. -Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code :ref:`here `. +Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code `here `__. .. 
ipython:: python @@ -1663,11 +1663,11 @@ Indexing - Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`) - Bug in ``.reset_index()`` when raising error for index name already present in ``MultiIndex`` columns (:issue:`16120`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) -- Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`) +- Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`) - Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`) -- Bug in in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) +- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) - Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`) IO diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 6035b89aa8643..1bbbbdc7e5410 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -50,7 +50,7 @@ Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, su dtypes, including extension dtypes such as datetime with timezones. This functionality depends on either the `pyarrow `__ or `fastparquet `__ library. -For more details, see see :ref:`the IO docs on Parquet `. +For more details, see :ref:`the IO docs on Parquet `. .. 
_whatsnew_0210.enhancements.infer_objects: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 9ef50045d5b5e..ce784231a47d2 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1622,7 +1622,7 @@ Timedelta - Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) - Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) - Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) -- Bug in :class:`Series` with numeric dtype when adding or subtracting an an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) +- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) - Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) - Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) - Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) @@ -1868,7 +1868,7 @@ Reshaping - :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) -- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) - Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) - Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) - Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index 8ff688eaa91e7..253ca4d4188e5 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -15,7 +15,7 @@ New features ~~~~~~~~~~~~ - :ref:`Added ` ``melt`` function to ``pandas.core.reshape`` - :ref:`Added ` ``level`` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`) -- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to to DataFrame (:issue:`296`) +- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to DataFrame (:issue:`296`) - :ref:`Added ` ``Series.isin`` function which checks if each value is contained in a passed sequence (:issue:`289`) - :ref:`Added ` ``float_format`` option to ``Series.to_string`` - :ref:`Added ` ``skip_footer`` (:issue:`291`) and ``converters`` (:issue:`343`) options to ``read_csv`` and ``read_table`` diff --git 
a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index b34c2a5c6a07c..781054fc4de7c 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -81,7 +81,7 @@ Time Series changes and improvements timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time zone set will be localized to local time. Time zone conversions are therefore essentially free. User needs to know very little about pytz library now; only - time zone names as as strings are required. Time zone-aware timestamps are + time zone names as strings are required. Time zone-aware timestamps are equal if and only if their UTC timestamps match. Operations between time zone-aware time series with different time zones will result in a UTC-indexed time series. diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 323342cb43950..46c4ad4f35fe4 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -14,10 +14,15 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) +- Fixed regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) - Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`) - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) -- +- Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`) +- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) +- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) +- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) +- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) +- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). .. --------------------------------------------------------------------------- @@ -25,10 +30,15 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) -- Bug in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) -- Bug in :class:`RollingGroupby` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) -- +- Bug in pytables methods in python 3.9 (:issue:`38041`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_115.other: + +Other +~~~~~ +- Only set ``-Werror`` as a compiler flag in the CI jobs (:issue:`33315`, :issue:`33314`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 54d8ba1edea39..5d36c52da9f0d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -22,7 +22,7 @@ Optionally disallow duplicate labels control whether the index or columns can contain duplicate labels (:issue:`28394`). 
This can be used to prevent accidental introduction of duplicate labels, which can affect downstream operations. -By default, duplicates continue to be allowed +By default, duplicates continue to be allowed. .. ipython:: python @@ -84,7 +84,7 @@ Support for binary file handles in ``to_csv`` :meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`) with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`). -If Pandas does not automatically detect whether the file handle is opened in binary or text mode, +If pandas does not automatically detect whether the file handle is opened in binary or text mode, it is necessary to provide ``mode="wb"``. For example: @@ -104,7 +104,7 @@ Support for short caption and table position in ``to_latex`` a floating table position (:issue:`35281`) and a short caption (:issue:`36267`). -New keyword ``position`` is implemented to set the position. +The keyword ``position`` has been added to set the position. .. ipython:: python @@ -112,9 +112,9 @@ New keyword ``position`` is implemented to set the position. table = data.to_latex(position='ht') print(table) -Usage of keyword ``caption`` is extended. +Usage of the keyword ``caption`` has been extended. Besides taking a single string as an argument, -one can optionally provide a tuple of ``(full_caption, short_caption)`` +one can optionally provide a tuple ``(full_caption, short_caption)`` to add a short caption macro. .. ipython:: python @@ -141,12 +141,12 @@ parser by default should have no impact on performance. (:issue:`17154`) Experimental nullable data types for float data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`, -an extension data type dedicated to floating point data that can hold the +We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`. +These are extension data types dedicated to floating point data that can hold the ``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`). While the default float data type already supports missing values using ``np.nan``, -this new data type uses ``pd.NA`` (and its corresponding behaviour) as missing +these new data types use ``pd.NA`` (and its corresponding behaviour) as the missing value indicator, in line with the already existing nullable :ref:`integer ` and :ref:`boolean ` data types. @@ -180,7 +180,7 @@ Alternatively, you can also use the dtype object: .. warning:: - Experimental: the new floating data types are currently experimental, and its + Experimental: the new floating data types are currently experimental, and their behaviour or API may still change without warning. Especially the behaviour regarding NaN (distinct from NA missing values) is subject to change. @@ -189,8 +189,8 @@ Alternatively, you can also use the dtype object: Index/column name preservation when aggregating ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When aggregating using :meth:`concat` or the :class:`DataFrame` constructor, Pandas -will attempt to preserve index (and column) names whenever possible (:issue:`35847`). +When aggregating using :meth:`concat` or the :class:`DataFrame` constructor, pandas +will now attempt to preserve index and column names whenever possible (:issue:`35847`). In the case where all inputs share a common name, this name will be assigned to the result. When the input names do not all agree, the result will be unnamed. 
Here is an example where the index name is preserved: @@ -204,37 +204,59 @@ example where the index name is preserved: The same is true for :class:`MultiIndex`, but the logic is applied separately on a level-by-level basis. +.. _whatsnew_120.groupby_ewm: + +Groupby supports EWM operations directly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`.DataFrameGroupBy` now supports exponentially weighted window operations directly (:issue:`16037`). + +.. ipython:: python + + df = pd.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': range(4)}) + df + df.groupby('A').ewm(com=1.0).mean() + +Additionally ``mean`` supports execution via `Numba `__ with +the ``engine`` and ``engine_kwargs`` arguments. Numba must be installed as an optional dependency +to use this feature. + .. _whatsnew_120.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ -- Added ``day_of_week``(compatibility alias ``dayofweek``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex`` (:issue:`9605`) -- Added ``day_of_year`` (compatibility alias ``dayofyear``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex`` (:issue:`9605`) -- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) +- Added ``day_of_week`` (compatibility alias ``dayofweek``) property to :class:`Timestamp`, :class:`.DatetimeIndex`, :class:`Period`, :class:`PeriodIndex` (:issue:`9605`) +- Added ``day_of_year`` (compatibility alias ``dayofyear``) property to :class:`Timestamp`, :class:`.DatetimeIndex`, :class:`Period`, :class:`PeriodIndex` (:issue:`9605`) +- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a Series or DataFrame (:issue:`28394`) - :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - :meth:`DataFrame.hist` now supports time series (datetime) data (:issue:`32590`) -- ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`) -- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) -- :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) -- +- :meth:`.Styler.set_table_styles` now allows the direct styling of rows and columns and can be chained (:issue:`35607`) +- :class:`.Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) +- :meth:`.Rolling.mean` and :meth:`.Rolling.sum` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) +- :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) +- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the 
extension types when possible (:issue:`23743`). +- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`). - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) -- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`). -- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`) -- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`) +- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) +- Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`) +- :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`) +- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`) - :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`) -- :class:`DataFrame` now supports ``divmod`` operation (:issue:`37165`) +- :class:`DataFrame` now supports the ``divmod`` operation (:issue:`37165`) - :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`) -- :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`) -- :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) +- :class:`.Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`) +- :class:`.DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`) - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) -- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) -- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) -- Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) +- Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`) +- Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) +- Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) +- Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) +- Implement method ``cross`` for :meth:`DataFrame.merge` and 
:meth:`DataFrame.join` (:issue:`5401`) .. --------------------------------------------------------------------------- @@ -252,7 +274,7 @@ determines whether to exclude object-dtype columns on a column-by-column basis, instead of checking if *all* object-dtype columns can be considered boolean. This prevents pathological behavior where applying the reduction on a subset -of columns could result in a larger :class:`Series` result. See (:issue:`37799`). +of columns could result in a larger Series result. See (:issue:`37799`). .. ipython:: python @@ -284,6 +306,63 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`) In [6]: df[["B", "C"]].all(bool_only=True) +Other DataFrame reductions with ``numeric_only=None`` will also avoid +this pathological behavior (:issue:`37827`): + +.. ipython:: python + + df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) + + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.mean() + Out[3]: Series([], dtype: float64) + + In [4]: df[["A"]].mean() + Out[4]: + A 1.0 + dtype: float64 + +*New behavior*: + +.. ipython:: python + + df.mean() + + df[["A"]].mean() + +Moreover, DataFrame reductions with ``numeric_only=None`` will now be +consistent with their Series counterparts. In particular, for +reductions where the Series method raises ``TypeError``, the +DataFrame reduction will now consider that column non-numeric +instead of casting to a NumPy array which may have different semantics (:issue:`36076`, +:issue:`28949`, :issue:`21020`). + +.. ipython:: python + + ser = pd.Series([0, 1], dtype="category", name="A") + df = ser.to_frame() + + +*Previous behavior*: + +.. code-block:: ipython + + In [5]: df.any() + Out[5]: + A True + dtype: bool + +*New behavior*: + +.. ipython:: python + + df.any() + + .. _whatsnew_120.api_breaking.python: Increased minimum version for Python @@ -370,11 +449,11 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ -- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting :class:`DataFrame` on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, count of missing values is no longer the last in the list of duplicate counts, and its position corresponds to the position in the original :class:`Series`. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beggining. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`) -- Passing an invalid ``fill_value`` to :meth:`Categorical.take`, :meth:`DatetimeArray.take`, :meth:`TimedeltaArray.take`, :meth:`PeriodArray.take` now raises ``TypeError`` instead of ``ValueError`` (:issue:`37733`) -- Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises ``TypeError`` instead of ``ValueError`` (:issue:`37733`) +- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. 
This will affect sort order when sorting a DataFrame on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily last in the list of duplicate counts. Instead, its position corresponds to the position in the original Series. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`) +- Passing an invalid ``fill_value`` to :meth:`Categorical.take`, :meth:`.DatetimeArray.take`, :meth:`TimedeltaArray.take`, or :meth:`PeriodArray.take` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) +- Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) - Passing an invalid value to :meth:`IntervalIndex.insert` or :meth:`CategoricalIndex.insert` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) -- Attempting to reindex a :class:`Series` with a :class:`CategoricalIndex` with an invalid ``fill_value`` now raises ``TypeError`` instead of ``ValueError`` (:issue:`37733`) +- Attempting to reindex a Series with a :class:`CategoricalIndex` with an invalid ``fill_value`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) .. --------------------------------------------------------------------------- @@ -383,24 +462,31 @@ Other API changes Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) -- Deprecated parameter ``dtype`` in :meth:`~Index.copy` on method all index classes. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) -- Deprecated parameters ``levels`` and ``codes`` in :meth:`~MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) +- Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) +- Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) - :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) -- The :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`) -- Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`) -- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) +- The method :meth:`Index.to_native_types` is deprecated. 
Use ``.astype(str)`` instead (:issue:`28867`) +- Deprecated indexing :class:`DataFrame` rows with a single datetime-like string as ``df[string]`` + (given the ambiguity whether it is indexing the rows or selecting a column), use + ``df.loc[string]`` instead (:issue:`36179`) +- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`.DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) - The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set. (:issue:`24804`) - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`) - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) +- Using "outer" ufuncs on DataFrames to return 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`) - Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`) - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) - Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`) - :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`) - :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`) - :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`) - +- Partial slicing on unordered :class:`.DatetimeIndex` objects with keys that are not in the index is deprecated and will be removed in a future version (:issue:`18531`) +- The ``how`` keyword in :meth:`PeriodIndex.astype` is deprecated and will be removed in a future version, use ``index.to_timestamp(how=how)`` instead (:issue:`37982`) +- Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) +- The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) +- The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) .. 
--------------------------------------------------------------------------- @@ -411,20 +497,22 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`) -- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) -- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) -- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) -- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) -- Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) -- Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) +- Performance improvement in :meth:`.GroupBy.agg` with the ``numba`` engine (:issue:`35759`) +- Performance improvements when creating :meth:`Series.map` from a huge dictionary (:issue:`34717`) +- Performance improvement in :meth:`.GroupBy.transform` with the ``numba`` engine (:issue:`36240`) +- :class:`.Styler` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) +- Performance improvement in :func:`to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) +- Performance improvement in setting values on an :class:`IntervalArray` (:issue:`36310`) - The internal index method :meth:`~Index._shallow_copy` now makes the new index and original index share cached attributes, avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) -- Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`) -- Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`) +- Performance improvement in :meth:`.RollingGroupby.count` (:issue:`35625`) +- Small performance decrease to :meth:`.Rolling.min` and :meth:`.Rolling.max` for fixed windows (:issue:`36567`) - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`) -- faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`) +- Faster ``dir`` calls when the object has many index labels, e.g. ``dir(ser)`` (:issue:`37450`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) -- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`) +- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`) +- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`) +- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`) .. 
--------------------------------------------------------------------------- @@ -435,35 +523,41 @@ Bug fixes Categorical ^^^^^^^^^^^ -- :meth:`Categorical.fillna` will always return a copy, will validate a passed fill value regardless of whether there are any NAs to fill, and will disallow a ``NaT`` as a fill value for numeric categories (:issue:`36530`) +- :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`) - Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) +- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) +- Datetimelike ^^^^^^^^^^^^ -- Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) +- Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) -- Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) -- Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`) -- Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) -- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`) -- Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) -- Bug in :meth:`DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`) -- Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) -- :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) -- Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) -- :meth:`to_json` and :meth:`read_json` now implements timezones parsing when orient structure is 'table'. -- :meth:`astype` now attempts to convert to 'datetime64[ns, tz]' directly from 'object' with inferred timezone from string (:issue:`35973`). 
-- Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) -- Bug in :meth:`DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`) -- Bug in adding a :class:`BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`) +- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) +- Bug in :meth:`.DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`.DatetimeIndex` (:issue:`35690`) +- Bug in :meth:`.DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) +- Bug in :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or :class:`Period` dtype placement of ``NaT`` values being inconsistent with NumPy (:issue:`36176`, :issue:`36254`) +- Inconsistency in :class:`.DatetimeArray`, :class:`.TimedeltaArray`, and :class:`.PeriodArray` method ``__setitem__`` casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) +- Bug in :meth:`.DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`) +- Bug in :class:`.DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) +- :class:`Timestamp` and :class:`.DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) +- Bug in :meth:`.DatetimeIndex.equals` and :meth:`.TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) +- :meth:`Series.to_json`, :meth:`DataFrame.to_json`, and :meth:`read_json` now implement timezone parsing when orient structure is ``table`` (:issue:`35973`) +- :meth:`astype` now attempts to convert to ``datetime64[ns, tz]`` directly from ``object`` with inferred timezone from string (:issue:`35973`) +- Bug in :meth:`.TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) +- Bug in :meth:`.DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`) +- Bug in adding a :class:`.BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`) - Bug in :func:`to_datetime` with a read-only array incorrectly raising (:issue:`34857`) +- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`) +- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`) +- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`) Timedelta ^^^^^^^^^ -- Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`) -- Bug in parsing of 
ISO 8601 durations in :class:`Timedelta`, :meth:`pd.to_datetime` (:issue:`37159`, fixes :issue:`29773` and :issue:`36204`) +- Bug in :class:`.TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`) +- Bug in parsing of ISO 8601 durations in :class:`Timedelta` and :func:`to_datetime` (:issue:`29773`, :issue:`36204`) - Bug in :func:`to_timedelta` with a read-only array incorrectly raising (:issue:`34857`) +- Bug in :class:`Timedelta` incorrectly truncating to sub-second portion of a string input when it has precision higher than nanoseconds (:issue:`36738`) Timezones ^^^^^^^^^ @@ -477,17 +571,17 @@ Numeric - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) - Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`) -- Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) -- Bug in :meth:`pd._testing.assert_almost_equal` was incorrect for complex numeric types (:issue:`28235`) +- Bug in :class:`Series` where two Series each have a :class:`.DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) +- Bug in :mod:`pandas.testing` module functions when used with ``check_exact=False`` on complex numeric types (:issue:`28235`) - Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`) - Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`) -- Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`) +- Bug in :class:`.IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`) - Bug in :class:`MultiIndex` comparison with tuple incorrectly treating tuple as array-like (:issue:`21517`) - Bug in :meth:`DataFrame.diff` with ``datetime64`` dtypes including ``NaT`` values failing to fill ``NaT`` results correctly (:issue:`32441`) - Bug in :class:`DataFrame` arithmetic ops incorrectly accepting keyword arguments (:issue:`36843`) -- Bug in :class:`IntervalArray` comparisons with :class:`Series` not returning :class:`Series` (:issue:`36908`) +- Bug in :class:`.IntervalArray` comparisons with :class:`Series` not returning Series (:issue:`36908`) - Bug in :class:`DataFrame` allowing arithmetic operations with list of array-likes with undefined results. 
Behavior changed to raising ``ValueError`` (:issue:`36702`) -- Bug in :meth:`DataFrame.std`` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) +- Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`) Conversion @@ -499,38 +593,53 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.to_string`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` adding a leading space when ``index=False`` (:issue:`24980`) -- Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype :class:`Series` containing only numeric strings and ``NA`` (:issue:`37262`) +- Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype Series containing only numeric strings and ``NA`` (:issue:`37262`) - Interval ^^^^^^^^ + +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Interval` dtypes would be converted to object dtypes (:issue:`34871`) - Bug in :meth:`IntervalIndex.take` with negative indices and ``fill_value=None`` (:issue:`37330`) -- +- Bug in :meth:`IntervalIndex.putmask` with datetime-like dtype incorrectly casting to object dtype (:issue:`37968`) +- Bug in :meth:`IntervalArray.astype` incorrectly dropping dtype information with a :class:`CategoricalDtype` object (:issue:`37984`) - Indexing ^^^^^^^^ -- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) +- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) -- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) +- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp``. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. 
(:issue:`32334`)
- Bug in :meth:`DataFrame.reset_index` was incorrectly raising a ``ValueError`` for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`)
- Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`)
- Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc.__getitem__` with :class:`IntervalIndex` columns and a numeric indexer (:issue:`26490`)
- Bug in :meth:`Series.loc.__getitem__` with a non-unique :class:`MultiIndex` and an empty-list indexer (:issue:`13691`)
-- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` with a level named "0" (:issue:`37194`)
+- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` and a level named ``"0"`` (:issue:`37194`)
- Bug in :meth:`Series.__getitem__` when using an unsigned integer array as an indexer giving incorrect results or segfaulting instead of raising ``KeyError`` (:issue:`37218`)
- Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`)
-- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when numeric label was given for object :class:`Index` although label was in :class:`Index` (:issue:`26491`)
-- Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from :class:`MultiIndex` (:issue:`27104`)
+- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raising when the index was of ``object`` dtype and the given numeric label was in the index (:issue:`26491`)
+- Bug in :meth:`DataFrame.loc` returning the requested key plus missing values when ``loc`` was applied to a single level of a :class:`MultiIndex` (:issue:`27104`)
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`)
+- Bug in :meth:`DataFrame.loc.__setitem__` expanding an empty :class:`DataFrame` with mixed dtypes (:issue:`37932`)
- Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`)
+- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for an empty DataFrame with ``tolerance`` not ``None`` or ``method="nearest"`` (:issue:`27315`)
+- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`)
+- Bug on inserting a boolean label into a :class:`DataFrame` with numeric :class:`Index` columns incorrectly casting to integer (:issue:`36319`)
+- Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`)
+- Bug in :meth:`MultiIndex.drop` not raising if labels are only partially found (:issue:`37820`)
+- Bug in :meth:`DataFrame.loc` not raising ``KeyError`` when a missing combination was given with ``slice(None)`` for the remaining levels (:issue:`19556`)
+- Bug in :meth:`DataFrame.loc` raising ``TypeError`` when a non-integer slice was given to select values from a :class:`MultiIndex` (:issue:`25165`, :issue:`24263`)
+- Bug in :meth:`DataFrame.loc` returning and assigning elements in the wrong order when the indexer is ordered differently than the :class:`MultiIndex` being filtered (:issue:`31330`, :issue:`34603`)
+- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only
one level (:issue:`29749`)
+- Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising a blank ``KeyError`` that did not name the missing keys for :class:`IntervalIndex` (:issue:`27365`)

Missing
^^^^^^^
-- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`)
+- Bug in :meth:`.SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`)
+- Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`)
-

MultiIndex
^^^^^^^^^^
@@ -539,84 +648,103 @@ MultiIndex
- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
- Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)
+- Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when a non-existent key was given as input (:issue:`18853`)

I/O
^^^
- :func:`read_sas` no longer leaks resources on failure (:issue:`35566`)
-- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
-- In :meth:`read_csv` ``float_precision='round_trip'`` now handles ``decimal`` and ``thousands`` parameters (:issue:`35365`)
+- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
+- Bug in :meth:`read_csv` with ``float_precision='round_trip'`` which did not handle ``decimal`` and ``thousands`` parameters (:issue:`35365`)
- :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`)
- :meth:`to_csv` passes compression arguments for ``'gzip'`` always to ``gzip.GzipFile`` (:issue:`28103`)
- :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`)
-- :meth:`to_csv` and :meth:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`)
-- :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`)
+- :meth:`to_csv` and :meth:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, :issue:`32392`)
+- :meth:`DataFrame.to_pickle`, :meth:`Series.to_pickle`, and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, :issue:`29570`)
- Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entries in the List of Tables of a LaTeX document (:issue:`34360`)
- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`)
- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`)
- Bug in
:func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`)
-- Bug in :meth:`to_json` with ``lines=True`` and ``orient='records'`` the last line of the record is not appended with 'new line character' (:issue:`36888`)
+- Bug in :meth:`DataFrame.to_json` and :meth:`Series.to_json` with ``lines=True`` and ``orient='records'`` where the last line of the record was not terminated with a newline character (:issue:`36888`)
- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`)
- Bug in :meth:`DataFrame.to_html`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` ignoring the ``na_rep`` argument when ``float_format`` was also specified (:issue:`9046`, :issue:`13828`)
- Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`)
-- Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
-- Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
+- Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty DataFrame with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
+- Bug in :class:`HDFStore` was dropping timezone information when exporting a Series with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
- :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`)
- Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`)
- Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`)
-- :meth:`to_excel` and :meth:`to_markdown` support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`)
+- :meth:`DataFrame.to_excel`, :meth:`Series.to_excel`, :meth:`DataFrame.to_markdown`, and :meth:`Series.to_markdown` now support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`)
+- Bug in :func:`read_fwf` with ``skip_blank_lines=True`` was not skipping blank lines (:issue:`37758`)
+- :func:`read_json` with ``dtype=False`` now parses missing values as ``NaN`` instead of ``None`` (:issue:`28501`)
+- :func:`read_fwf` was inferring compression with ``compression=None``, which was not consistent with the other ``read_*`` functions (:issue:`37909`)
+- :meth:`DataFrame.to_html` was ignoring the ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`)
+- Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`)
+
+Period
^^^^^^
+
+- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Period` dtypes would be converted to object dtypes (:issue:`34871`)

Plotting
^^^^^^^^
- Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`)
-- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`)
+- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes caused a ``ValueError`` (:issue:`21003`)
+- Bug in :meth:`DataFrame.plot.bar` and :meth:`Series.plot.bar` where
ticks positions were assigned by value order instead of using the actual value for numeric or a smart ordering for string (:issue:`26186`, :issue:`11465`) - Twinned axes were losing their tick labels which should only happen to all but the last row or column of 'externally' shared axes (:issue:`33819`) -- Bug in :meth:`DataFrameGroupBy.boxplot` when ``subplots=False``, a KeyError would raise (:issue:`16748`) +- Bug in :meth:`Series.plot` and :meth:`DataFrame.plot` was throwing a :exc:`ValueError` when the Series or DataFrame was + indexed by a :class:`.TimedeltaIndex` with a fixed frequency and the x-axis lower limit was greater than the upper limit (:issue:`37454`) +- Bug in :meth:`.DataFrameGroupBy.boxplot` when ``subplots=False`` would raise a ``KeyError`` (:issue:`16748`) +- Bug in :meth:`DataFrame.plot` and :meth:`Series.plot` was overwriting matplotlib's shared y axes behaviour when no ``sharey`` parameter was passed (:issue:`37942`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) -- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) -- Bug in :meth:`DataFrame.resample(...)` that would throw a ``ValueError`` when resampling from "D" to "24H" over a transition into daylight savings time (DST) (:issue:`35219`) -- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising an ``TypeError`` (:issue:`35325`) -- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) +- Bug in :meth:`.DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) +- Bug in :meth:`.DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) +- Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`) +- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`) +- Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`) - Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. 
(:issue:`9959`)
-- Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
+- Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
-- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
-- Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`)
-- Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`)
+- Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
+- Bug in :meth:`.Rolling.sum` returned wrong values when dtypes were mixed between float and integer and ``axis=1`` (:issue:`20649`, :issue:`35596`)
+- Bug in :meth:`.Rolling.count` returned ``np.nan`` with :class:`~pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in the window (:issue:`35579`)
- Bug where :class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`)
-- Bug in :meth:`DataFrameGroupBy.ffill` and :meth:`DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)
-- Bug in :meth:`RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`)
-- Bug in :meth:`DataFrame.groupby.rolling` returning wrong values with partial centered window (:issue:`36040`).
-- Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`)
-- Bug in :meth:`Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`)
-- Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)
-- Bug in :meth:`df.groupby(..).quantile() ` and :meth:`df.resample(..).quantile() ` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
-- Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
+- Bug in :meth:`.DataFrameGroupBy.ffill` and :meth:`.DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)
+- Bug in :meth:`.RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`)
+- Bug in :meth:`.DataFrameGroupBy.rolling` returning wrong values with partial centered window (:issue:`36040`)
+- Bug in :meth:`.DataFrameGroupBy.rolling` returned wrong values with a time-aware window containing ``NaN``.
Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`)
+- Bug in :meth:`.Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`)
+- Using :meth:`.Rolling.var` instead of :meth:`.Rolling.std` avoids numerical issues for :meth:`.Rolling.corr` when :meth:`.Rolling.var` is still within floating point precision while :meth:`.Rolling.std` is not (:issue:`31286`)
+- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.Resampler.quantile` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
+- Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` returned wrong values for :class:`.BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
- Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
-- Bug in :meth:`DataFrameGroupBy.head`, :meth:`DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`)
-- Bug in :meth:`DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`)
+- Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`)
+- Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`)

Reshaping
^^^^^^^^^
- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
-- Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`)
+- Bug in :func:`concat` and :class:`DataFrame` constructor where input index names are not preserved in some cases (:issue:`13475`)
- Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`)
+- Bug in :meth:`DataFrame.stack` where stacking an empty DataFrame would raise an error (:issue:`36113`). Now returning an empty Series with an empty MultiIndex.
+- Bug in :meth:`Series.unstack`. Unstacking a Series with a single level of Index now raises a ``ValueError``.
(:issue:`36113`)
- Bug in :meth:`DataFrame.agg` with ``func={'name':}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`)
-- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`)
-- Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns both multiindexed (:issue:`36360`)
+- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was a dictionary (:issue:`35811`)
+- Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns are both multiindexed (:issue:`36360`)
- Bug in :meth:`DataFrame.pivot` modified ``index`` argument when ``columns`` was passed but ``values`` was not (:issue:`37635`)
-- Bug in :func:`join` returned a non deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`)
-- Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
-- Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`)
+- Bug in :meth:`DataFrame.join` returned a non-deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`)
+- Bug in :meth:`DataFrame.combine_first` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
+- Fixed regression in :func:`merge` on merging :class:`.DatetimeIndex` with empty DataFrame (:issue:`36895`)
- Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
+- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of the two inputs had a non-unique index (:issue:`36263`)
+- Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`)

Sparse
^^^^^^

@@ -627,24 +755,26 @@ Sparse
ExtensionArray
^^^^^^^^^^^^^^
-- Fixed Bug where :class:`DataFrame` column set to scalar extension type via a dict instantion was considered an object type rather than the extension type (:issue:`35965`)
-- Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`284881`)
-- Fixed bug when applying a NumPy ufunc with multiple outputs to a :class:`pandas.arrays.IntegerArray` returning None (:issue:`36913`)
-- Fixed an inconsistency in :class:`PeriodArray`'s ``__init__`` signature to those of :class:`DatetimeArray` and :class:`TimedeltaArray` (:issue:`37289`)
-- Reductions for :class:`BooleanArray`, :class:`Categorical`, :class:`DatetimeArray`, :class:`FloatingArray`, :class:`IntegerArray`, :class:`PeriodArray`, :class:`TimedeltaArray`, and :class:`PandasArray` are now keyword-only methods (:issue:`37541`)
+- Fixed bug where :class:`DataFrame` column set to scalar extension type via a dict instantiation was considered an object type rather than the extension type (:issue:`35965`)
+- Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`28488`)
+- Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`)
+- Fixed an inconsistency between :class:`.PeriodArray`'s ``__init__`` signature and those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`)
+- Reductions for
:class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) Other ^^^^^ -- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising an ``AssertionError`` instead of a ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) +- Fixed metadata propagation in :meth:`Series.abs` and ufuncs called on Series and DataFrames (:issue:`28283`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly casting from ``PeriodDtype`` to object dtype (:issue:`34871`) - Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`) -- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`) (:issue:`37381`) -- Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) -- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) -- Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`). -- Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) +- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`, :issue:`37381`) +- Fixed metadata propagation when selecting columns with ``DataFrame.__getitem__`` (:issue:`28283`) +- Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`) +- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`) +- Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`) +- Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`) .. --------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml index 77a9c5fd4822d..b99b856187fb6 100644 --- a/environment.yml +++ b/environment.yml @@ -12,6 +12,9 @@ dependencies: - asv # building + # The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms. 
+ - c-compiler + - cxx-compiler - cython>=0.29.21 # code checks diff --git a/pandas/__init__.py b/pandas/__init__.py index cf7ae2505b72d..cc5d835a52833 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -33,7 +33,7 @@ raise ImportError( f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " - "'python setup.py build_ext --inplace --force' to build the C extensions first." + "'python setup.py build_ext --force' to build the C extensions first." ) from e from pandas._config import ( @@ -189,25 +189,10 @@ # GH 27101 -# TODO: remove Panel compat in 1.0 def __getattr__(name): import warnings - if name == "Panel": - - warnings.warn( - "The Panel class is removed from pandas. Accessing it " - "from the top-level namespace will also be removed in the next version", - FutureWarning, - stacklevel=2, - ) - - class Panel: - pass - - return Panel - - elif name == "datetime": + if name == "datetime": warnings.warn( "The pandas.datetime class is deprecated " "and will be removed from pandas in a future version. " diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 438d9fa625737..24156c88f0d76 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -903,13 +903,12 @@ def group_last(rank_t[:, :] out, ndarray[int64_t, ndim=2] nobs bint runtime_error = False - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -939,7 +938,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -961,7 +960,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -986,7 +985,8 @@ def group_last(rank_t[:, :] out, def group_nth(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, - const int64_t[:] labels, int64_t rank=1 + const int64_t[:] labels, + int64_t min_count=-1, int64_t rank=1 ): """ Only aggregates on axis=0 @@ -1003,6 +1003,7 @@ def group_nth(rank_t[:, :] out, if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -1033,7 +1034,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -1057,7 +1058,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -1294,13 +1295,12 @@ def group_max(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) maxx = np.empty_like(out) @@ 
-1337,11 +1337,12 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break else: + out[i, j] = nan_val else: out[i, j] = maxx[i, j] @@ -1369,13 +1370,12 @@ def group_min(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) minx = np.empty_like(out) @@ -1411,7 +1411,7 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 75c273b35ee7d..7b630c264753f 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,13 +1,27 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( + float32_t, float64_t, + int8_t, + int16_t, + int32_t, int64_t, + kh_float32_t, kh_float64_t, + kh_int8_t, + kh_int16_t, + kh_int32_t, kh_int64_t, kh_pymap_t, kh_str_t, + kh_uint8_t, + kh_uint16_t, + kh_uint32_t, kh_uint64_t, + uint8_t, + uint16_t, + uint32_t, uint64_t, ) @@ -28,12 +42,54 @@ cdef class Int64HashTable(HashTable): cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) +cdef class UInt32HashTable(HashTable): + cdef kh_uint32_t *table + + cpdef get_item(self, uint32_t val) + cpdef set_item(self, uint32_t key, Py_ssize_t val) + +cdef class Int32HashTable(HashTable): + cdef kh_int32_t *table + + cpdef get_item(self, int32_t val) + cpdef set_item(self, int32_t key, Py_ssize_t val) + +cdef class UInt16HashTable(HashTable): + cdef kh_uint16_t *table + + cpdef get_item(self, uint16_t val) + cpdef set_item(self, uint16_t key, Py_ssize_t val) + +cdef class Int16HashTable(HashTable): + cdef kh_int16_t *table + + cpdef get_item(self, int16_t val) + cpdef set_item(self, int16_t key, Py_ssize_t val) + +cdef class UInt8HashTable(HashTable): + cdef kh_uint8_t *table + + cpdef get_item(self, uint8_t val) + cpdef set_item(self, uint8_t key, Py_ssize_t val) + +cdef class Int8HashTable(HashTable): + cdef kh_int8_t *table + + cpdef get_item(self, int8_t val) + cpdef set_item(self, int8_t key, Py_ssize_t val) + cdef class Float64HashTable(HashTable): cdef kh_float64_t *table cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) +cdef class Float32HashTable(HashTable): + cdef kh_float32_t *table + + cpdef get_item(self, float32_t val) + cpdef set_item(self, float32_t key, Py_ssize_t val) + cdef class PyObjectHashTable(HashTable): cdef kh_pymap_t *table diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 5a0cddb0af197..963fddd4d5af9 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -13,48 +13,14 @@ cnp.import_array() from pandas._libs cimport util -from pandas._libs.khash cimport ( - kh_destroy_float64, - kh_destroy_int64, - kh_destroy_pymap, - kh_destroy_str, - kh_destroy_uint64, - kh_exist_float64, - kh_exist_int64, - kh_exist_pymap, - kh_exist_str, - kh_exist_uint64, - kh_float64_t, - kh_get_float64, - kh_get_int64, - kh_get_pymap, - kh_get_str, - kh_get_strbox, - kh_get_uint64, - kh_init_float64, - kh_init_int64, - kh_init_pymap, - 
kh_init_str, - kh_init_strbox, - kh_init_uint64, - kh_int64_t, - kh_put_float64, - kh_put_int64, - kh_put_pymap, - kh_put_str, - kh_put_strbox, - kh_put_uint64, - kh_resize_float64, - kh_resize_int64, - kh_resize_pymap, - kh_resize_str, - kh_resize_uint64, - kh_str_t, - khiter_t, -) +from pandas._libs.khash cimport KHASH_TRACE_DOMAIN, kh_str_t, khiter_t from pandas._libs.missing cimport checknull +def get_hashtable_trace_domain(): + return KHASH_TRACE_DOMAIN + + cdef int64_t NPY_NAT = util.get_nat() SIZE_HINT_LIMIT = (1 << 20) + 7 diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index da91fa69b0dec..b582ed1533a8e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,6 +5,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ +{{py: + +# name +cimported_types = ['float32', + 'float64', + 'int8', + 'int16', + 'int32', + 'int64', + 'pymap', + 'str', + 'strbox', + 'uint8', + 'uint16', + 'uint32', + 'uint64'] +}} + +{{for name in cimported_types}} +from pandas._libs.khash cimport ( + kh_destroy_{{name}}, + kh_exist_{{name}}, + kh_get_{{name}}, + kh_init_{{name}}, + kh_put_{{name}}, + kh_resize_{{name}}, +) +{{endfor}} + # ---------------------------------------------------------------------- # VectorData # ---------------------------------------------------------------------- @@ -20,9 +49,16 @@ from pandas._libs.missing cimport C_NA # for uniques in hashtables) dtypes = [('Float64', 'float64', 'float64_t'), + ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32_t'), + ('Int16', 'int16', 'int16_t'), + ('Int8', 'int8', 'int8_t'), ('String', 'string', 'char *'), - ('UInt64', 'uint64', 'uint64_t')] + ('UInt64', 'uint64', 'uint64_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('UInt8', 'uint8', 'uint8_t')] }} {{for name, dtype, c_type in dtypes}} @@ -49,8 +85,15 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData + Int32VectorData + Int16VectorData + Int8VectorData UInt64VectorData + UInt32VectorData + UInt16VectorData + UInt8VectorData Float64VectorData + Float32VectorData StringVectorData cdef inline bint needs_resize(vector_data *data) nogil: @@ -65,7 +108,14 @@ cdef inline bint needs_resize(vector_data *data) nogil: # name, dtype, c_type dtypes = [('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), - ('Int64', 'int64', 'int64_t')] + ('Int64', 'int64', 'int64_t'), + ('Float32', 'float32', 'float32_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('Int32', 'int32', 'int32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('Int16', 'int16', 'int16_t'), + ('UInt8', 'uint8', 'uint8_t'), + ('Int8', 'int8', 'int8_t')] }} @@ -253,15 +303,22 @@ cdef class HashTable: {{py: -# name, dtype, float_group, default_na_value -dtypes = [('Float64', 'float64', True, 'np.nan'), - ('UInt64', 'uint64', False, 0), - ('Int64', 'int64', False, 'NPY_NAT')] +# name, dtype, float_group +dtypes = [('Float64', 'float64', True), + ('UInt64', 'uint64', False), + ('Int64', 'int64', False), + ('Float32', 'float32', True), + ('UInt32', 'uint32', False), + ('Int32', 'int32', False), + ('UInt16', 'uint16', False), + ('Int16', 'int16', False), + ('UInt8', 'uint8', False), + ('Int8', 'int8', False)] }} -{{for name, dtype, float_group, default_na_value in dtypes}} +{{for name, dtype, float_group in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -287,9 
+344,11 @@ cdef class {{name}}HashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs cpdef get_item(self, {{dtype}}_t val): cdef: @@ -430,7 +489,7 @@ cdef class {{name}}HashTable(HashTable): # which is only used if it's *specified*. na_value2 = <{{dtype}}_t>na_value else: - na_value2 = {{default_na_value}} + na_value2 = 0 with nogil: for i in range(n): @@ -612,10 +671,11 @@ cdef class StringHashTable(HashTable): self.table = NULL def sizeof(self, deep=False): - """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof(char *) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof(char *) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs cpdef get_item(self, str val): cdef: @@ -937,9 +997,11 @@ cdef class PyObjectHashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof(PyObject *) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs cpdef get_item(self, object val): cdef: diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 4a466ada765ca..7c5afa4ff6b27 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -8,9 +8,16 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # dtype, ttype, c_type dtypes = [('float64', 'float64', 'float64_t'), + ('float32', 'float32', 'float32_t'), ('uint64', 'uint64', 'uint64_t'), + ('uint32', 'uint32', 'uint32_t'), + ('uint16', 'uint16', 'uint16_t'), + ('uint8', 'uint8', 'uint8_t'), ('object', 'pymap', 'object'), - ('int64', 'int64', 'int64_t')] + ('int64', 'int64', 'int64_t'), + ('int32', 'int32', 'int32_t'), + ('int16', 'int16', 'int16_t'), + ('int8', 'int8', 'int8_t')] }} @@ -54,7 +61,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, for i in range(n): val = values[i] - {{if dtype == 'float64'}} + {{if dtype == 'float64' or dtype == 'float32'}} if val == val or not dropna: {{else}} if True: @@ -275,8 +282,15 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): # dtype, ctype, table_type, npy_dtype dtypes = [('float64', 'float64_t', 'float64', 'float64'), + ('float32', 'float32_t', 'float32', 'float32'), ('int64', 'int64_t', 'int64', 'int64'), + ('int32', 'int32_t', 'int32', 'int32'), + ('int16', 'int16_t', 'int16', 'int16'), + ('int8', 'int8_t', 'int8', 'int8'), ('uint64', 'uint64_t', 'uint64', 'uint64'), + ('uint32', 'uint32_t', 'uint32', 'uint32'), + ('uint16', 'uint16_t', 'uint16', 'uint16'), + ('uint8', 'uint8_t', 'uint8', 'uint8'), ('object', 'object', 'pymap', 'object_')] }} diff --git 
a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index c7b67667bda17..69680e472bbc2 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -10,21 +10,21 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype, hashtable_name -dtypes = [('Float64', 'float64', 'Float64'), - ('Float32', 'float32', 'Float64'), - ('Int64', 'int64', 'Int64'), - ('Int32', 'int32', 'Int64'), - ('Int16', 'int16', 'Int64'), - ('Int8', 'int8', 'Int64'), - ('UInt64', 'uint64', 'UInt64'), - ('UInt32', 'uint32', 'UInt64'), - ('UInt16', 'uint16', 'UInt64'), - ('UInt8', 'uint8', 'UInt64'), +# name, dtype +dtypes = [('Float64', 'float64'), + ('Float32', 'float32'), + ('Int64', 'int64'), + ('Int32', 'int32'), + ('Int16', 'int16'), + ('Int8', 'int8'), + ('UInt64', 'uint64'), + ('UInt32', 'uint32'), + ('UInt16', 'uint16'), + ('UInt8', 'uint8'), ] }} -{{for name, dtype, hashtable_name in dtypes}} +{{for name, dtype in dtypes}} cdef class {{name}}Engine(IndexEngine): @@ -32,7 +32,7 @@ cdef class {{name}}Engine(IndexEngine): # returns an ndarray with dtype {{dtype}}_t cdef _make_hash_table(self, Py_ssize_t n): - return _hash.{{hashtable_name}}HashTable(n) + return _hash.{{name}}HashTable(n) {{if name not in {'Float64', 'Float32'} }} cdef _check_type(self, object val): @@ -41,9 +41,7 @@ cdef class {{name}}Engine(IndexEngine): {{endif}} cdef void _call_map_locations(self, values): - # self.mapping is of type {{hashtable_name}}HashTable, - # so convert dtype of values - self.mapping.map_locations(algos.ensure_{{hashtable_name.lower()}}(values)) + self.mapping.map_locations(algos.ensure_{{name.lower()}}(values)) cdef _maybe_get_bool_indexer(self, object val): cdef: diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index f8bcbcfb158b5..10becdce5d6dd 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -179,7 +179,8 @@ cdef class IntervalMixin: return (self.right == self.left) & (self.closed != 'both') def _check_closed_matches(self, other, name='other'): - """Check if the closed attribute of `other` matches. + """ + Check if the closed attribute of `other` matches. Note that 'left' and 'right' are considered different from 'both'. 
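The per-dtype ``sizeof`` accounting introduced above in ``hashtable_class_helper.pxi.in`` (struct overhead + flags words + key/value pairs) can be sanity-checked with a small calculation. A minimal sketch, not part of the patch; it assumes a 64-bit build (8-byte pointers and ``Py_ssize_t``), and the helper name ``khash_sizeof`` is hypothetical:

    # Re-implements the estimate used by the patched HashTable.sizeof() methods.
    def khash_sizeof(n_buckets: int, key_size: int) -> int:
        uint32 = 4                                    # sizeof(uint32_t)
        ptr = 8                                       # sizeof(uint32_t *), assumed 64-bit
        ssize = 8                                     # sizeof(Py_ssize_t), assumed 64-bit
        overhead = 4 * uint32 + 3 * ptr               # 4 counters + 3 pointers in the khash struct
        for_flags = max(1, n_buckets >> 5) * uint32   # flags array, mirroring the patch
        for_pairs = n_buckets * (key_size + ssize)    # keys + vals arrays
        return overhead + for_flags + for_pairs

    # e.g. an Int64HashTable (8-byte keys) with 1024 buckets:
    # khash_sizeof(1024, key_size=8) == 40 + 128 + 16384 == 16552 bytes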
diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 1bb3a158b4b1a..0d0c5ae058b21 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,8 +1,21 @@ from cpython.object cimport PyObject -from numpy cimport float64_t, int32_t, int64_t, uint32_t, uint64_t +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) cdef extern from "khash_python.h": + const int KHASH_TRACE_DOMAIN + ctypedef uint32_t khint_t ctypedef khint_t khiter_t @@ -67,72 +80,6 @@ cdef extern from "khash_python.h": void kh_destroy_str_starts(kh_str_starts_t*) nogil void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil - ctypedef struct kh_int64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int64_t *keys - size_t *vals - - kh_int64_t* kh_init_int64() nogil - void kh_destroy_int64(kh_int64_t*) nogil - void kh_clear_int64(kh_int64_t*) nogil - khint_t kh_get_int64(kh_int64_t*, int64_t) nogil - void kh_resize_int64(kh_int64_t*, khint_t) nogil - khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil - void kh_del_int64(kh_int64_t*, khint_t) nogil - - bint kh_exist_int64(kh_int64_t*, khiter_t) nogil - - ctypedef uint64_t khuint64_t - - ctypedef struct kh_uint64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - khuint64_t *keys - size_t *vals - - kh_uint64_t* kh_init_uint64() nogil - void kh_destroy_uint64(kh_uint64_t*) nogil - void kh_clear_uint64(kh_uint64_t*) nogil - khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil - void kh_resize_uint64(kh_uint64_t*, khint_t) nogil - khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil - void kh_del_uint64(kh_uint64_t*, khint_t) nogil - - bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil - - ctypedef struct kh_float64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - float64_t *keys - size_t *vals - - kh_float64_t* kh_init_float64() nogil - void kh_destroy_float64(kh_float64_t*) nogil - void kh_clear_float64(kh_float64_t*) nogil - khint_t kh_get_float64(kh_float64_t*, float64_t) nogil - void kh_resize_float64(kh_float64_t*, khint_t) nogil - khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil - void kh_del_float64(kh_float64_t*, khint_t) nogil - - bint kh_exist_float64(kh_float64_t*, khiter_t) nogil - - ctypedef struct kh_int32_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int32_t *keys - size_t *vals - - kh_int32_t* kh_init_int32() nogil - void kh_destroy_int32(kh_int32_t*) nogil - void kh_clear_int32(kh_int32_t*) nogil - khint_t kh_get_int32(kh_int32_t*, int32_t) nogil - void kh_resize_int32(kh_int32_t*, khint_t) nogil - khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil - void kh_del_int32(kh_int32_t*, khint_t) nogil - - bint kh_exist_int32(kh_int32_t*, khiter_t) nogil - # sweep factorize ctypedef struct kh_strbox_t: @@ -150,3 +97,5 @@ cdef extern from "khash_python.h": void kh_del_strbox(kh_strbox_t*, khint_t) nogil bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil + +include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in new file mode 100644 index 0000000000000..db8d3e0b19417 --- /dev/null +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -0,0 +1,42 @@ +""" +Template for wrapping khash-tables for each primitive `dtype` + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +{{py: + +# name, c_type +primitive_types = 
[('int64', 'int64_t'), + ('uint64', 'uint64_t'), + ('float64', 'float64_t'), + ('int32', 'int32_t'), + ('uint32', 'uint32_t'), + ('float32', 'float32_t'), + ('int16', 'int16_t'), + ('uint16', 'uint16_t'), + ('int8', 'int8_t'), + ('uint8', 'uint8_t'), + ] +}} + +{{for name, c_type in primitive_types}} + +cdef extern from "khash_python.h": + ctypedef struct kh_{{name}}_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + {{c_type}} *keys + size_t *vals + + kh_{{name}}_t* kh_init_{{name}}() nogil + void kh_destroy_{{name}}(kh_{{name}}_t*) nogil + void kh_clear_{{name}}(kh_{{name}}_t*) nogil + khint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil + void kh_resize_{{name}}(kh_{{name}}_t*, khint_t) nogil + khint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil + void kh_del_{{name}}(kh_{{name}}_t*, khint_t) nogil + + bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil + +{{endfor}} diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0b0334d52c1e9..1ca18bae4e2c4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -118,6 +118,8 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t: def is_scalar(val: object) -> bool: """ + Return True if given object is scalar. + Parameters ---------- val : object @@ -634,7 +636,7 @@ cpdef ndarray[object] ensure_string_array( ---------- arr : array-like The values to be converted to str, if needed. - na_value : Any + na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. @@ -927,6 +929,8 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, def is_float(obj: object) -> bool: """ + Return True if given object is float. + Returns ------- bool @@ -936,6 +940,8 @@ def is_float(obj: object) -> bool: def is_integer(obj: object) -> bool: """ + Return True if given object is integer. + Returns ------- bool @@ -945,6 +951,8 @@ def is_integer(obj: object) -> bool: def is_bool(obj: object) -> bool: """ + Return True if given object is boolean. + Returns ------- bool @@ -954,6 +962,8 @@ def is_bool(obj: object) -> bool: def is_complex(obj: object) -> bool: """ + Return True if given object is complex. + Returns ------- bool @@ -971,7 +981,7 @@ cpdef bint is_interval(object obj): def is_period(val: object) -> bool: """ - Return a boolean if this is a Period object. + Return True if given object is Period. Returns ------- diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 9459cd297c758..ad6329c588bbe 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -44,7 +44,9 @@ cdef class _BaseGrouper: Slider islider, Slider vslider): if cached_typ is None: cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name) + cached_typ = self.typ( + vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name + ) else: # See the comment in indexes/base.py about _index_data. 
# We need this for EA-backed indexes that have a reference diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 61a4e80ea8cbc..bb56b2fe2d145 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -115,6 +115,24 @@ int main() { #include "../inline_helper.h" +// hooks for memory allocator, C-runtime allocator used per default +#ifndef KHASH_MALLOC +#define KHASH_MALLOC malloc +#endif + +#ifndef KHASH_REALLOC +#define KHASH_REALLOC realloc +#endif + +#ifndef KHASH_CALLOC +#define KHASH_CALLOC calloc +#endif + +#ifndef KHASH_FREE +#define KHASH_FREE free +#endif + + #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu @@ -122,14 +140,23 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khuint64_t; -typedef signed long khint64_t; +typedef unsigned long khint64_t; #else -typedef unsigned long long khuint64_t; -typedef signed long long khint64_t; +typedef unsigned long long khint64_t; +#endif + +#if UINT_MAX == 0xffffu +typedef unsigned int khint16_t; +#elif USHRT_MAX == 0xffffu +typedef unsigned short khint16_t; +#endif + +#if UCHAR_MAX == 0xffu +typedef unsigned char khint8_t; #endif typedef double khfloat64_t; +typedef float khfloat32_t; typedef khint32_t khint_t; typedef khint_t khiter_t; @@ -256,14 +283,14 @@ static const double __ac_HASH_UPPER = 0.77; khval_t *vals; \ } kh_##name##_t; \ SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - free(h->keys); free(h->flags); \ - free(h->vals); \ - free(h); \ + KHASH_FREE(h->keys); KHASH_FREE(h->flags); \ + KHASH_FREE(h->vals); \ + KHASH_FREE(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ @@ -296,11 +323,11 @@ static const double __ac_HASH_UPPER = 0.77; if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + new_flags = (khint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \ } /* otherwise shrink */ \ } \ } \ @@ -333,10 +360,10 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - free(h->flags); /* free the working space */ \ + KHASH_FREE(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ @@ -588,15 +615,25 
@@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ + +// we implicitly convert signed int to unsigned int, thus potential overflows +// for operations (<<,*,+) don't trigger undefined behavior, also >>-operator +// is implementation defined for signed ints if sign-bit is set. +// because we never really "get" the keys, there will be no convertion from +// unsigend int to (signed) int (which would be implementation defined behavior) +// this holds also for 64-, 16- and 8-bit integers #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -607,11 +644,34 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +/*! @function + @abstract Instantiate a hash map containing 16bit-integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +#define KHASH_MAP_INIT_UINT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 8bit-integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +#define KHASH_MAP_INIT_UINT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + + typedef const char *kh_cstr_t; /*! 
@function @@ -634,12 +694,23 @@ typedef const char *kh_cstr_t; #define kh_exist_float64(h, k) (kh_exist(h, k)) #define kh_exist_uint64(h, k) (kh_exist(h, k)) #define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_float32(h, k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) +#define kh_exist_uint32(h, k) (kh_exist(h, k)) +#define kh_exist_int16(h, k) (kh_exist(h, k)) +#define kh_exist_uint16(h, k) (kh_exist(h, k)) +#define kh_exist_int8(h, k) (kh_exist(h, k)) +#define kh_exist_uint8(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_UINT(uint32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_UINT64(uint64, size_t) +KHASH_MAP_INIT_INT16(int16, size_t) +KHASH_MAP_INIT_UINT16(uint16, size_t) +KHASH_MAP_INIT_INT8(int8, size_t) +KHASH_MAP_INIT_UINT8(uint8, size_t) #endif /* __AC_KHASH_H */ diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index aebc229abddd2..8e4e61b4f3077 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -1,6 +1,59 @@ #include #include +// khash should report usage to tracemalloc +#if PY_VERSION_HEX >= 0x03060000 +#include +#if PY_VERSION_HEX < 0x03070000 +#define PyTraceMalloc_Track _PyTraceMalloc_Track +#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack +#endif +#else +#define PyTraceMalloc_Track(...) +#define PyTraceMalloc_Untrack(...) +#endif + + +static const int KHASH_TRACE_DOMAIN = 424242; +void *traced_malloc(size_t size){ + void * ptr = malloc(size); + if(ptr!=NULL){ + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); + } + return ptr; +} + +void *traced_calloc(size_t num, size_t size){ + void * ptr = calloc(num, size); + if(ptr!=NULL){ + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size); + } + return ptr; +} + +void *traced_realloc(void* old_ptr, size_t size){ + void * ptr = realloc(old_ptr, size); + if(ptr!=NULL){ + if(old_ptr != ptr){ + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); + } + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); + } + return ptr; +} + +void traced_free(void* ptr){ + if(ptr!=NULL){ + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr); + } + free(ptr); +} + + +#define KHASH_MALLOC traced_malloc +#define KHASH_REALLOC traced_realloc +#define KHASH_CALLOC traced_calloc +#define KHASH_FREE traced_free #include "khash.h" // Previously we were using the built in cpython hash function for doubles @@ -16,6 +69,11 @@ // GH 13436 showed that _Py_HashDouble doesn't work well with khash // GH 28303 showed, that the simple xoring-version isn't good enough // See GH 36729 for evaluation of the currently used murmur2-hash version +// An interesting alternative to expensive murmur2-hash would be to change +// the probing strategy and use e.g. 
the probing strategy from CPython's +// implementation of dicts, which shines for smaller sizes but is more +// predisposed to superlinear running times (see GH 36729 for comparison) + khint64_t PANDAS_INLINE asint64(double key) { khint64_t val; @@ -23,6 +81,12 @@ khint64_t PANDAS_INLINE asint64(double key) { return val; } +khint32_t PANDAS_INLINE asint32(float key) { + khint32_t val; + memcpy(&val, &key, sizeof(float)); + return val; +} + #define ZERO_HASH 0 #define NAN_HASH 0 @@ -39,13 +103,31 @@ khint32_t PANDAS_INLINE kh_float64_hash_func(double val){ return murmur2_64to32(as_int); } -#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) +khint32_t PANDAS_INLINE kh_float32_hash_func(float val){ + // 0.0 and -0.0 should have the same hash: + if (val == 0.0f){ + return ZERO_HASH; + } + // all nans should have the same hash: + if ( val!=val ){ + return NAN_HASH; + } + khint32_t as_int = asint32(val); + return murmur2_32to32(as_int); +} + +#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ - KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal) KHASH_MAP_INIT_FLOAT64(float64, size_t) +#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ + KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal) + +KHASH_MAP_INIT_FLOAT32(float32, size_t) + int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); @@ -99,7 +181,7 @@ typedef struct { typedef kh_str_starts_t* p_kh_str_starts_t; p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) { - kh_str_starts_t *result = (kh_str_starts_t*)calloc(1, sizeof(kh_str_starts_t)); + kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t)); result->table = kh_init_str(); return result; } @@ -122,7 +204,7 @@ khint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { kh_destroy_str(table->table); - free(table); + KHASH_FREE(table); } void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khint_t val) { diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index dbd094905cf24..1339dee954603 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1403,6 +1403,19 @@ cdef class BusinessDay(BusinessMixin): cdef class BusinessHour(BusinessMixin): """ DateOffset subclass representing possibly n business hours. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + start : str, default "09:00" + Start time of your custom business hour in 24h format. + end : str, default: "17:00" + End time of your custom business hour in 24h format. """ _prefix = "BH" @@ -3251,6 +3264,19 @@ cdef class CustomBusinessDay(BusinessDay): cdef class CustomBusinessHour(BusinessHour): """ DateOffset subclass representing possibly n custom business days. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. 
+ weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + start : str, default "09:00" + Start time of your custom business hour in 24h format. + end : str, default: "17:00" + End time of your custom business hour in 24h format. """ _prefix = "CBH" diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 29e8c58055f9e..e4b19d844dcab 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -405,9 +405,11 @@ cdef inline int64_t parse_timedelta_string(str ts) except? -1: m = 10**(3 -len(frac)) * 1000 * 1000 elif len(frac) > 3 and len(frac) <= 6: m = 10**(6 -len(frac)) * 1000 - else: + elif len(frac) > 6 and len(frac) <= 9: m = 10**(9 -len(frac)) - + else: + m = 1 + frac = frac[:9] r = int(''.join(frac)) * m result += timedelta_as_neg(r, neg) @@ -1143,6 +1145,9 @@ class Timedelta(_Timedelta): Notes ----- The ``.value`` attribute is always in ns. + + If the precision is higher than nanoseconds, the precision of the duration is + truncated to nanoseconds. """ def __new__(cls, object value=_no_input, unit=None, **kwargs): diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index f08a86b1262e6..1049682af08e8 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -426,7 +426,7 @@ def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): int64 ndarray of converted """ cdef: - int64_t[:] converted + const int64_t[:] converted if len(vals) == 0: return np.array([], dtype=np.int64) @@ -437,7 +437,7 @@ def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): @cython.boundscheck(False) @cython.wraparound(False) -cdef int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): +cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): """ Convert the given values (in i8) either to UTC or from UTC. @@ -459,7 +459,7 @@ cdef int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): str typ if is_utc(tz): - converted = vals.copy() + return vals elif is_tzlocal(tz): converted = np.empty(n, dtype=np.int64) for i in range(n): diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 4de7a5860c465..54a09a6d2ede7 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -136,7 +136,7 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, sum_x[0] = t -def roll_sum(ndarray[float64_t] values, ndarray[int64_t] start, +def roll_sum(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0 @@ -240,7 +240,7 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, neg_ct[0] = neg_ct[0] - 1 -def roll_mean(ndarray[float64_t] values, ndarray[int64_t] start, +def roll_mean(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0 @@ -361,7 +361,7 @@ cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x, ssqdm_x[0] = 0 -def roll_var(ndarray[float64_t] values, ndarray[int64_t] start, +def roll_var(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int ddof=1): """ Numerically stable implementation using Welford's method. 
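The Notes entry added to the Timedelta docstring above ("truncated to nanoseconds") pairs with the parse_timedelta_string change: fractional parts longer than nine digits are now truncated to nanosecond precision. A minimal sketch of the intended behaviour, assuming this patch is applied (the exact input string is only an illustration):

    import pandas as pd

    # 12 fractional digits of a second: only the first 9 (the nanosecond part)
    # are kept; the remaining digits are silently truncated.
    td = pd.Timedelta("0.123456789123 seconds")
    print(td.value)  # expected: 123456789 nanoseconds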
@@ -772,7 +772,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, # Rolling median, min, max -def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, +def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): # GH 32865. win argument kept for compatibility cdef: @@ -1032,7 +1032,7 @@ interpolation_types = { } -def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, +def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, float64_t quantile, str interpolation): """ @@ -1496,8 +1496,8 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights, # ---------------------------------------------------------------------- # Exponentially weighted moving average -def ewma_time(const float64_t[:] vals, int minp, ndarray[int64_t] times, - int64_t halflife): +def ewma_time(const float64_t[:] vals, int64_t[:] start, int64_t[:] end, + int minp, ndarray[int64_t] times, int64_t halflife): """ Compute exponentially-weighted moving average using halflife and time distances. @@ -1505,6 +1505,8 @@ def ewma_time(const float64_t[:] vals, int minp, ndarray[int64_t] times, Parameters ---------- vals : ndarray[float_64] + start: ndarray[int_64] + end: ndarray[int_64] minp : int times : ndarray[int64] halflife : int64 @@ -1552,17 +1554,20 @@ def ewma_time(const float64_t[:] vals, int minp, ndarray[int64_t] times, return output -def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp): +def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp, + float64_t com, bint adjust, bint ignore_na): """ Compute exponentially-weighted moving average using center-of-mass. Parameters ---------- vals : ndarray (float64 type) + start: ndarray (int64 type) + end: ndarray (int64 type) + minp : int com : float64 adjust : int ignore_na : bool - minp : int Returns ------- @@ -1620,19 +1625,21 @@ def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp # Exponentially weighted moving covariance -def ewmcov(float64_t[:] input_x, float64_t[:] input_y, - float64_t com, bint adjust, bint ignore_na, int minp, bint bias): +def ewmcov(float64_t[:] input_x, int64_t[:] start, int64_t[:] end, int minp, + float64_t[:] input_y, float64_t com, bint adjust, bint ignore_na, bint bias): """ Compute exponentially-weighted moving variance using center-of-mass. Parameters ---------- input_x : ndarray (float64 type) + start: ndarray (int64 type) + end: ndarray (int64 type) + minp : int input_y : ndarray (float64 type) com : float64 adjust : int ignore_na : bool - minp : int bias : int Returns diff --git a/pandas/_testing.py b/pandas/_testing.py index 5dcd1247e52ba..68371b782aac2 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -739,22 +739,29 @@ def assert_index_equal( obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message. 
+ + Examples + -------- + >>> from pandas.testing import assert_index_equal + >>> a = pd.Index([1, 2, 3]) + >>> b = pd.Index([1, 2, 3]) + >>> assert_index_equal(a, b) """ __tracebackhide__ = True - def _check_types(l, r, obj="Index"): + def _check_types(left, right, obj="Index"): if exact: - assert_class_equal(l, r, exact=exact, obj=obj) + assert_class_equal(left, right, exact=exact, obj=obj) # Skip exact dtype checking when `check_categorical` is False if check_categorical: - assert_attr_equal("dtype", l, r, obj=obj) + assert_attr_equal("dtype", left, right, obj=obj) # allow string-like to have different inferred_types - if l.inferred_type in ("string"): - assert r.inferred_type in ("string") + if left.inferred_type in ("string"): + assert right.inferred_type in ("string") else: - assert_attr_equal("inferred_type", l, r, obj=obj) + assert_attr_equal("inferred_type", left, right, obj=obj) def _get_ilevel_values(index, level): # accept level number only @@ -1140,9 +1147,9 @@ def _raise(left, right, err_msg): ) diff = 0 - for l, r in zip(left, right): + for left_arr, right_arr in zip(left, right): # count up differences - if not array_equivalent(l, r, strict_nan=strict_nan): + if not array_equivalent(left_arr, right_arr, strict_nan=strict_nan): diff += 1 diff = diff * 100.0 / left.size @@ -1205,6 +1212,13 @@ def assert_extension_array_equal( Missing values are checked separately from valid values. A mask of missing values is computed for each and checked to match. The remaining all-valid values are cast to object dtype and checked. + + Examples + -------- + >>> from pandas.testing import assert_extension_array_equal + >>> a = pd.Series([1, 2, 3, 4]) + >>> b, c = a.array, a.array + >>> assert_extension_array_equal(b, c) """ if check_less_precise is not no_default: warnings.warn( @@ -1334,6 +1348,13 @@ def assert_series_equal( obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. + + Examples + -------- + >>> from pandas.testing import assert_series_equal + >>> a = pd.Series([1, 2, 3, 4]) + >>> b = pd.Series([1, 2, 3, 4]) + >>> assert_series_equal(a, b) """ __tracebackhide__ = True @@ -1747,7 +1768,7 @@ def box_expected(expected, box_cls, transpose=True): elif box_cls is pd.DataFrame: expected = pd.Series(expected).to_frame() if transpose: - # for vector operations, we we need a DataFrame to be a single-row, + # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame # vectors of the same length. expected = expected.T diff --git a/pandas/_version.py b/pandas/_version.py index d2df063ff3acf..14c2b5c6e7603 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -5,31 +5,36 @@ # that just contains the computed version number. # This file is released into the public domain. Generated by -# versioneer-0.15 (https://github.com/warner/python-versioneer) +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" import errno import os import re import subprocess import sys -from typing import Callable, Dict def get_keywords(): + """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
git_refnames = "$Format:%d$" git_full = "$Format:%H$" - return {"refnames": git_refnames, "full": git_full} + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords class VersioneerConfig: - pass + """Container for Versioneer configuration parameters.""" def get_config(): + """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() @@ -43,14 +48,17 @@ def get_config(): class NotThisMethod(Exception): - pass + """Exception raised if a method is not valid for the current scenario.""" + +HANDLERS = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" -def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator - def decorate(f: Callable) -> Callable: + def decorate(f): + """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f @@ -59,7 +67,8 @@ def decorate(f: Callable) -> Callable: return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): + """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: @@ -69,6 +78,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): p = subprocess.Popen( [c] + args, cwd=cwd, + env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), ) @@ -78,58 +88,77 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): if e.errno == errno.ENOENT: continue if verbose: - print(f"unable to run {dispcmd}") + print("unable to run %s" % dispcmd) print(e) - return None + return None, None else: if verbose: print(f"unable to find command, tried {commands}") - return None + return None, None stdout = p.communicate()[0].strip().decode() if p.returncode != 0: if verbose: - print(f"unable to run {dispcmd} (error)") - return None - return stdout + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): - # Source tarballs conventionally unpack into a directory that includes - # both the project name and a version string. - dirname = os.path.basename(root) - if not dirname.startswith(parentdir_prefix): - if verbose: - print( - f"guessing rootdir is '{root}', but '{dirname}' " - f"doesn't start with prefix '{parentdir_prefix}'" - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - } + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. 
We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} try: - with open(versionfile_abs) as fd: - for line in fd.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) + f = open(versionfile_abs) + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() except OSError: pass return keywords @@ -137,8 +166,22 @@ def git_get_keywords(versionfile_abs): @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: @@ -159,20 +202,21 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r"\d", r)} if verbose: - print(f"discarding '{','.join(refs - tags)}', no digits") + print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: - print(f"likely tags: {','.join(sorted(tags))}") + print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] if verbose: - print(f"picking {r}") + print("picking %s" % r) return { "version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, + "date": date, } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: @@ -182,34 +226,48 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", + "date": None, } @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - # this runs 'git' from the root of the source tree. This only gets called - # if the git-archive 'subst' keywords were *not* expanded, and - # _version.py hasn't already been rewritten with a short version string, - # meaning we're inside a checked out source tree. - - if not os.path.exists(os.path.join(root, ".git")): - if verbose: - print(f"no .git in {root}") - raise NotThisMethod("no .git directory") + """Get version from 'git describe' in the root of the source tree. + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - # if there is a tag, this yields TAG-NUM-gHEX[-dirty] - # if there are no tags, this yields HEX[-dirty] (no NUM) - describe_out = run_command( - GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() - full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() @@ -236,18 +294,20 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = f"unable to parse git-describe output: '{describe_out}'" + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): - msg = f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" if verbose: - print(msg) - pieces["error"] = msg + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag @@ -259,110 +319,129 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + return pieces def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): - # now build up version string, with post-release "local version - # identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - # get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + """Build up version string, with post-release "local version identifier". - # exceptions: - # 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) - rendered += f"{pieces['distance']:d}.g{pieces['short']}" + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" else: # exception #1 - rendered = f"0+untagged.{pieces['distance']:d}.g{pieces['short']}" - if pieces["dirty"]: - rendered += ".dirty" + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" return rendered def render_pep440_pre(pieces): - # TAG[.post.devDISTANCE] . No -dirty - - # exceptions: - # 1: no tags. 0.post.devDISTANCE + """TAG[.post0.devDISTANCE] -- No -dirty. + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += f".post.dev{pieces['distance']:d}" + rendered += ".post0.dev%d" % pieces["distance"] else: # exception #1 - rendered = f"0.post.dev{pieces['distance']:d}" + rendered = "0.post0.dev%d" % pieces["distance"] return rendered def render_pep440_post(pieces): - # TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that - # .dev0 sorts backwards (a dirty tree will appear "older" than the - # corresponding clean one), but you shouldn't be releasing software with - # -dirty anyways. 
+ """TAG[.postDISTANCE[.dev0]+gHEX] . - # exceptions: - # 1: no tags. 0.postDISTANCE[.dev0] + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += f".post{pieces['distance']:d}" + rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) - rendered += f"g{pieces['short']}" + rendered += "g%s" % pieces["short"] else: # exception #1 - rendered = f"0.pos{pieces['distance']:d}" + rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" - rendered += f"+g{pieces['short']}" + rendered += "+g%s" % pieces["short"] return rendered def render_pep440_old(pieces): - # TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. + """TAG[.postDISTANCE[.dev0]] . - # exceptions: - # 1: no tags. 0.postDISTANCE[.dev0] + The ".dev0" means dirty. + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += f".post{pieces['distance']:d}" + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" else: # exception #1 - rendered = f"0.post{pieces['distance']:d}" - if pieces["dirty"]: - rendered += ".dev0" + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" return rendered def render_git_describe(pieces): - # TAG[-DISTANCE-gHEX][-dirty], like 'git describe --tags --dirty - # --always' + """TAG[-DISTANCE-gHEX][-dirty]. - # exceptions: - # 1: no tags. HEX[-dirty] (note: no 'g' prefix) + Like 'git describe --tags --dirty --always'. + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += f"-{pieces['distance']:d}-g{pieces['short']}" + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] @@ -372,15 +451,17 @@ def render_git_describe(pieces): def render_git_describe_long(pieces): - # TAG-DISTANCE-gHEX[-dirty], like 'git describe --tags --dirty - # --always -long'. The distance/hash is unconditional. + """TAG-DISTANCE-gHEX[-dirty]. - # exceptions: - # 1: no tags. HEX[-dirty] (note: no 'g' prefix) + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] - rendered += f"-{pieces['distance']:d}-g{pieces['short']}" + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] @@ -390,12 +471,14 @@ def render_git_describe_long(pieces): def render(pieces, style): + """Render the given version pieces into the requested style.""" if pieces["error"]: return { "version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], + "date": None, } if not style or style == "default": @@ -414,17 +497,19 @@ def render(pieces, style): elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: - raise ValueError(f"unknown style '{style}'") + raise ValueError("unknown style '%s'" % style) return { "version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date"), } def get_versions(): + """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which @@ -451,6 +536,7 @@ def get_versions(): "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", + "date": None, } try: @@ -470,4 +556,5 @@ def get_versions(): "full-revisionid": None, "dirty": None, "error": "unable to compute version", + "date": None, } diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index d3c7888cac704..533e67acfa2f4 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -25,7 +25,7 @@ "sqlalchemy": "1.2.8", "tables": "3.5.1", "tabulate": "0.8.3", - "xarray": "0.12.0", + "xarray": "0.12.3", "xlrd": "1.2.0", "xlwt": "1.3.0", "xlsxwriter": "1.0.2", diff --git a/pandas/conftest.py b/pandas/conftest.py index b2daa2c5bc3f7..a0ec6f96042fc 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -33,8 +33,10 @@ import pandas.util._test_decorators as td +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype + import pandas as pd -from pandas import DataFrame, Series +from pandas import DataFrame, Interval, Period, Series, Timedelta, Timestamp import pandas._testing as tm from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex @@ -470,8 +472,8 @@ def index_with_missing(request): if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: # For setting missing values in the top level of MultiIndex vals = ind.tolist() - vals[0] = tuple([None]) + vals[0][1:] - vals[-1] = tuple([None]) + vals[-1][1:] + vals[0] = (None,) + vals[0][1:] + vals[-1] = (None,) + vals[-1][1:] return MultiIndex.from_tuples(vals) else: vals[0] = None @@ -687,6 +689,26 @@ def float_frame(): return DataFrame(tm.getSeriesData()) +# ---------------------------------------------------------------- +# Scalars +# ---------------------------------------------------------------- +@pytest.fixture( + params=[ + (Interval(left=0, right=5), IntervalDtype("int64")), + (Interval(left=0.1, right=0.5), IntervalDtype("float64")), + (Period("2012-01", freq="M"), "period[M]"), + (Period("2012-02-01", freq="D"), "period[D]"), + ( + Timestamp("2011-01-01", tz="US/Eastern"), + DatetimeTZDtype(tz="US/Eastern"), + ), + (Timedelta(seconds=500), "timedelta64[ns]"), + ] +) +def ea_scalar_and_dtype(request): + return request.param + + # 
---------------------------------------------------------------- # Operators & Operations # ---------------------------------------------------------------- @@ -1143,6 +1165,26 @@ def any_nullable_int_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_EA_INT_DTYPES + tm.FLOAT_EA_DTYPES) +def any_numeric_dtype(request): + """ + Parameterized fixture for any nullable integer dtype and + any float ea dtypes. + + * 'UInt8' + * 'Int8' + * 'UInt16' + * 'Int16' + * 'UInt32' + * 'Int32' + * 'UInt64' + * 'Int64' + * 'Float32' + * 'Float64' + """ + return request.param + + @pytest.fixture(params=tm.SIGNED_EA_INT_DTYPES) def any_signed_nullable_int_dtype(request): """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ca88163801239..a22058c40c89b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -49,9 +49,9 @@ ) from pandas.core.dtypes.generic import ( ABCExtensionArray, - ABCIndex, ABCIndexClass, ABCMultiIndex, + ABCRangeIndex, ABCSeries, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -60,7 +60,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Categorical, DataFrame, Series + from pandas import Categorical, DataFrame, Index, Series _shared_docs: Dict[str, str] = {} @@ -69,7 +69,7 @@ # dtype access # # --------------- # def _ensure_data( - values, dtype: Optional[DtypeObj] = None + values: ArrayLike, dtype: Optional[DtypeObj] = None ) -> Tuple[np.ndarray, DtypeObj]: """ routine to ensure that our data is of the correct @@ -95,6 +95,12 @@ def _ensure_data( pandas_dtype : np.dtype or ExtensionDtype """ + if dtype is not None: + # We only have non-None dtype when called from `isin`, and + # both Datetimelike and Categorical dispatch before getting here. 
+ assert not needs_i8_conversion(dtype) + assert not is_categorical_dtype(dtype) + if not isinstance(values, ABCMultiIndex): # extract_array would raise values = extract_array(values, extract_numpy=True) @@ -126,21 +132,20 @@ def _ensure_data( return ensure_object(values), np.dtype("object") # datetimelike - vals_dtype = getattr(values, "dtype", None) - if needs_i8_conversion(vals_dtype) or needs_i8_conversion(dtype): - if is_period_dtype(vals_dtype) or is_period_dtype(dtype): + if needs_i8_conversion(values.dtype) or needs_i8_conversion(dtype): + if is_period_dtype(values.dtype) or is_period_dtype(dtype): from pandas import PeriodIndex - values = PeriodIndex(values) + values = PeriodIndex(values)._data dtype = values.dtype - elif is_timedelta64_dtype(vals_dtype) or is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(values.dtype) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex - values = TimedeltaIndex(values) + values = TimedeltaIndex(values)._data dtype = values.dtype else: # Datetime - if values.ndim > 1 and is_datetime64_ns_dtype(vals_dtype): + if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype): # Avoid calling the DatetimeIndex constructor as it is 1D only # Note: this is reached by DataFrame.rank calls GH#27027 # TODO(EA2D): special case not needed with 2D EAs @@ -150,12 +155,12 @@ def _ensure_data( from pandas import DatetimeIndex - values = DatetimeIndex(values) + values = DatetimeIndex(values)._data dtype = values.dtype return values.asi8, dtype - elif is_categorical_dtype(vals_dtype) and ( + elif is_categorical_dtype(values.dtype) and ( is_categorical_dtype(dtype) or dtype is None ): values = values.codes @@ -213,7 +218,8 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ["mixed", "string"]: + if inferred in ["mixed", "string", "mixed-integer"]: + # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160 if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) @@ -231,11 +237,11 @@ def _ensure_arraylike(values): } -def _get_hashtable_algo(values): +def _get_hashtable_algo(values: np.ndarray): """ Parameters ---------- - values : arraylike + values : np.ndarray Returns ------- @@ -249,15 +255,15 @@ def _get_hashtable_algo(values): return htable, values -def _get_values_for_rank(values): +def _get_values_for_rank(values: ArrayLike): if is_categorical_dtype(values): - values = values._values_for_rank() + values = cast("Categorical", values)._values_for_rank() values, _ = _ensure_data(values) return values -def get_data_algo(values): +def get_data_algo(values: ArrayLike): values = _get_values_for_rank(values) ndtype = _check_object_for_strings(values) @@ -415,32 +421,46 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f"to isin(), you passed a [{type(values).__name__}]" ) - if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): + if not isinstance( + values, (ABCIndexClass, ABCSeries, ABCExtensionArray, np.ndarray) + ): values = construct_1d_object_array_from_listlike(list(values)) # TODO: could use ensure_arraylike here + elif isinstance(values, ABCMultiIndex): + # Avoid raising in extract_array + values = np.array(values) + comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) - if is_categorical_dtype(comps): + if is_categorical_dtype(comps.dtype): # TODO(extension) # handle categoricals return cast("Categorical", 
comps).isin(values) + if needs_i8_conversion(comps.dtype): + # Dispatch to DatetimeLikeArrayMixin.isin + return array(comps).isin(values) + elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype): + # e.g. comps are integers and values are datetime64s + return np.zeros(comps.shape, dtype=bool) + comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) - # faster for larger cases to use np.in1d f = htable.ismember_object # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception - if len(comps) > 1_000_000 and not is_object_dtype(comps): - # If the the values include nan we need to check for nan explicitly + # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), + # in1d is faster for small sizes + if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps): + # If the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan if isna(values).any(): f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d - elif is_integer_dtype(comps): + elif is_integer_dtype(comps.dtype): try: values = values.astype("int64", copy=False) comps = comps.astype("int64", copy=False) @@ -449,7 +469,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: values = values.astype(object) comps = comps.astype(object) - elif is_float_dtype(comps): + elif is_float_dtype(comps.dtype): try: values = values.astype("float64", copy=False) comps = comps.astype("float64", copy=False) @@ -462,7 +482,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: def factorize_array( - values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None + values: np.ndarray, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None ) -> Tuple[np.ndarray, np.ndarray]: """ Factorize an array-like to codes and uniques. @@ -528,7 +548,7 @@ def factorize( sort: bool = False, na_sentinel: Optional[int] = -1, size_hint: Optional[int] = None, -) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: +) -> Tuple[np.ndarray, Union[np.ndarray, "Index"]]: """ Encode the object as an enumerated type or categorical variable. @@ -658,7 +678,9 @@ def factorize( na_sentinel = -1 dropna = False - if is_extension_array_dtype(values.dtype): + if isinstance(values, ABCRangeIndex): + return values.factorize(sort=sort) + elif is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype @@ -826,7 +848,7 @@ def value_counts_arraylike(values, dropna: bool): return keys, counts -def duplicated(values, keep="first") -> np.ndarray: +def duplicated(values: ArrayLike, keep: str = "first") -> np.ndarray: """ Return boolean ndarray denoting duplicate values. @@ -1539,7 +1561,7 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) * True: negative values in `indices` indicate missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. + negative values raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. @@ -1777,7 +1799,7 @@ def func(arr, indexer, out, fill_value=np.nan): # ------------ # -def searchsorted(arr, value, side="left", sorter=None): +def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: """ Find indices where elements should be inserted to maintain order. 
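The reworked isin() above makes two decisions that are easy to misread in diff form: datetime-like comparisons are dispatched (or short-circuited to an all-False mask when the dtypes cannot match), and the np.in1d fast path is only taken for very large inputs with few lookup values. A purely illustrative restatement of that size heuristic follows (the helper below is hypothetical and exists only to spell out the condition):

    def prefer_np_in1d(n_comps: int, n_values: int, comps_is_object: bool) -> bool:
        # The sort-based np.in1d wins only for large haystacks with few needles;
        # otherwise the khash-based ismember path is used.
        return n_comps > 1_000_000 and n_values <= 26 and not comps_is_object

    print(prefer_np_in1d(2_000_000, 5, False))    # True  -> np.in1d
    print(prefer_np_in1d(2_000_000, 100, False))  # False -> hash table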
@@ -1826,7 +1848,7 @@ def searchsorted(arr, value, side="left", sorter=None): if ( isinstance(arr, np.ndarray) - and is_integer_dtype(arr) + and is_integer_dtype(arr.dtype) and (is_integer(value) or is_integer_dtype(value)) ): # if `arr` and `value` have different dtypes, `arr` would be @@ -2144,3 +2166,24 @@ def _sort_tuples(values: np.ndarray[tuple]): arrays, _ = to_arrays(values, None) indexer = lexsort_indexer(arrays, orders=True) return values[indexer] + + +def make_duplicates_of_left_unique_in_right( + left: np.ndarray, right: np.ndarray +) -> np.ndarray: + """ + If left has duplicates, which are also duplicated in right, this duplicated values + are dropped from right, meaning that every duplicate value from left exists only + once in right. + + Parameters + ---------- + left: ndarray + right: ndarray + + Returns + ------- + Duplicates of left are unique in right + """ + left_duplicates = unique(left[duplicated(left)]) + return right[~(duplicated(right) & isin(right, left_duplicates))] diff --git a/pandas/core/apply.py b/pandas/core/apply.py index fa4fbe711fbe4..c5260deafc0c3 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -26,7 +26,6 @@ def frame_apply( axis: Axis = 0, raw: bool = False, result_type: Optional[str] = None, - ignore_failures: bool = False, args=None, kwds=None, ): @@ -43,7 +42,6 @@ def frame_apply( func, raw=raw, result_type=result_type, - ignore_failures=ignore_failures, args=args, kwds=kwds, ) @@ -84,13 +82,11 @@ def __init__( func, raw: bool, result_type: Optional[str], - ignore_failures: bool, args, kwds, ): self.obj = obj self.raw = raw - self.ignore_failures = ignore_failures self.args = args or () self.kwds = kwds or {} @@ -283,29 +279,14 @@ def apply_series_generator(self) -> Tuple[ResType, "Index"]: results = {} - if self.ignore_failures: - successes = [] + with option_context("mode.chained_assignment", None): for i, v in enumerate(series_gen): - try: - results[i] = self.f(v) - except Exception: - pass - else: - successes.append(i) - - # so will work with MultiIndex - if len(successes) < len(res_index): - res_index = res_index.take(successes) - - else: - with option_context("mode.chained_assignment", None): - for i, v in enumerate(series_gen): - # ignore SettingWithCopy here in case the user mutates - results[i] = self.f(v) - if isinstance(results[i], ABCSeries): - # If we have a view on v, we need to make a copy because - # series_generator will swap out the underlying data - results[i] = results[i].copy(deep=False) + # ignore SettingWithCopy here in case the user mutates + results[i] = self.f(v) + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) return results, res_index diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index da366c9abf0a4..6b28f8f135769 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -5,8 +5,15 @@ ExtensionArray """ import operator +from typing import Any, Callable +import warnings -from pandas.core.ops import roperator +import numpy as np + +from pandas._libs import lib + +from pandas.core.construction import extract_array +from pandas.core.ops import maybe_dispatch_ufunc_to_dunder_op, roperator from pandas.core.ops.common import unpack_zerodim_and_defer @@ -140,3 +147,138 @@ def __pow__(self, other): @unpack_zerodim_and_defer("__rpow__") def __rpow__(self, other): return self._arith_method(other, roperator.rpow) + + +def array_ufunc(self, 
ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): + """ + Compatibility with numpy ufuncs. + + See also + -------- + numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__ + """ + from pandas.core.generic import NDFrame + from pandas.core.internals import BlockManager + + cls = type(self) + + # for binary ops, use our custom dunder methods + result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + + # Determine if we should defer. + no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + # align all the inputs. + types = tuple(type(x) for x in inputs) + alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. + + if len(set(types)) > 1: + # We currently don't handle ufunc(DataFrame, Series) + # well. Previously this raised an internal ValueError. We might + # support it someday, so raise a NotImplementedError. + raise NotImplementedError( + "Cannot apply ufunc {} to mixed DataFrame and Series " + "inputs.".format(ufunc) + ) + axes = self.axes + for obj in alignable[1:]: + # this relies on the fact that we aren't handling mixed + # series / frame ufuncs. + for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)): + axes[i] = ax1.union(ax2) + + reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes)) + inputs = tuple( + x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x + for x, t in zip(inputs, types) + ) + else: + reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) + + if self.ndim == 1: + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] + name = names[0] if len(set(names)) == 1 else None + reconstruct_kwargs = {"name": name} + else: + reconstruct_kwargs = {} + + def reconstruct(result): + if lib.is_scalar(result): + return result + if result.ndim != self.ndim: + if method == "outer": + if self.ndim == 2: + # we already deprecated for Series + msg = ( + "outer method for ufunc {} is not implemented on " + "pandas objects. Returning an ndarray, but in the " + "future this will raise a 'NotImplementedError'. " + "Consider explicitly converting the DataFrame " + "to an array with '.to_numpy()' first." + ) + warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4) + return result + raise NotImplementedError + return result + if isinstance(result, BlockManager): + # we went through BlockManager.apply + result = self._constructor(result, **reconstruct_kwargs, copy=False) + else: + # we converted an array, lost our axes + result = self._constructor( + result, **reconstruct_axes, **reconstruct_kwargs, copy=False + ) + # TODO: When we support multiple values in __finalize__, this + # should pass alignable to `__fianlize__` instead of self. + # Then `np.add(a, b)` would consider attrs from both a and b + # when a and b are NDFrames. 
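# Illustrative sketch of the alignment step implemented above (values are
# made up): a binary ufunc applied to two Series reindexes both operands to
# the union of their indexes before the ufunc runs, so labels present in only
# one operand come back as NaN.
import numpy as np
import pandas as pd

s1 = pd.Series([1, 2], index=["a", "b"], name="x")
s2 = pd.Series([10, 20], index=["b", "c"], name="x")
aligned_sum = np.add(s1, s2)
# aligned_sum is indexed by ["a", "b", "c"]; only the shared label "b" is
# non-NaN (12.0), and the name "x" survives because both inputs agree on it.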
+ if len(alignable) == 1: + result = result.__finalize__(self) + return result + + if self.ndim > 1 and ( + len(inputs) > 1 or ufunc.nout > 1 # type: ignore[attr-defined] + ): + # Just give up on preserving types in the complex case. + # In theory we could preserve them for them. + # * nout>1 is doable if BlockManager.apply took nout and + # returned a Tuple[BlockManager]. + # * len(inputs) > 1 is doable when we know that we have + # aligned blocks / dtypes. + inputs = tuple(np.asarray(x) for x in inputs) + result = getattr(ufunc, method)(*inputs) + elif self.ndim == 1: + # ufunc(series, ...) + inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + else: + # ufunc(dataframe) + mgr = inputs[0]._mgr + result = mgr.apply(getattr(ufunc, method)) + + if ufunc.nout > 1: # type: ignore[attr-defined] + result = tuple(reconstruct(x) for x in result) + else: + result = reconstruct(result) + return result diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 7eaadecbd6491..5cc6525dc3c9b 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -1,4 +1,6 @@ -from typing import Any, Optional, Sequence, Type, TypeVar +from __future__ import annotations + +from typing import Any, Optional, Sequence, Type, TypeVar, Union import numpy as np @@ -212,7 +214,9 @@ def __setitem__(self, key, value): def _validate_setitem_value(self, value): return value - def __getitem__(self, key): + def __getitem__( + self: NDArrayBackedExtensionArrayT, key: Union[int, slice, np.ndarray] + ) -> Union[NDArrayBackedExtensionArrayT, Any]: if lib.is_integer(key): # fast-path result = self._ndarray[key] @@ -296,3 +300,43 @@ def __repr__(self) -> str: data = ",\n".join(lines) class_name = f"<{type(self).__name__}>" return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" + + # ------------------------------------------------------------------------ + # __array_function__ methods + + def putmask(self, mask, value): + """ + Analogue to np.putmask(self, mask, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + + Raises + ------ + TypeError + If value cannot be cast to self.dtype. + """ + value = self._validate_setitem_value(value) + + np.putmask(self._ndarray, mask, value) + + def where(self, mask, value): + """ + Analogue to np.where(mask, self, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + + Raises + ------ + TypeError + If value cannot be cast to self.dtype. + """ + value = self._validate_setitem_value(value) + + res_values = np.where(mask, self._ndarray, value) + return self._from_backing_data(res_values) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0968545a6b8a4..448025e05422d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -6,6 +6,8 @@ This is an experimental API and subject to breaking changes without warning. """ +from __future__ import annotations + import operator from typing import ( Any, @@ -254,8 +256,9 @@ def _from_factorized(cls, values, original): # Must be a Sequence # ------------------------------------------------------------------------ - def __getitem__(self, item): - # type (Any) -> Any + def __getitem__( + self, item: Union[int, slice, np.ndarray] + ) -> Union[ExtensionArray, Any]: """ Select a subset of self. @@ -468,6 +471,7 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. 
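# Minimal sketch of the NumPy semantics the putmask/where analogues above
# mirror (plain ndarrays used for illustration): putmask mutates in place,
# where keeps the original values wherever the mask is True and substitutes
# the fill value elsewhere.
import numpy as np

values = np.array([1, 2, 3, 4])
mask = np.array([True, False, True, False])

np.putmask(values, mask, 0)            # values is now [0, 2, 0, 4]
result = np.where(mask, values, -1)    # [0, -1, 0, -1]; values is unchanged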
""" from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, self.dtype): @@ -475,7 +479,11 @@ def astype(self, dtype, copy=True): return self else: return self.copy() - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + + # FIXME: Really hard-code here? + if isinstance( + dtype, (ArrowStringDtype, StringDtype) + ): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -661,7 +669,7 @@ def dropna(self): """ return self[~self.isna()] - def shift(self, periods: int = 1, fill_value: object = None) -> "ExtensionArray": + def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ Shift values by desired number. @@ -831,7 +839,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. @@ -940,7 +948,7 @@ def take( *, allow_fill: bool = False, fill_value: Any = None, - ) -> "ExtensionArray": + ) -> ExtensionArray: """ Take elements from an array. @@ -1109,7 +1117,7 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: # Reshaping # ------------------------------------------------------------------------ - def transpose(self, *axes) -> "ExtensionArray": + def transpose(self, *axes) -> ExtensionArray: """ Return a transposed view on this array. @@ -1119,10 +1127,10 @@ def transpose(self, *axes) -> "ExtensionArray": return self[:] @property - def T(self) -> "ExtensionArray": + def T(self) -> ExtensionArray: return self.transpose() - def ravel(self, order="C") -> "ExtensionArray": + def ravel(self, order="C") -> ExtensionArray: """ Return a flattened view on this array. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 67818e6cf8fae..fe66aae23f510 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -10,6 +10,7 @@ from pandas._config import get_option from pandas._libs import NaT, algos as libalgos, hashtable as htable +from pandas._libs.lib import no_default from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly, deprecate_kwarg @@ -76,7 +77,7 @@ def func(self, other): "Unordered Categoricals can only compare equality or not" ) if isinstance(other, Categorical): - # Two Categoricals can only be be compared if the categories are + # Two Categoricals can only be compared if the categories are # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' are the same." @@ -402,20 +403,42 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: If copy is set to False and dtype is categorical, the original object is returned. 
""" - if is_categorical_dtype(dtype): + if self.dtype is dtype: + result = self.copy() if copy else self + + elif is_categorical_dtype(dtype): dtype = cast(Union[str, CategoricalDtype], dtype) - # GH 10696/18593 + # GH 10696/18593/18630 dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self - if dtype == self.dtype: - return self - return self._set_dtype(dtype) - if is_extension_array_dtype(dtype): - return array(self, dtype=dtype, copy=copy) - if is_integer_dtype(dtype) and self.isna().any(): + result = self._set_dtype(dtype) + + # TODO: consolidate with ndarray case? + elif is_extension_array_dtype(dtype): + result = array(self, dtype=dtype, copy=copy) + + elif is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") - return np.array(self, dtype=dtype, copy=copy) + + elif len(self.codes) == 0 or len(self.categories) == 0: + result = np.array(self, dtype=dtype, copy=copy) + + else: + # GH8628 (PERF): astype category codes instead of astyping array + try: + astyped_cats = self.categories.astype(dtype=dtype, copy=copy) + except ( + TypeError, # downstream error msg for CategoricalIndex is misleading + ValueError, + ): + msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" + raise ValueError(msg) + + astyped_cats = extract_array(astyped_cats, extract_numpy=True) + result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) + + return result @cache_readonly def itemsize(self) -> int: @@ -1046,7 +1069,7 @@ def remove_categories(self, removals, inplace=False): new_categories, ordered=self.ordered, rename=False, inplace=inplace ) - def remove_unused_categories(self, inplace=False): + def remove_unused_categories(self, inplace=no_default): """ Remove categories which are not used. @@ -1056,6 +1079,8 @@ def remove_unused_categories(self, inplace=False): Whether or not to drop unused categories inplace or return a copy of this categorical with unused categories dropped. + .. deprecated:: 1.2.0 + Returns ------- cat : Categorical or None @@ -1069,6 +1094,17 @@ def remove_unused_categories(self, inplace=False): remove_categories : Remove the specified categories. set_categories : Set the categories to the specified ones. """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." 
+ "remove_unused_categories is deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() idx, inv = np.unique(cat._codes, return_inverse=True) @@ -1920,6 +1956,7 @@ def min(self, *, skipna=True, **kwargs): ------- min : the minimum of this `Categorical` """ + nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_min((), kwargs) self.check_for_ordered("min") @@ -1956,6 +1993,7 @@ def max(self, *, skipna=True, **kwargs): ------- max : the maximum of this `Categorical` """ + nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_max((), kwargs) self.check_for_ordered("max") diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0ce32fcd822e0..8fa2c734092f4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import datetime, timedelta import operator from typing import ( @@ -60,7 +62,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core import nanops, ops -from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts +from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray import pandas.core.common as com @@ -99,6 +101,8 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): _generate_range """ + # _infer_matches -> which infer_dtype strings are close enough to our own + _infer_matches: Tuple[str, ...] _is_recognized_dtype: Callable[[DtypeObj], bool] _recognized_scalars: Tuple[Type, ...] _data: np.ndarray @@ -264,7 +268,9 @@ def __array__(self, dtype=None) -> np.ndarray: return np.array(list(self), dtype=object) return self._ndarray - def __getitem__(self, key): + def __getitem__( + self, key: Union[int, slice, np.ndarray] + ) -> Union[DatetimeLikeArrayMixin, DTScalarOrNaT]: """ This getitem defers to the underlying array, which by-definition can only handle list-likes, slices, and integer scalars @@ -693,6 +699,59 @@ def map(self, mapper): return Index(self).map(mapper).array + def isin(self, values) -> np.ndarray: + """ + Compute boolean array of whether each value is found in the + passed set of values. 
+ + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + ndarray[bool] + """ + if not hasattr(values, "dtype"): + values = np.asarray(values) + + if values.dtype.kind in ["f", "i", "u", "c"]: + # TODO: de-duplicate with equals, validate_comparison_value + return np.zeros(self.shape, dtype=bool) + + if not isinstance(values, type(self)): + inferrable = [ + "timedelta", + "timedelta64", + "datetime", + "datetime64", + "date", + "period", + ] + if values.dtype == object: + inferred = lib.infer_dtype(values, skipna=False) + if inferred not in inferrable: + if inferred == "string": + pass + + elif "mixed" in inferred: + return isin(self.astype(object), values) + else: + return np.zeros(self.shape, dtype=bool) + + try: + values = type(self)._from_sequence(values) + except ValueError: + return isin(self.astype(object), values) + + try: + self._check_compatible_with(values) + except (TypeError, ValueError): + # Includes tzawareness mismatch and IncompatibleFrequencyError + return np.zeros(self.shape, dtype=bool) + + return isin(self.asi8, values.asi8) + # ------------------------------------------------------------------ # Null Handling @@ -1554,6 +1613,9 @@ def ceil(self, freq, ambiguous="raise", nonexistent="raise"): # -------------------------------------------------------------- # Frequency Methods + def _maybe_clear_freq(self): + self._freq = None + def _with_freq(self, freq): """ Helper to get a view on the same data, with a new freq. diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index a05dc717f83c1..ce70f929cc79d 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1,5 +1,5 @@ from datetime import datetime, time, timedelta, tzinfo -from typing import Optional, Union +from typing import Optional, Union, cast import warnings import numpy as np @@ -154,6 +154,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): _scalar_type = Timestamp _recognized_scalars = (datetime, np.datetime64) _is_recognized_dtype = is_datetime64_any_dtype + _infer_matches = ("datetime", "datetime64", "date") # define my properties & methods for delegation _bool_ops = [ @@ -444,9 +445,11 @@ def _generate_range( ) if not left_closed and len(index) and index[0] == start: - index = index[1:] + # TODO: overload DatetimeLikeArrayMixin.__getitem__ + index = cast(DatetimeArray, index[1:]) if not right_closed and len(index) and index[-1] == end: - index = index[:-1] + # TODO: overload DatetimeLikeArrayMixin.__getitem__ + index = cast(DatetimeArray, index[:-1]) dtype = tz_to_dtype(tz) return cls._simple_new(index.asi8, freq=freq, dtype=dtype) @@ -474,9 +477,6 @@ def _check_compatible_with(self, other, setitem: bool = False): if not timezones.tz_compare(self.tz, other.tz): raise ValueError(f"Timezones don't match. 
'{self.tz}' != '{other.tz}'") - def _maybe_clear_freq(self): - self._freq = None - # ----------------------------------------------------------------- # Descriptive Properties diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index a5ebdd8d963e2..4aed39d7edb92 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -120,7 +120,7 @@ def coerce_to_array( ------- tuple of (values, mask) """ - # if values is floating numpy array, preserve it's dtype + # if values is floating numpy array, preserve its dtype if dtype is None and hasattr(values, "dtype"): if is_float_dtype(values.dtype): dtype = values.dtype diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index c9d7632e39228..2897c18acfb09 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -183,7 +183,7 @@ def coerce_to_array( ------- tuple of (values, mask) """ - # if values is integer numpy array, preserve it's dtype + # if values is integer numpy array, preserve its dtype if dtype is None and hasattr(values, "dtype"): if is_integer_dtype(values.dtype): dtype = values.dtype diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index d007bb112c86c..efb66c9a47a97 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,7 +1,7 @@ import operator from operator import le, lt import textwrap -from typing import TYPE_CHECKING, Optional, Sequence, Tuple, Type, TypeVar, Union, cast +from typing import Sequence, Type, TypeVar import numpy as np @@ -14,7 +14,6 @@ intervals_to_interval_bounds, ) from pandas._libs.missing import NA -from pandas._typing import ArrayLike, Dtype from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -22,9 +21,7 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_any_dtype, - is_dtype_equal, is_float_dtype, - is_integer, is_integer_dtype, is_interval_dtype, is_list_like, @@ -52,10 +49,6 @@ from pandas.core.indexes.base import ensure_index from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer -if TYPE_CHECKING: - from pandas import Index - from pandas.core.arrays import DatetimeArray, TimedeltaArray - IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") _interval_shared_docs = {} @@ -182,17 +175,6 @@ def __new__( left = data._left right = data._right closed = closed or data.closed - - if dtype is None or data.dtype == dtype: - # This path will preserve id(result._combined) - # TODO: could also validate dtype before going to simple_new - combined = data._combined - if copy: - combined = combined.copy() - result = cls._simple_new(combined, closed=closed) - if verify_integrity: - result._validate() - return result else: # don't allow scalars @@ -210,22 +192,83 @@ def __new__( ) closed = closed or infer_closed - closed = closed or "right" - left, right = _maybe_cast_inputs(left, right, copy, dtype) - combined = _get_combined_data(left, right) - result = cls._simple_new(combined, closed=closed) - if verify_integrity: - result._validate() - return result + return cls._simple_new( + left, + right, + closed, + copy=copy, + dtype=dtype, + verify_integrity=verify_integrity, + ) @classmethod - def _simple_new(cls, data, closed="right"): + def _simple_new( + cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True + ): result = IntervalMixin.__new__(cls) - result._combined = data - result._left = data[:, 0] - result._right = data[:, 1] + closed = closed or 
"right" + left = ensure_index(left, copy=copy) + right = ensure_index(right, copy=copy) + + if dtype is not None: + # GH 19262: dtype must be an IntervalDtype to override inferred + dtype = pandas_dtype(dtype) + if not is_interval_dtype(dtype): + msg = f"dtype must be an IntervalDtype, got {dtype}" + raise TypeError(msg) + elif dtype.subtype is not None: + left = left.astype(dtype.subtype) + right = right.astype(dtype.subtype) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + elif is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + msg = ( + f"must not have differing left [{type(left).__name__}] and " + f"right [{type(right).__name__}] types" + ) + raise ValueError(msg) + elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): + # GH 19016 + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalArray" + ) + raise TypeError(msg) + elif isinstance(left, ABCPeriodIndex): + msg = "Period dtypes are not supported, use a PeriodIndex instead" + raise ValueError(msg) + elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): + msg = ( + "left and right must have the same time zone, got " + f"'{left.tz}' and '{right.tz}'" + ) + raise ValueError(msg) + + # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + + left = maybe_upcast_datetimelike_array(left) + left = extract_array(left, extract_numpy=True) + right = maybe_upcast_datetimelike_array(right) + right = extract_array(right, extract_numpy=True) + + lbase = getattr(left, "_ndarray", left).base + rbase = getattr(right, "_ndarray", right).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() + + result._left = left + result._right = right result._closed = closed + if verify_integrity: + result._validate() return result @classmethod @@ -360,16 +403,10 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) right = maybe_convert_platform_interval(right) - if len(left) != len(right): - raise ValueError("left and right must have the same length") - - closed = closed or "right" - left, right = _maybe_cast_inputs(left, right, copy, dtype) - combined = _get_combined_data(left, right) - result = cls._simple_new(combined, closed) - result._validate() - return result + return cls._simple_new( + left, right, closed, copy=copy, dtype=dtype, verify_integrity=True + ) _interval_shared_docs["from_tuples"] = textwrap.dedent( """ @@ -475,6 +512,19 @@ def _validate(self): msg = "left side of interval must be <= right side" raise ValueError(msg) + def _shallow_copy(self, left, right): + """ + Return a new IntervalArray with the replacement attributes + + Parameters + ---------- + left : Index + Values to be used for the left-side of the intervals. + right : Index + Values to be used for the right-side of the intervals. 
+ """ + return self._simple_new(left, right, closed=self.closed, verify_integrity=False) + # --------------------------------------------------------------------- # Descriptive @@ -502,20 +552,18 @@ def __len__(self) -> int: def __getitem__(self, key): key = check_array_indexer(self, key) + left = self._left[key] + right = self._right[key] - result = self._combined[key] - - if is_integer(key): - left, right = result[0], result[1] - if isna(left): + if not isinstance(left, (np.ndarray, ExtensionArray)): + # scalar + if is_scalar(left) and isna(left): return self._fill_value return Interval(left, right, self.closed) - - # TODO: need to watch out for incorrectly-reducing getitem - if np.ndim(result) > 2: + if np.ndim(left) > 1: # GH#30588 multi-dimensional indexer disallowed raise ValueError("multi-dimensional indexing not allowed") - return type(self)._simple_new(result, closed=self.closed) + return self._shallow_copy(left, right) def __setitem__(self, key, value): value_left, value_right = self._validate_setitem_value(value) @@ -619,6 +667,24 @@ def __lt__(self, other): def __le__(self, other): return self._cmp_method(other, operator.le) + def argsort( + self, + ascending: bool = True, + kind: str = "quicksort", + na_position: str = "last", + *args, + **kwargs, + ) -> np.ndarray: + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + + if ascending and kind == "quicksort" and na_position == "last": + return np.lexsort((self.right, self.left)) + + # TODO: other cases we can use lexsort for? much more performant. + return super().argsort( + ascending=ascending, kind=kind, na_position=na_position, **kwargs + ) + def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. @@ -655,8 +721,7 @@ def fillna(self, value=None, method=None, limit=None): left = self.left.fillna(value=value_left) right = self.right.fillna(value=value_right) - combined = _get_combined_data(left, right) - return type(self)._simple_new(combined, closed=self.closed) + return self._shallow_copy(left, right) def astype(self, dtype, copy=True): """ @@ -698,11 +763,9 @@ def astype(self, dtype, copy=True): f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" ) raise TypeError(msg) from err - # TODO: do astype directly on self._combined - combined = _get_combined_data(new_left, new_right) - return type(self)._simple_new(combined, closed=self.closed) + return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): - return Categorical(np.asarray(self)) + return Categorical(np.asarray(self), dtype=dtype) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -743,11 +806,9 @@ def _concat_same_type( raise ValueError("Intervals must all be closed on the same side.") closed = closed.pop() - # TODO: will this mess up on dt64tz? 
left = np.concatenate([interval.left for interval in to_concat]) right = np.concatenate([interval.right for interval in to_concat]) - combined = _get_combined_data(left, right) # TODO: 1-stage concat - return cls._simple_new(combined, closed=closed) + return cls._simple_new(left, right, closed=closed, copy=False) def copy(self: IntervalArrayT) -> IntervalArrayT: """ @@ -757,8 +818,11 @@ def copy(self: IntervalArrayT) -> IntervalArrayT: ------- IntervalArray """ - combined = self._combined.copy() - return type(self)._simple_new(combined, closed=self.closed) + left = self._left.copy() + right = self._right.copy() + closed = self.closed + # TODO: Could skip verify_integrity here. + return type(self).from_arrays(left, right, closed=closed) def isna(self) -> np.ndarray: return isna(self._left) @@ -851,8 +915,7 @@ def take(self, indices, *, allow_fill=False, fill_value=None, axis=None, **kwarg self._right, indices, allow_fill=allow_fill, fill_value=fill_right ) - combined = _get_combined_data(left_take, right_take) - return type(self)._simple_new(combined, closed=self.closed) + return self._shallow_copy(left_take, right_take) def _validate_listlike(self, value): # list-like of intervals @@ -1165,7 +1228,10 @@ def set_closed(self, closed): if closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {closed}" raise ValueError(msg) - return type(self)._simple_new(self._combined, closed=closed) + + return type(self)._simple_new( + left=self._left, right=self._right, closed=closed, verify_integrity=False + ) _interval_shared_docs[ "is_non_overlapping_monotonic" @@ -1306,8 +1372,9 @@ def to_tuples(self, na_tuple=True): @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) - combined = self._combined.repeat(repeats, 0) - return type(self)._simple_new(combined, closed=self.closed) + left_repeat = self.left.repeat(repeats) + right_repeat = self.right.repeat(repeats) + return self._shallow_copy(left=left_repeat, right=right_repeat) _interval_shared_docs["contains"] = textwrap.dedent( """ @@ -1390,101 +1457,3 @@ def maybe_convert_platform_interval(values): values = np.asarray(values) return maybe_convert_platform(values) - - -def _maybe_cast_inputs( - left_orig: Union["Index", ArrayLike], - right_orig: Union["Index", ArrayLike], - copy: bool, - dtype: Optional[Dtype], -) -> Tuple["Index", "Index"]: - left = ensure_index(left_orig, copy=copy) - right = ensure_index(right_orig, copy=copy) - - if dtype is not None: - # GH#19262: dtype must be an IntervalDtype to override inferred - dtype = pandas_dtype(dtype) - if not is_interval_dtype(dtype): - msg = f"dtype must be an IntervalDtype, got {dtype}" - raise TypeError(msg) - dtype = cast(IntervalDtype, dtype) - if dtype.subtype is not None: - left = left.astype(dtype.subtype) - right = right.astype(dtype.subtype) - - # coerce dtypes to match if needed - if is_float_dtype(left) and is_integer_dtype(right): - right = right.astype(left.dtype) - elif is_float_dtype(right) and is_integer_dtype(left): - left = left.astype(right.dtype) - - if type(left) != type(right): - msg = ( - f"must not have differing left [{type(left).__name__}] and " - f"right [{type(right).__name__}] types" - ) - raise ValueError(msg) - elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): - # GH#19016 - msg = ( - "category, object, and string subtypes are not supported " - "for IntervalArray" - ) - raise TypeError(msg) - elif isinstance(left, ABCPeriodIndex): - msg = 
"Period dtypes are not supported, use a PeriodIndex instead" - raise ValueError(msg) - elif isinstance(left, ABCDatetimeIndex) and not is_dtype_equal( - left.dtype, right.dtype - ): - left_arr = cast("DatetimeArray", left._data) - right_arr = cast("DatetimeArray", right._data) - msg = ( - "left and right must have the same time zone, got " - f"'{left_arr.tz}' and '{right_arr.tz}'" - ) - raise ValueError(msg) - - return left, right - - -def _get_combined_data( - left: Union["Index", ArrayLike], right: Union["Index", ArrayLike] -) -> Union[np.ndarray, "DatetimeArray", "TimedeltaArray"]: - # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray - from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array - - left = maybe_upcast_datetimelike_array(left) - left = extract_array(left, extract_numpy=True) - right = maybe_upcast_datetimelike_array(right) - right = extract_array(right, extract_numpy=True) - - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() - - if isinstance(left, np.ndarray): - assert isinstance(right, np.ndarray) # for mypy - combined = np.concatenate( - [left.reshape(-1, 1), right.reshape(-1, 1)], - axis=1, - ) - else: - # error: Item "type" of "Union[Type[Index], Type[ExtensionArray]]" has - # no attribute "_concat_same_type" [union-attr] - - # error: Unexpected keyword argument "axis" for "_concat_same_type" of - # "ExtensionArray" [call-arg] - - # error: Item "Index" of "Union[Index, ExtensionArray]" has no - # attribute "reshape" [union-attr] - - # error: Item "ExtensionArray" of "Union[Index, ExtensionArray]" has no - # attribute "reshape" [union-attr] - combined = type(left)._concat_same_type( # type: ignore[union-attr,call-arg] - [left.reshape(-1, 1), right.reshape(-1, 1)], # type: ignore[union-attr] - axis=1, - ) - return combined diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index a4b88427ceb05..caed932cd7857 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1,4 +1,6 @@ -from typing import TYPE_CHECKING, Optional, Sequence, Tuple, Type, TypeVar +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union import numpy as np @@ -56,7 +58,7 @@ def itemsize(self) -> int: return self.numpy_dtype.itemsize @classmethod - def construct_array_type(cls) -> Type["BaseMaskedArray"]: + def construct_array_type(cls) -> Type[BaseMaskedArray]: """ Return the array type associated with this dtype. @@ -100,7 +102,9 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): def dtype(self) -> BaseMaskedDtype: raise AbstractMethodError(self) - def __getitem__(self, item): + def __getitem__( + self, item: Union[int, slice, np.ndarray] + ) -> Union[BaseMaskedArray, Any]: if is_integer(item): if self._mask[item]: return self.dtype.na_value diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 0cdce1eabccc6..4eb67dcd12728 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -144,7 +144,7 @@ class PandasArray( # If you're wondering why pd.Series(cls) doesn't put the array in an # ExtensionBlock, search for `ABCPandasArray`. 
We check for - # that _typ to ensure that that users don't unnecessarily use EAs inside + # that _typ to ensure that users don't unnecessarily use EAs inside # pandas internals, which turns off things like block consolidation. _typ = "npy_extension" __array_priority__ = 1000 diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 80882acceb56a..50ed526cf01e9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -124,6 +124,7 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): _scalar_type = Period _recognized_scalars = (Period,) _is_recognized_dtype = is_period_dtype + _infer_matches = ("period",) # Names others delegate to us _other_ops: List[str] = [] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3b297e7c2b13b..e75305e55348c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -282,10 +282,6 @@ def __setitem__(self, key, value): super().__setitem__(key, value) - def fillna(self, value=None, method=None, limit=None): - # TODO: validate dtype - return super().fillna(value, method, limit) - def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if isinstance(dtype, StringDtype): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..184fbc050036b --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,625 @@ +from __future__ import annotations + +from distutils.version import LooseVersion +from typing import TYPE_CHECKING, Any, Sequence, Type, Union + +import numpy as np + +from pandas._libs import lib, missing as libmissing +from pandas.util._validators import validate_fillna_kwargs + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays.base import ExtensionArray +from pandas.core.indexers import check_array_indexer, validate_indices +from pandas.core.missing import get_fill_func + +try: + import pyarrow as pa +except ImportError: + pa = None +else: + # our min supported version of pyarrow, 0.15.1, does not have a compute + # module + try: + import pyarrow.compute as pc + except ImportError: + pass + else: + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } + + +if TYPE_CHECKING: + from pandas import Series + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + """ + Extension dtype for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.2.0 + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> from pandas.core.arrays.string_arrow import ArrowStringDtype + >>> ArrowStringDtype() + ArrowStringDtype + """ + + name = "arrow_string" + + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA + + @property + def type(self) -> Type[str]: + return str + + @classmethod + def construct_array_type(cls) -> Type["ArrowStringArray"]: + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ + return ArrowStringArray + + def __hash__(self) -> int: + return hash("ArrowStringDtype") + + def __repr__(self) -> str: + return "ArrowStringDtype" + + def __from_arrow__( + self, array: Union["pa.Array", "pa.ChunkedArray"] + ) -> "ArrowStringArray": + """ + Construct StringArray from pyarrow Array/ChunkedArray. + """ + return ArrowStringArray(array) + + def __eq__(self, other) -> bool: + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + * it's a string matching 'self.name'. + * it's an instance of this type. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, ArrowStringDtype): + return True + elif isinstance(other, str) and other == "arrow_string": + return True + else: + return False + + +class ArrowStringArray(OpsMixin, ExtensionArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.2.0 + + .. warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. + + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: arrow_string + """ + + _dtype = ArrowStringDtype() + + def __init__(self, values): + self._chk_pyarrow_available() + if isinstance(values, pa.Array): + self._data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._data = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + + if not pa.types.is_string(self._data.type): + raise ValueError( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) + + @classmethod + def _chk_pyarrow_available(cls) -> None: + # TODO: maybe update import_optional_dependency to allow a minimum + # version to be specified rather than use the global minimum + if pa is None or LooseVersion(pa.__version__) < "1.0.0": + msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." + raise ImportError(msg) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + cls._chk_pyarrow_available() + # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value + scalars = lib.ensure_string_array(scalars, copy=False) + return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @property + def dtype(self) -> ArrowStringDtype: + """ + An instance of 'ArrowStringDtype'. + """ + return self._dtype + + def __array__(self, dtype=None) -> np.ndarray: + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.to_numpy(dtype=dtype) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self._data + + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: + """ + Convert to a NumPy ndarray. 
+ """ + # TODO: copy argument is ignored + + if na_value is lib.no_default: + na_value = self._dtype.na_value + result = self._data.__array__(dtype=dtype) + result[isna(result)] = na_value + return result + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self._data) + + @classmethod + def _from_factorized(cls, values, original): + return cls._from_sequence(values) + + @classmethod + def _concat_same_type(cls, to_concat) -> ArrowStringArray: + """ + Concatenate multiple ArrowStringArray. + + Parameters + ---------- + to_concat : sequence of ArrowStringArray + + Returns + ------- + ArrowStringArray + """ + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea._data.iterchunks()] + ) + ) + + def __getitem__(self, item: Any) -> Any: + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item.dtype): + return self.take(item) + elif is_bool_dtype(item.dtype): + return type(self)(self._data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. + value = self._data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return self._as_pandas_scalar(value) + + def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): + scalar = arrow_scalar.as_py() + if scalar is None: + return self._dtype.na_value + else: + return scalar + + def fillna(self, value=None, method=None, limit=None): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, array-like + If a scalar value is passed it is used to fill all missing values. + Alternatively, an array-like 'value' can be given. It's expected + that the array-like have the same length as 'self'. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap. + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + Returns + ------- + ExtensionArray + With NA/NaN filled. 
+ """ + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f"expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + func = get_fill_func(method) + new_values = func(self.to_numpy(object), limit=limit, mask=mask) + new_values = self._from_sequence(new_values) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + + def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self._data.nbytes + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + # TODO: Implement .to_numpy for ChunkedArray + return self._data.is_null().to_pandas().values + + def copy(self) -> ArrowStringArray: + """ + Return a shallow copy of the array. + + Returns + ------- + ArrowStringArray + """ + return type(self)(self._data) + + def _cmp_method(self, other, op): + from pandas.arrays import BooleanArray + + pc_func = ARROW_CMP_FUNCS[op.__name__] + if isinstance(other, ArrowStringArray): + result = pc_func(self._data, other._data) + elif isinstance(other, np.ndarray): + result = pc_func(self._data, other) + elif is_scalar(other): + try: + result = pc_func(self._data, pa.scalar(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + mask = isna(self) | isna(other) + valid = ~mask + result = np.zeros(len(self), dtype="bool") + result[valid] = op(np.array(self)[valid], other) + return BooleanArray(result, mask) + else: + return NotImplemented + + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray + return BooleanArray._from_sequence(result.to_pandas().values) + + def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self._data[0:key].chunks, + pa.array([value], type=pa.string()), + *self._data[(key + 1) :].chunks, + ] + self._data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. + # This is probably extremely slow. 
+ + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + # TODO(ARROW-9431): Directly support setitem(integers) + key_array = np.asanyarray(key) + + if is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if len(self._data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.size > 0 and indices_array.max() >= len(self._data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + fill_mask = indices_array < 0 + if fill_mask.any(): + validate_indices(indices_array, len(self._data)) + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=fill_mask) + result = self._data.take(indices_array) + if isna(fill_value): + return type(self)(result) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[fill_mask] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self._data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. 
+ if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self._data) + return type(self)(self._data.take(indices_array)) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import Index, Series + + vc = self._data.value_counts() + + # Index cannot hold ExtensionArrays yet + index = Index(type(self)(vc.field(0)).astype(object)) + # No missings, so we can adhere to the interface and return a numpy array. + counts = np.array(vc.field(1)) + + if dropna and self._data.null_count > 0: + raise NotImplementedError("yo") + + return Series(counts, index=index).astype("Int64") diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index d9ecbc874cd59..998117cc49d50 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -104,6 +104,7 @@ class TimedeltaArray(dtl.TimelikeOps): _scalar_type = Timedelta _recognized_scalars = (timedelta, np.timedelta64, Tick) _is_recognized_dtype = is_timedelta64_dtype + _infer_matches = ("timedelta", "timedelta64") __array_priority__ = 1000 # define my properties & methods for delegation @@ -313,9 +314,6 @@ def _check_compatible_with(self, other, setitem: bool = False): # we don't have anything to validate. pass - def _maybe_clear_freq(self): - self._freq = None - # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods diff --git a/pandas/core/base.py b/pandas/core/base.py index b3366cca37617..5f724d9e89d05 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -269,12 +269,14 @@ def __getitem__(self, key): return self._gotitem(list(key), ndim=2) elif not getattr(self, "as_index", False): - if key not in self.obj.columns: + # error: "SelectionMixin" has no attribute "obj" [attr-defined] + if key not in self.obj.columns: # type: ignore[attr-defined] raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=2) else: - if key not in self.obj: + # error: "SelectionMixin" has no attribute "obj" [attr-defined] + if key not in self.obj: # type: ignore[attr-defined] raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=1) @@ -919,10 +921,9 @@ def _map_values(self, mapper, na_action=None): # "astype" [attr-defined] values = self.astype(object)._values # type: ignore[attr-defined] if na_action == "ignore": - - def map_f(values, f): - return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) - + map_f = lambda values, f: lib.map_infer_mask( + values, f, isna(values).view(np.uint8) + ) elif na_action is None: map_f = lib.map_infer else: diff --git a/pandas/core/common.py b/pandas/core/common.py index d5c078b817ca0..cdcbc43055052 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -24,12 +24,7 @@ is_extension_array_dtype, is_integer, ) -from pandas.core.dtypes.generic import ( - ABCExtensionArray, - ABCIndex, - ABCIndexClass, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -42,13 +37,13 @@ class SettingWithCopyWarning(Warning): pass -def flatten(l): +def flatten(line): """ Flatten 
an arbitrarily nested sequence. Parameters ---------- - l : sequence + line : sequence The non string sequence to flatten Notes @@ -59,11 +54,11 @@ def flatten(l): ------- flattened : generator """ - for el in l: - if iterable_not_string(el): - yield from flatten(el) + for element in line: + if iterable_not_string(element): + yield from flatten(element) else: - yield el + yield element def consensus_name_attr(objs): @@ -105,7 +100,7 @@ def is_bool_indexer(key: Any) -> bool: check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. """ - if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( + if isinstance(key, (ABCSeries, np.ndarray, ABCIndexClass)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): if key.dtype == np.object_: @@ -282,20 +277,23 @@ def is_null_slice(obj) -> bool: ) -def is_true_slices(l): +def is_true_slices(line): """ - Find non-trivial slices in "l": return a list of booleans with same length. + Find non-trivial slices in "line": return a list of booleans with same length. """ - return [isinstance(k, slice) and not is_null_slice(k) for k in l] + return [isinstance(k, slice) and not is_null_slice(k) for k in line] # TODO: used only once in indexing; belongs elsewhere? -def is_full_slice(obj, l) -> bool: +def is_full_slice(obj, line) -> bool: """ We have a full length slice. """ return ( - isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None + isinstance(obj, slice) + and obj.start == 0 + and obj.stop == line + and obj.step is None ) @@ -468,8 +466,11 @@ def convert_to_list_like( Convert list-like or scalar input to list-like. List, numpy and pandas array-like inputs are returned unmodified whereas others are converted to list. """ - if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)): - return values + if isinstance( + values, (list, np.ndarray, ABCIndexClass, ABCSeries, ABCExtensionArray) + ): + # np.ndarray resolving as Any gives a false positive + return values # type: ignore[return-value] elif isinstance(values, abc.Iterable) and not isinstance(values, str): return list(values) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 8a8b0d564ea49..5ad3e78a76866 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -1,9 +1,10 @@ """ Core eval alignment algorithms. 
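# Behaviour sketch for the renamed flatten helper above (illustrative input):
# nested sequences are flattened lazily, while strings are yielded as-is.
from pandas.core.common import flatten

list(flatten([1, [2, (3, "ab")]]))   # -> [1, 2, 3, "ab"]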
""" +from __future__ import annotations from functools import partial, wraps -from typing import Dict, Optional, Sequence, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, Optional, Sequence, Tuple, Type, Union import warnings import numpy as np @@ -17,13 +18,16 @@ import pandas.core.common as com from pandas.core.computation.common import result_type_many +if TYPE_CHECKING: + from pandas.core.indexes.api import Index + def _align_core_single_unary_op( term, -) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, Index]]]: typ: Union[partial, Type[FrameOrSeries]] - axes: Optional[Dict[str, int]] = None + axes: Optional[Dict[str, Index]] = None if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) @@ -36,8 +40,8 @@ def _align_core_single_unary_op( def _zip_axes_from_type( - typ: Type[FrameOrSeries], new_axes: Sequence[int] -) -> Dict[str, int]: + typ: Type[FrameOrSeries], new_axes: Sequence[Index] +) -> Dict[str, Index]: return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 86e125b6b909b..a1bebc92046ae 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -8,6 +8,8 @@ import tokenize from typing import Iterator, Tuple +from pandas._typing import Label + # A token value Python's tokenizer probably will never use. BACKTICK_QUOTED_STRING = 100 @@ -91,7 +93,7 @@ def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: return toknum, tokval -def clean_column_name(name: str) -> str: +def clean_column_name(name: "Label") -> "Label": """ Function to emulate the cleaning of a backtick quoted name. @@ -102,12 +104,12 @@ def clean_column_name(name: str) -> str: Parameters ---------- - name : str + name : hashable Name to be cleaned. Returns ------- - name : str + name : hashable Returns the name after tokenizing and cleaning. Notes diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 6ec637a8b4845..0498d4d171c00 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -430,6 +430,10 @@ def visit_Subscript(self, node, **kwargs): except AttributeError: pass + if isinstance(slobj, Term): + # In py39 np.ndarray lookups with Term containing int raise + slobj = slobj.value + try: return self.const_type(value[slobj], self.env) except TypeError as err: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 7901e150a7ff4..f9ebe3f1e185e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -351,7 +351,7 @@ def array( return result -def extract_array(obj: AnyArrayLike, extract_numpy: bool = False) -> ArrayLike: +def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayLike]: """ Extract the ndarray or ExtensionArray from a Series or Index. 
@@ -399,9 +399,7 @@ def extract_array(obj: AnyArrayLike, extract_numpy: bool = False) -> ArrayLike: if extract_numpy and isinstance(obj, ABCPandasArray): obj = obj.to_numpy() - # error: Incompatible return value type (got "Index", expected "ExtensionArray") - # error: Incompatible return value type (got "Series", expected "ExtensionArray") - return obj # type: ignore[return-value] + return obj def sanitize_array( diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 8630867c64f88..c2be81cd46b3b 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -99,9 +99,8 @@ def __eq__(self, other: Any) -> bool: By default, 'other' is considered equal if either * it's a string matching 'self.name'. - * it's an instance of this type and all of the - the attributes in ``self._metadata`` are equal between - `self` and `other`. + * it's an instance of this type and all of the attributes + in ``self._metadata`` are equal between `self` and `other`. Parameters ---------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9758eae60c262..0f0e82f4ad4e2 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -385,13 +385,17 @@ def maybe_cast_to_extension_array( ExtensionArray or obj """ from pandas.core.arrays.string_ import StringArray + from pandas.core.arrays.string_arrow import ArrowStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg - # Everything can be be converted to StringArrays, but we may not want to convert - if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string": + # Everything can be converted to StringArrays, but we may not want to convert + if ( + issubclass(cls, (StringArray, ArrowStringArray)) + and lib.infer_dtype(obj) != "string" + ): return obj try: @@ -1196,7 +1200,7 @@ def soft_convert_objects( elif conversion_count > 1 and coerce: raise ValueError( "Only one of 'datetime', 'numeric' or " - "'timedelta' can be True when when coerce=True." + "'timedelta' can be True when coerce=True." ) if not is_object_dtype(values.dtype): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 14184f044ae95..b4f6d587c6642 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1727,7 +1727,7 @@ def _validate_date_like_dtype(dtype) -> None: ------ TypeError : The dtype could not be casted to a date-like dtype. ValueError : The dtype is an illegal date-like dtype (e.g. the - the frequency provided is too specific) + frequency provided is too specific) """ try: typ = np.datetime_data(dtype)[0] diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a38d9cbad0d64..a9b0498081511 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -21,11 +21,11 @@ from pandas.core.construction import array -def _get_dtype_kinds(l) -> Set[str]: +def _get_dtype_kinds(arrays) -> Set[str]: """ Parameters ---------- - l : list of arrays + arrays : list of arrays Returns ------- @@ -33,7 +33,7 @@ def _get_dtype_kinds(l) -> Set[str]: A set of kinds that exist in this list of arrays. 
""" typs: Set[str] = set() - for arr in l: + for arr in arrays: # Note: we use dtype.kind checks because they are much more performant # than is_foo_dtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 01b34187997cb..07280702cf06f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -47,7 +47,7 @@ class PandasExtensionDtype(ExtensionDtype): type: Any kind: Any # The Any type annotations above are here only because mypy seems to have a - # problem dealing with with multiple inheritance from PandasExtensionDtype + # problem dealing with multiple inheritance from PandasExtensionDtype # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 7d2549713c6bc..0e5867809fe52 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -1,4 +1,11 @@ """ define generic base classes for pandas objects """ +from __future__ import annotations + +from typing import TYPE_CHECKING, Type, cast + +if TYPE_CHECKING: + from pandas import DataFrame, Series + from pandas.core.generic import NDFrame # define abstract base classes to enable isinstance type checking on our @@ -16,7 +23,6 @@ def _check(cls, inst) -> bool: return meta(name, tuple(), dct) -ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) @@ -53,9 +59,17 @@ def _check(cls, inst) -> bool: }, ) -ABCNDFrame = create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")) -ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) -ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) +ABCNDFrame = cast( + "Type[NDFrame]", + create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")), +) +ABCSeries = cast( + "Type[Series]", + create_pandas_abc_type("ABCSeries", "_typ", ("series",)), +) +ABCDataFrame = cast( + "Type[DataFrame]", create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) +) ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")) ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bae06339a1e60..c9030a0b2423a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -118,7 +118,7 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas.core import algorithms, common as com, nanops, ops +from pandas.core import algorithms, common as com, generic, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.aggregation import ( aggregate, @@ -159,7 +159,7 @@ from pandas.io.common import get_handle from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import DataFrameInfo +from pandas.io.formats.info import BaseInfo, DataFrameInfo import pandas.plotting if TYPE_CHECKING: @@ -205,12 +205,14 @@ The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. 
Parameters ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -221,6 +223,11 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + + .. versionadded:: 1.2.0 + on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -341,6 +348,44 @@ ... ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 NaN + +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 """ @@ -434,6 +479,7 @@ class DataFrame(NDFrame, OpsMixin): _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set _typ = "dataframe" + _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) @property def _constructor(self) -> Type[DataFrame]: @@ -726,7 +772,7 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: d.to_string(buf=buf) value = buf.getvalue() - repr_width = max(len(l) for l in value.split("\n")) + repr_width = max(len(line) for line in value.split("\n")) return repr_width < width @@ -2066,6 +2112,7 @@ def _from_arrays( ) return cls(mgr) + @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( self, @@ -2118,7 +2165,7 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - version : {114, 117, 118, 119, None}, default 114 + version : {{114, 117, 118, 119, None}}, default 114 Version to use in the output dta file. Set to None to let pandas decide between 118 or 119 formats depending on the number of columns in the frame. Version 114 can be read by Stata 10 and @@ -2147,23 +2194,17 @@ def to_stata( compression : str or dict, default 'infer' For on-the-fly compression of the output dta. If string, specifies compression mode. If dict, value at key 'method' specifies - compression mode. Compression mode must be one of {'infer', 'gzip', - 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and + compression mode. Compression mode must be one of {{'infer', 'gzip', + 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and `fname` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - compression). If dict and compression mode is one of {'zip', - 'gzip', 'bz2'}, or inferred as one of the above, other entries + compression). 
If dict and compression mode is one of {{'zip', + 'gzip', 'bz2'}}, or inferred as one of the above, other entries passed as additional compression options. .. versionadded:: 1.1.0 - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. + {storage_options} .. versionadded:: 1.2.0 @@ -2186,9 +2227,9 @@ def to_stata( Examples -------- - >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', + >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', ... 'parrot'], - ... 'speed': [350, 18, 361, 15]}) + ... 'speed': [350, 18, 361, 15]}}) >>> df.to_stata('animals.dta') # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): @@ -2255,6 +2296,7 @@ def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None: @doc( Series.to_markdown, klass=_shared_doc_kwargs["klass"], + storage_options=_shared_docs["storage_options"], examples="""Examples -------- >>> df = pd.DataFrame( @@ -2307,6 +2349,7 @@ def to_markdown( handles.handle.writelines(result) return None + @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, @@ -2340,12 +2383,12 @@ def to_parquet( Previously this was "fname" - engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. - compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' + compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. @@ -2365,13 +2408,7 @@ def to_parquet( .. versionadded:: 0.24.0 - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. + {storage_options} .. versionadded:: 1.2.0 @@ -2398,7 +2435,7 @@ def to_parquet( Examples -------- - >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) >>> df.to_parquet('df.parquet.gzip', ... compression='gzip') # doctest: +SKIP >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP @@ -2532,16 +2569,28 @@ def to_html( @Substitution( klass="DataFrame", type_sub=" and columns", - max_cols_sub=( - """max_cols : int, optional + max_cols_sub=dedent( + """\ + max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. 
- """ + ``pandas.options.display.max_info_columns`` is used.""" ), - examples_sub=( - """ + show_counts_sub=dedent( + """\ + show_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the DataFrame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + null_counts : bool, optional + .. deprecated:: 1.2.0 + Use show_counts instead.""" + ), + examples_sub=dedent( + """\ >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] @@ -2624,31 +2673,42 @@ def to_html( dtypes: object(3) memory usage: 165.9 MB""" ), - see_also_sub=( - """ + see_also_sub=dedent( + """\ DataFrame.describe: Generate descriptive statistics of DataFrame columns. DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), + version_added_sub="", ) - @doc(DataFrameInfo.to_buffer) + @doc(BaseInfo.render) def info( self, verbose: Optional[bool] = None, buf: Optional[IO[str]] = None, max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, + show_counts: Optional[bool] = None, null_counts: Optional[bool] = None, ) -> None: + if null_counts is not None: + if show_counts is not None: + raise ValueError("null_counts used with show_counts. Use show_counts.") + warnings.warn( + "null_counts is deprecated. Use show_counts instead", + FutureWarning, + stacklevel=2, + ) + show_counts = null_counts info = DataFrameInfo( data=self, memory_usage=memory_usage, ) - info.to_buffer( + info.render( buf=buf, max_cols=max_cols, verbose=verbose, - show_counts=null_counts, + show_counts=show_counts, ) def memory_usage(self, index=True, deep=False) -> Series: @@ -2733,7 +2793,7 @@ def memory_usage(self, index=True, deep=False) -> Series: many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5216 + 5244 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], @@ -2934,7 +2994,7 @@ def __getitem__(self, key): if is_hashable(key): # shortcut if the key is in columns if self.columns.is_unique and key in self.columns: - if self.columns.nlevels > 1: + if isinstance(self.columns, MultiIndex): return self._getitem_multilevel(key) return self._get_item_cache(key) @@ -4570,7 +4630,7 @@ def set_index( append : bool, default False Whether to append columns to existing index. inplace : bool, default False - Modify the DataFrame in place (do not create a new object). + If True, modifies the DataFrame in place (do not create a new object). verify_integrity : bool, default False Check the new index for duplicates. Otherwise defer the check until necessary. 
Setting to False will improve the performance of this @@ -5971,13 +6031,16 @@ def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): # maybe_align_as_frame ensures we do not have an ndarray here assert not isinstance(right, np.ndarray) - arrays = [array_op(l, r) for l, r in zip(self._iter_column_arrays(), right)] + arrays = [ + array_op(_left, _right) + for _left, _right in zip(self._iter_column_arrays(), right) + ] elif isinstance(right, Series): assert right.index.equals(self.index) # Handle other cases later right = right._values - arrays = [array_op(l, right) for l in self._iter_column_arrays()] + arrays = [array_op(left, right) for left in self._iter_column_arrays()] else: # Remaining cases have less-obvious dispatch rules @@ -6469,7 +6532,7 @@ def update( 1 b e 2 c f - For Series, it's name attribute must be set. + For Series, its name attribute must be set. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) @@ -8065,6 +8128,15 @@ def _join_compat( other = DataFrame({other.name: other}) if isinstance(other, DataFrame): + if how == "cross": + return merge( + self, + other, + how=how, + on=on, + suffixes=(lsuffix, rsuffix), + sort=sort, + ) return merge( self, other, @@ -8765,7 +8837,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if numeric_only is not None: + if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object @@ -8790,36 +8862,14 @@ def _get_data() -> DataFrame: # GH#35865 careful to cast explicitly to object nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)]) out[:] = np.array(nvs, dtype=object) + if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) return out assert numeric_only is None - if not self._is_homogeneous_type or self._mgr.any_extension_types: - # try to avoid self.values call - - if filter_type is None and axis == 0: - # operate column-wise - - # numeric_only must be None here, as other cases caught above - - # this can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series - - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. 
- # So let's just do what we can - from pandas.core.apply import frame_apply - - opa = frame_apply( - self, func=func, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0].rename(None) - return result - data = self values = data.values diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3392b64890cb7..c7448cf8f8e40 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -70,6 +70,7 @@ is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, + is_dtype_equal, is_extension_array_dtype, is_float, is_list_like, @@ -86,7 +87,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas.core import indexing, missing, nanops +from pandas.core import arraylike, indexing, missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com @@ -511,7 +512,7 @@ def _get_axis_resolvers(self, axis: str) -> Dict[str, Union[Series, MultiIndex]] return d @final - def _get_index_resolvers(self) -> Dict[str, Union[Series, MultiIndex]]: + def _get_index_resolvers(self) -> Dict[Label, Union[Series, MultiIndex]]: from pandas.core.computation.parsing import clean_column_name d: Dict[str, Union[Series, MultiIndex]] = {} @@ -521,7 +522,7 @@ def _get_index_resolvers(self) -> Dict[str, Union[Series, MultiIndex]]: return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} @final - def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: + def _get_cleaned_column_resolvers(self) -> Dict[Label, Series]: """ Return the special character free column resolvers of a dataframe. @@ -532,7 +533,6 @@ def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: from pandas.core.computation.parsing import clean_column_name if isinstance(self, ABCSeries): - self = cast("Series", self) return {clean_column_name(self.name): self} return { @@ -1114,7 +1114,7 @@ def rename_axis(self, mapper=lib.no_default, **kwargs): In this case, the parameter ``copy`` is ignored. The second calling convention will modify the names of the - the corresponding index if mapper is a list or a scalar. + corresponding index if mapper is a list or a scalar. However, if mapper is dict-like or a function, it will use the deprecated behavior of modifying the axis *labels*. @@ -1927,6 +1927,11 @@ def __array_wrap__( self, method="__array_wrap__" ) + def __array_ufunc__( + self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any + ): + return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs) + # ideally we would define this to avoid the getattr checks, but # is slower # @property @@ -2024,7 +2029,7 @@ def _repr_data_resource_(self): # I/O Methods @final - @doc(klass="object") + @doc(klass="object", storage_options=_shared_docs["storage_options"]) def to_excel( self, excel_writer, @@ -2101,10 +2106,7 @@ def to_excel( freeze_panes : tuple of int (length 2), optional Specifies the one-based bottommost row and rightmost column that is to be frozen. - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". + {storage_options} .. 
versionadded:: 1.2.0 @@ -2185,6 +2187,7 @@ def to_excel( ) @final + @doc(storage_options=_shared_docs["storage_options"]) def to_json( self, path_or_buf: Optional[FilePathOrBuffer] = None, @@ -2217,27 +2220,27 @@ def to_json( * Series: - default is 'index' - - allowed values are: {'split', 'records', 'index', 'table'}. + - allowed values are: {{'split', 'records', 'index', 'table'}}. * DataFrame: - default is 'columns' - - allowed values are: {'split', 'records', 'index', 'columns', - 'values', 'table'}. + - allowed values are: {{'split', 'records', 'index', 'columns', + 'values', 'table'}}. * The format of the JSON string: - - 'split' : dict like {'index' -> [index], 'columns' -> [columns], - 'data' -> [values]} - - 'records' : list like [{column -> value}, ... , {column -> value}] - - 'index' : dict like {index -> {column -> value}} - - 'columns' : dict like {column -> {index -> value}} + - 'split' : dict like {{'index' -> [index], 'columns' -> [columns], + 'data' -> [values]}} + - 'records' : list like [{{column -> value}}, ... , {{column -> value}}] + - 'index' : dict like {{index -> {{column -> value}}}} + - 'columns' : dict like {{column -> {{index -> value}}}} - 'values' : just the values array - - 'table' : dict like {'schema': {schema}, 'data': {data}} + - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}} Describing the data, where data component is like ``orient='records'``. - date_format : {None, 'epoch', 'iso'} + date_format : {{None, 'epoch', 'iso'}} Type of date conversion. 'epoch' = epoch milliseconds, 'iso' = ISO8601. The default depends on the `orient`. For ``orient='table'``, the default is 'iso'. For all other orients, @@ -2260,7 +2263,7 @@ def to_json( throw ValueError if incorrect 'orient' since others are not list like. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}} A string representing the compression to use in the output file, only used when the first argument is a filename. By default, the @@ -2277,13 +2280,7 @@ def to_json( .. versionadded:: 1.0.0 - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. + {storage_options} .. versionadded:: 1.2.0 @@ -2320,7 +2317,7 @@ def to_json( >>> result = df.to_json(orient="split") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - { + {{ "columns": [ "col 1", "col 2" @@ -2339,7 +2336,7 @@ def to_json( "d" ] ] - } + }} Encoding/decoding a Dataframe using ``'records'`` formatted JSON. Note that index labels are not preserved with this encoding. 
@@ -2348,14 +2345,14 @@ def to_json( >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP [ - { + {{ "col 1": "a", "col 2": "b" - }, - { + }}, + {{ "col 1": "c", "col 2": "d" - } + }} ] Encoding/decoding a Dataframe using ``'index'`` formatted JSON: @@ -2363,32 +2360,32 @@ def to_json( >>> result = df.to_json(orient="index") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - { - "row 1": { + {{ + "row 1": {{ "col 1": "a", "col 2": "b" - }, - "row 2": { + }}, + "row 2": {{ "col 1": "c", "col 2": "d" - } - } + }} + }} Encoding/decoding a Dataframe using ``'columns'`` formatted JSON: >>> result = df.to_json(orient="columns") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - { - "col 1": { + {{ + "col 1": {{ "row 1": "a", "row 2": "c" - }, - "col 2": { + }}, + "col 2": {{ "row 1": "b", "row 2": "d" - } - } + }} + }} Encoding/decoding a Dataframe using ``'values'`` formatted JSON: @@ -2411,40 +2408,40 @@ def to_json( >>> result = df.to_json(orient="table") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - { - "schema": { + {{ + "schema": {{ "fields": [ - { + {{ "name": "index", "type": "string" - }, - { + }}, + {{ "name": "col 1", "type": "string" - }, - { + }}, + {{ "name": "col 2", "type": "string" - } + }} ], "primaryKey": [ "index" ], "pandas_version": "0.20.0" - }, + }}, "data": [ - { + {{ "index": "row 1", "col 1": "a", "col 2": "b" - }, - { + }}, + {{ "index": "row 2", "col 1": "c", "col 2": "d" - } + }} ] - } + }} """ from pandas.io import json @@ -2725,7 +2722,7 @@ def to_sql( >>> engine.execute("SELECT * FROM users").fetchall() [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] - An `sqlalchemy.engine.Connection` can also be passed to to `con`: + An `sqlalchemy.engine.Connection` can also be passed to `con`: >>> with engine.begin() as connection: ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']}) @@ -2783,6 +2780,7 @@ def to_sql( ) @final + @doc(storage_options=_shared_docs["storage_options"]) def to_pickle( self, path, @@ -2797,7 +2795,7 @@ def to_pickle( ---------- path : str File path where the pickled object will be stored. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, \ default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. @@ -2809,13 +2807,7 @@ def to_pickle( .. [1] https://docs.python.org/3/library/pickle.html. - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. + {storage_options} .. versionadded:: 1.2.0 @@ -2828,7 +2820,7 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) + >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) >>> original_df foo bar 0 0 5 @@ -3193,6 +3185,7 @@ def to_latex( ) @final + @doc(storage_options=_shared_docs["storage_options"]) def to_csv( self, path_or_buf: Optional[FilePathOrBuffer] = None, @@ -3272,11 +3265,11 @@ def to_csv( compression : str or dict, default 'infer' If str, represents compression mode. 
If dict, value at 'method' is the compression mode. Compression mode may be any of the following - possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + possible values: {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and `path_or_buf` is path-like, then detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given - and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as + and mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above, other entries passed as additional compression options. @@ -3333,13 +3326,7 @@ def to_csv( .. versionadded:: 1.1.0 - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. + {storage_options} .. versionadded:: 1.2.0 @@ -3356,9 +3343,9 @@ def to_csv( Examples -------- - >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], + >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], - ... 'weapon': ['sai', 'bo staff']}) + ... 'weapon': ['sai', 'bo staff']}}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' @@ -3722,6 +3709,8 @@ class animal locomotion else: index = self.index + self._consolidate_inplace() + if isinstance(index, MultiIndex): try: loc, new_index = index._get_loc_level( @@ -3759,7 +3748,7 @@ class animal locomotion dtype=new_values.dtype, ) elif is_scalar(loc): - result = self.iloc[:, [loc]] + result = self.iloc[:, slice(loc, loc + 1)] elif axis == 1: result = self.iloc[:, loc] else: @@ -3785,7 +3774,7 @@ def _get_item_cache(self, item): loc = self.columns.get_loc(item) values = self._mgr.iget(loc) - res = self._box_col_values(values, loc) + res = self._box_col_values(values, loc).__finalize__(self) cache[item] = res res._set_as_cached(item, self) @@ -5501,7 +5490,7 @@ def __setattr__(self, name: str, value) -> None: def _dir_additions(self) -> Set[str]: """ add the string-like attributes from the info_axis. - If info_axis is a MultiIndex, it's first level values are used. + If info_axis is a MultiIndex, its first level values are used. """ additions = super()._dir_additions() if self._info_axis._can_hold_strings: @@ -6340,6 +6329,8 @@ def fillna( inplace = validate_bool_kwarg(inplace, "inplace") value, method = validate_fillna_kwargs(value, method) + self._consolidate_inplace() + # set the default here, so functions examining the signaure # can detect if something was set (e.g. 
in groupby) (GH9221) if axis is None: @@ -6762,6 +6753,8 @@ def replace( if not is_bool(regex) and to_replace is not None: raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") + self._consolidate_inplace() + if value is None: # passing a single value that is scalar like # when value is None (GH5319), for compat @@ -9020,7 +9013,6 @@ def _where( cond = -cond if inplace else cond # try to align with other - try_quick = True if isinstance(other, NDFrame): # align with me @@ -9059,12 +9051,11 @@ def _where( # match True cond to other elif len(cond[icond]) == len(other): - # try to not change dtype at first (if try_quick) - if try_quick: - new_other = np.asarray(self) - new_other = new_other.copy() - new_other[icond] = other - other = new_other + # try to not change dtype at first + new_other = np.asarray(self) + new_other = new_other.copy() + new_other[icond] = other + other = new_other else: raise ValueError( @@ -11286,7 +11277,11 @@ def _inplace_method(self, other, op): """ result = op(self, other) - if self.ndim == 1 and result._indexed_same(self) and result.dtype == self.dtype: + if ( + self.ndim == 1 + and result._indexed_same(self) + and is_dtype_equal(result.dtype, self.dtype) + ): # GH#36498 this inplace op can _actually_ be inplace. self._values[:] = result._values return self @@ -11850,7 +11845,7 @@ def _doc_parms(cls): _any_desc = """\ Return whether any element is True, potentially over an axis. -Returns False unless there at least one element within a series or +Returns False unless there is at least one element within a series or along a Dataframe axis that is True or equivalent (e.g. non-zero or non-empty).""" diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index f205226c03a53..7dc0db35bf8fe 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -192,6 +192,7 @@ def _gotitem(self, key, ndim, subset=None): "describe", "dtypes", "expanding", + "ewm", "filter", "get_group", "groups", diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3395b9d36fd0c..244c47cd1f1ea 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -262,7 +262,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return self._python_agg_general(func, *args, **kwargs) except (ValueError, KeyError): # TODO: KeyError is raised in _python_agg_general, - # see see test_groupby.test_basic + # see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) @@ -1390,8 +1390,7 @@ def _transform_fast(self, result: DataFrame) -> DataFrame: """ obj = self._obj_with_exclusions - # for each col, reshape to to size of original frame - # by take operation + # for each col, reshape to size of original frame by take operation ids, _, ngroup = self.grouper.group_info result = result.reindex(self.grouper.result_index, copy=False) output = [ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ec96a0d502d3f..ae3612c99d5cd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1600,10 +1600,8 @@ def sem(self, ddof: int = 1): cols = result.columns.get_indexer_for( result.columns.difference(self.exclusions).unique() ) - # TODO(GH-22046) - setting with iloc broken if labels are not unique - # .values to remove labels - result.iloc[:, cols] = ( - result.iloc[:, cols].values / np.sqrt(self.count().iloc[:, cols]).values + result.iloc[:, cols] = 
result.iloc[:, cols] / np.sqrt( + self.count().iloc[:, cols] ) return result @@ -1671,10 +1669,10 @@ def first(self, numeric_only: bool = False, min_count: int = -1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): """Helper function for first item that isn't NA.""" - x = x.array[notna(x.array)] - if len(x) == 0: + arr = x.array[notna(x.array)] + if not len(arr): return np.nan - return x[0] + return arr[0] if isinstance(obj, DataFrame): return obj.apply(first, axis=axis) @@ -1695,10 +1693,10 @@ def last(self, numeric_only: bool = False, min_count: int = -1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): """Helper function for last item that isn't NA.""" - x = x.array[notna(x.array)] - if len(x) == 0: + arr = x.array[notna(x.array)] + if not len(arr): return np.nan - return x[-1] + return arr[-1] if isinstance(obj, DataFrame): return obj.apply(last, axis=axis) @@ -1859,6 +1857,16 @@ def expanding(self, *args, **kwargs): return ExpandingGroupby(self, *args, **kwargs) + @Substitution(name="groupby") + @Appender(_common_see_also) + def ewm(self, *args, **kwargs): + """ + Return an ewm grouper, providing ewm functionality per group. + """ + from pandas.core.window import ExponentialMovingWindowGroupby + + return ExponentialMovingWindowGroupby(self, *args, **kwargs) + def _fill(self, direction, limit=None): """ Shared function for `pad` and `backfill` to call Cython method. diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fc80852f00c95..50c4cc53a12bb 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -148,7 +148,7 @@ def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": ------- Generator yielding subsetted objects - __finalize__ has not been called for the the subsetted objects returned. + __finalize__ has not been called for the subsetted objects returned. """ comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -603,7 +603,7 @@ def _aggregate( ): if agg_func is libgroupby.group_nth: # different signature from the others - agg_func(result, counts, values, comp_ids, rank=1) + agg_func(result, counts, values, comp_ids, min_count, rank=1) else: agg_func(result, counts, values, comp_ids, min_count) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index e48a42599a2a0..b6713bc760c5e 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -105,7 +105,7 @@ def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: return True if arr_value.ndim == 1: if not isinstance(indexer, tuple): - indexer = tuple([indexer]) + indexer = (indexer,) return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) return False diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cb5641a74e60b..c49f3f9457161 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -415,6 +415,11 @@ def asi8(self): ndarray An ndarray with int64 dtype. """ + warnings.warn( + "Index.asi8 is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=2, + ) return None @classmethod @@ -1481,7 +1486,7 @@ def _get_level_number(self, level) -> int: def sortlevel(self, level=None, ascending=True, sort_remaining=None): """ - For internal compatibility with with the Index API. + For internal compatibility with the Index API. Sort the Index. 
This is for compat with MultiIndex @@ -1570,6 +1575,33 @@ def droplevel(self, level=0): Returns ------- Index or MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays( + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + >>> mi + MultiIndex([(1, 3, 5), + (2, 4, 6)], + names=['x', 'y', 'z']) + + >>> mi.droplevel() + MultiIndex([(3, 5), + (4, 6)], + names=['y', 'z']) + + >>> mi.droplevel(2) + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.droplevel('z') + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.droplevel(['x', 'y']) + Int64Index([5, 6], dtype='int64', name='z') """ if not isinstance(level, (tuple, list)): level = [level] @@ -2485,12 +2517,10 @@ def _get_unique_index(self, dropna: bool = False): else: values = self._values - if dropna: - try: - if self.hasnans: - values = values[~isna(values)] - except NotImplementedError: - pass + if dropna and not isinstance(self, ABCMultiIndex): + # isna not defined for MultiIndex + if self.hasnans: + values = values[~isna(values)] return self._shallow_copy(values) @@ -2734,7 +2764,7 @@ def _union(self, other, sort): stacklevel=3, ) - return self._shallow_copy(result) + return result @final def _wrap_setop_result(self, other, result): @@ -2742,6 +2772,8 @@ def _wrap_setop_result(self, other, result): result, np.ndarray ): result = type(self._data)._simple_new(result, dtype=self.dtype) + elif is_categorical_dtype(self.dtype) and isinstance(result, np.ndarray): + result = Categorical(result, dtype=self.dtype) name = get_op_result_name(self, other) if isinstance(result, Index): @@ -2798,6 +2830,13 @@ def intersection(self, other, sort=False): other = other.astype("O") return this.intersection(other, sort=sort) + result = self._intersection(other, sort=sort) + return self._wrap_setop_result(other, result) + + def _intersection(self, other, sort=False): + """ + intersection specialized to the case with matching dtypes. + """ # TODO(EA): setops-refactor, clean all this up lvals = self._values rvals = other._values @@ -2808,7 +2847,7 @@ def intersection(self, other, sort=False): except TypeError: pass else: - return self._wrap_setop_result(other, result) + return result try: indexer = Index(rvals).get_indexer(lvals) @@ -2824,7 +2863,7 @@ def intersection(self, other, sort=False): if sort is None: result = algos.safe_sort(result) - return self._wrap_setop_result(other, result) + return result def difference(self, other, sort=None): """ @@ -3163,7 +3202,7 @@ def _get_fill_indexer( indexer = engine_method(target_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) - if tolerance is not None: + if tolerance is not None and len(self): indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer @@ -3208,12 +3247,21 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: values that can be subtracted from each other (e.g., not strings or tuples). 
""" + if not len(self): + return self._get_fill_indexer(target, "pad") + left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) target_values = target._values - left_distances = np.abs(self._values[left_indexer] - target_values) - right_distances = np.abs(self._values[right_indexer] - target_values) + # error: Unsupported left operand type for - ("ExtensionArray") + left_distances = np.abs( + self._values[left_indexer] - target_values # type: ignore[operator] + ) + # error: Unsupported left operand type for - ("ExtensionArray") + right_distances = np.abs( + self._values[right_indexer] - target_values # type: ignore[operator] + ) op = operator.lt if self.is_monotonic_increasing else operator.le indexer = np.where( @@ -3232,7 +3280,8 @@ def _filter_indexer_tolerance( indexer: np.ndarray, tolerance, ) -> np.ndarray: - distance = abs(self._values[indexer] - target) + # error: Unsupported left operand type for - ("ExtensionArray") + distance = abs(self._values[indexer] - target) # type: ignore[operator] indexer = np.where(distance <= tolerance, indexer, -1) return indexer @@ -3384,11 +3433,11 @@ def _convert_list_indexer(self, keyarr): return None @final - def _invalid_indexer(self, form: str_t, key): + def _invalid_indexer(self, form: str_t, key) -> TypeError: """ Consistent invalid indexer message. """ - raise TypeError( + return TypeError( f"cannot do {form} indexing on {type(self).__name__} with these " f"indexers [{key}] of type {type(key).__name__}" ) @@ -3436,6 +3485,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: + values: Union[range, ExtensionArray, np.ndarray] if isinstance(self, ABCRangeIndex): values = range(0) else: @@ -3508,7 +3558,7 @@ def _reindex_non_unique(self, target): cur_labels = self.take(indexer[check]).values cur_indexer = ensure_int64(length[check]) - new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels = np.empty((len(indexer),), dtype=object) new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels @@ -3961,7 +4011,11 @@ def _join_monotonic(self, other, how="left", return_indexers=False): else: return join_index - def _wrap_joined_index(self, joined, other): + def _wrap_joined_index( + self: _IndexT, joined: np.ndarray, other: _IndexT + ) -> _IndexT: + assert other.dtype == self.dtype + if isinstance(self, ABCMultiIndex): name = self.names if self.names == other.names else None else: @@ -4163,7 +4217,7 @@ def _is_memory_usage_qualified(self) -> bool: """ return self.is_object() - def is_type_compatible(self, kind) -> bool: + def is_type_compatible(self, kind: str_t) -> bool: """ Whether the index type is compatible with the provided type. """ @@ -4319,11 +4373,9 @@ def putmask(self, mask, value): numpy.ndarray.putmask : Changes elements of an array based on conditional and input values. 
""" - values = self.values.copy() + values = self._values.copy() try: converted = self._validate_fill_value(value) - np.putmask(values, mask, converted) - return self._shallow_copy(values) except (ValueError, TypeError) as err: if is_object_dtype(self): raise err @@ -4331,6 +4383,9 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) + np.putmask(values, mask, converted) + return self._shallow_copy(values) + def equals(self, other: object) -> bool: """ Determine if two Index object are equal. @@ -4396,7 +4451,7 @@ def equals(self, other: object) -> bool: if not isinstance(other, Index): return False - # If other is a subclass of self and defines it's own equals method, we + # If other is a subclass of self and defines its own equals method, we # dispatch to the subclass method. For instance for a MultiIndex, # a d-level MultiIndex can equal d-tuple Index. # Note: All EA-backed Index subclasses override equals @@ -4528,8 +4583,9 @@ def asof_locs(self, where: "Index", mask) -> np.ndarray: result = np.arange(len(self))[mask].take(locs) - first = mask.argmax() - result[(locs == 0) & (where._values < self._values[first])] = -1 + # TODO: overload return type of ExtensionArray.__getitem__ + first_value = cast(Any, self._values[mask.argmax()]) + result[(locs == 0) & (where._values < first_value)] = -1 return result @@ -4717,12 +4773,13 @@ def argsort(self, *args, **kwargs) -> np.ndarray: >>> idx[order] Index(['a', 'b', 'c', 'd'], dtype='object') """ - result = self.asi8 - - if result is None: - result = np.array(self) + if needs_i8_conversion(self.dtype): + # TODO: these do not match the underlying EA argsort methods GH#37863 + return self.asi8.argsort(*args, **kwargs) - return result.argsort(*args, **kwargs) + # This works for either ndarray or EA, is overriden + # by RangeIndex, MultIIndex + return self._data.argsort(*args, **kwargs) @final def get_value(self, series: "Series", key): @@ -4839,6 +4896,14 @@ def set_value(self, arr, key, value): @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ensure_index(target) + + if target.is_boolean() and self.is_numeric(): + # Treat boolean labels passed to a numeric index as not found. Without + # this fix False and True would be treated as 0 and 1 respectively. 
+ # (GH #16877) + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) @@ -5088,7 +5153,7 @@ def isin(self, values, level=None): """ if level is not None: self._validate_index_level(level) - return algos.isin(self, values) + return algos.isin(self._values, values) def _get_string_slice(self, key: str_t): # this is for partial string indexing, @@ -5173,7 +5238,7 @@ def _validate_indexer(self, form: str_t, key, kind: str_t): elif is_integer(key): pass else: - self._invalid_indexer(form, key) + raise self._invalid_indexer(form, key) def _maybe_cast_slice_bound(self, label, side: str_t, kind): """ @@ -5202,7 +5267,7 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind): # datetimelike Indexes # reject them, if index does not contain label if (is_float(label) or is_integer(label)) and label not in self.values: - self._invalid_indexer("slice", label) + raise self._invalid_indexer("slice", label) return label @@ -5466,6 +5531,17 @@ def _cmp_method(self, other, op): """ Wrapper used to dispatch comparison operations. """ + if self.is_(other): + # fastpath + if op in {operator.eq, operator.le, operator.ge}: + arr = np.ones(len(self), dtype=bool) + if self._can_hold_na and not isinstance(self, ABCMultiIndex): + # TODO: should set MultiIndex._can_hold_na = False? + arr[self.isna()] = False + return arr + elif op in {operator.ne, operator.lt, operator.gt}: + return np.zeros(len(self), dtype=bool) + if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): if len(self) != len(other): raise ValueError("Lengths must match to compare") diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 06df8f85cded7..e2507aeaeb652 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,4 +1,4 @@ -from typing import Any, List +from typing import Any, List, Optional import warnings import numpy as np @@ -6,7 +6,6 @@ from pandas._config import get_option from pandas._libs import index as libindex -from pandas._libs.hashtable import duplicated_int64 from pandas._libs.lib import no_default from pandas._typing import ArrayLike, Label from pandas.util._decorators import Appender, cache_readonly, doc @@ -14,10 +13,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_categorical_dtype, - is_interval_dtype, - is_list_like, is_scalar, - pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna @@ -224,16 +220,25 @@ def _simple_new(cls, values: Categorical, name: Label = None): result._cache = {} result._reset_identity() - result._no_setting_name = False return result # -------------------------------------------------------------------- + # error: Argument 1 of "_shallow_copy" is incompatible with supertype + # "ExtensionIndex"; supertype defines the argument type as + # "Optional[ExtensionArray]" [override] @doc(Index._shallow_copy) - def _shallow_copy(self, values=None, name: Label = no_default): + def _shallow_copy( # type:ignore[override] + self, + values: Optional[Categorical] = None, + name: Label = no_default, + ): name = self.name if name is no_default else name if values is not None: + # In tests we only get here with Categorical objects that + # have matching .ordered, and values.categories a subset of + # our own. However we do _not_ have a dtype match in general. 
values = Categorical(values, dtype=self.dtype) return super()._shallow_copy(values=values, name=name) @@ -245,6 +250,10 @@ def _is_dtype_compat(self, other) -> Categorical: provide a comparison between the dtype of self and other (coercing if needed) + Parameters + ---------- + other : Index + Returns ------- Categorical @@ -261,8 +270,6 @@ def _is_dtype_compat(self, other) -> Categorical: ) else: values = other - if not is_list_like(values): - values = [values] cat = Categorical(other, dtype=self.dtype) other = CategoricalIndex(cat) @@ -356,11 +363,6 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. @@ -371,20 +373,8 @@ def __contains__(self, key: Any) -> bool: @doc(Index.astype) def astype(self, dtype, copy=True): - if dtype is not None: - dtype = pandas_dtype(dtype) - - if is_interval_dtype(dtype): - from pandas import IntervalIndex - - return IntervalIndex(np.array(self)) - elif is_categorical_dtype(dtype): - # GH 18630 - dtype = self.dtype.update_dtype(dtype) - if dtype == self.dtype: - return self.copy() if copy else self - - return Index.astype(self, dtype=dtype, copy=copy) + res_data = self._data.astype(dtype, copy=copy) + return Index(res_data, name=self.name) @doc(Index.fillna) def fillna(self, value, downcast=None): @@ -409,27 +399,10 @@ def unique(self, level=None): # of result, not self. return type(self)._simple_new(result, name=self.name) - @doc(Index.duplicated) - def duplicated(self, keep="first"): - codes = self.codes.astype("i8") - return duplicated_int64(codes, keep) - def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.astype("object") - @doc(Index.where) - def where(self, cond, other=None): - # TODO: Investigate an alternative implementation with - # 1. copy the underlying Categorical - # 2. setitem with `cond` and `other` - # 3. Rebuild CategoricalIndex. - if other is None: - other = self._na_value - values = np.where(cond, self._values, other) - cat = Categorical(values, dtype=self.dtype) - return type(self)._simple_new(cat, name=self.name) - def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -491,7 +464,8 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): # in which case we are going to conform to the passed Categorical new_target = np.asarray(new_target) if is_categorical_dtype(target): - new_target = target._shallow_copy(new_target, name=self.name) + new_target = Categorical(new_target, dtype=target.dtype) + new_target = type(self)._simple_new(new_target, name=self.name) else: new_target = Index(new_target, name=self.name) @@ -514,7 +488,8 @@ def _reindex_non_unique(self, target): if not (cats == -1).any(): # .reindex returns normal Index. 
Revert to CategoricalIndex if # all targets are included in my categories - new_target = self._shallow_copy(new_target) + new_target = Categorical(new_target, dtype=self.dtype) + new_target = type(self)._simple_new(new_target, name=self.name) return new_target, indexer, new_indexer @@ -529,53 +504,38 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ibase.ensure_index(target) + self._check_indexing_method(method) + if self.is_unique and self.equals(target): return np.arange(len(self), dtype="intp") - if method == "pad" or method == "backfill": - raise NotImplementedError( - "method='pad' and method='backfill' not " - "implemented yet for CategoricalIndex" - ) - elif method == "nearest": - raise NotImplementedError( - "method='nearest' not implemented yet for CategoricalIndex" - ) - - # Note: we use engine.get_indexer_non_unique below because, even if - # `target` is unique, any non-category entries in it will be encoded - # as -1 by _get_codes_for_get_indexer, so `codes` may not be unique. - codes = self._get_codes_for_get_indexer(target._values) - indexer, _ = self._engine.get_indexer_non_unique(codes) - return ensure_platform_int(indexer) + return self._get_indexer_non_unique(target._values)[0] @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) + return self._get_indexer_non_unique(target._values) - codes = self._get_codes_for_get_indexer(target._values) - indexer, missing = self._engine.get_indexer_non_unique(codes) - return ensure_platform_int(indexer), missing - - def _get_codes_for_get_indexer(self, target: ArrayLike) -> np.ndarray: + def _get_indexer_non_unique(self, values: ArrayLike): """ - Extract integer codes we can use for comparison. - - Notes - ----- - If a value in target is not present, it gets coded as -1. + get_indexer_non_unique but after unrapping the target Index object. """ + # Note: we use engine.get_indexer_non_unique for get_indexer in addition + # to get_indexer_non_unique because, even if `target` is unique, any + # non-category entries in it will be encoded as -1 so `codes` may + # not be unique. - if isinstance(target, Categorical): + if isinstance(values, Categorical): # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of # dtype-matching. - cat = self._data._encode_with_my_categories(target) + cat = self._data._encode_with_my_categories(values) codes = cat._codes else: - codes = self.categories.get_indexer(target) + codes = self.categories.get_indexer(values) - return codes + indexer, missing = self._engine.get_indexer_non_unique(codes) + return ensure_platform_int(indexer), missing @doc(Index._convert_list_indexer) def _convert_list_indexer(self, keyarr): @@ -583,23 +543,11 @@ def _convert_list_indexer(self, keyarr): # the categories if self.categories._defer_to_indexing: + # See tests.indexing.interval.test_interval:test_loc_getitem_frame indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) - msg = "a list-indexer must only include values that are in the categories" - if self.hasnans: - msg += " or NA" - try: - codes = self._data._validate_setitem_value(keyarr) - except (ValueError, TypeError) as err: - if "Index data must be 1-dimensional" in str(err): - # e.g. 
test_setitem_ndarray_3d - raise - raise KeyError(msg) - if not self.hasnans and (codes == -1).any(): - raise KeyError(msg) - - return self.get_indexer(keyarr) + return self.get_indexer_for(keyarr) @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side: str, kind): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 40a6086f69f85..1b18f04ba603d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,7 +2,7 @@ Base and utility classes for tseries type pandas objects. """ from datetime import datetime -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, TypeVar, Union, cast +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, TypeVar, Union, cast import numpy as np @@ -10,7 +10,6 @@ from pandas._libs.tslibs import BaseOffset, Resolution, Tick from pandas._typing import Callable, Label from pandas.compat.numpy import function as nv -from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( @@ -23,12 +22,10 @@ is_scalar, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCIndex, ABCSeries +from pandas.core.dtypes.generic import ABCSeries -from pandas.core import algorithms from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -from pandas.core.base import IndexOpsMixin import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs @@ -56,16 +53,22 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True): # error: 'staticmethod' used with a non-method @staticmethod # type: ignore[misc] def wrapper(left, right): - if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + # Note: these only get called with left.dtype == right.dtype + if isinstance( + left, (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin) + ): left = left.view("i8") - if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + if isinstance( + right, + (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin), + ): right = right.view("i8") results = joinf(left, right) if with_indexers: # dtype should be timedelta64[ns] for TimedeltaIndex # and datetime64[ns] for DatetimeIndex - dtype = left.dtype.base + dtype = cast(np.dtype, left.dtype).base join_index, left_indexer, right_indexer = results join_index = join_index.view(dtype) @@ -88,6 +91,7 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): _can_hold_strings = False _data: Union[DatetimeArray, TimedeltaArray, PeriodArray] + _data_cls: Union[Type[DatetimeArray], Type[TimedeltaArray], Type[PeriodArray]] freq: Optional[BaseOffset] freqstr: Optional[str] _resolution_obj: Resolution @@ -100,6 +104,25 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): ) _hasnans = hasnans # for index / array -agnostic code + @classmethod + def _simple_new( + cls, + values: Union[DatetimeArray, TimedeltaArray, PeriodArray], + name: Label = None, + ): + assert isinstance(values, cls._data_cls), type(values) + + result = object.__new__(cls) + result._data = values + result._name = name + result._cache = {} + + # For groupby perf. 
See note in indexes/base about _index_data + result._index_data = values._data + + result._reset_identity() + return result + @property def _is_all_dates(self) -> bool: return True @@ -140,16 +163,8 @@ def equals(self, other: object) -> bool: elif other.dtype.kind in ["f", "i", "u", "c"]: return False elif not isinstance(other, type(self)): - inferrable = [ - "timedelta", - "timedelta64", - "datetime", - "datetime64", - "date", - "period", - ] - should_try = False + inferrable = self._data._infer_matches if other.dtype == object: should_try = other.inferred_type in inferrable elif is_categorical_dtype(other.dtype): @@ -198,10 +213,6 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): result._data._freq = freq return result - @doc(IndexOpsMixin.searchsorted, klass="Datetime-like Index") - def searchsorted(self, value, side="left", sorter=None): - return self._data.searchsorted(value, side=side, sorter=sorter) - _can_hold_na = True _na_value = NaT @@ -237,23 +248,23 @@ def min(self, axis=None, skipna=True, *args, **kwargs): return self._na_value i8 = self.asi8 - try: + + if len(i8) and self.is_monotonic_increasing: # quick check - if len(i8) and self.is_monotonic: - if i8[0] != iNaT: - return self._data._box_func(i8[0]) - - if self.hasnans: - if skipna: - min_stamp = self[~self._isnan].asi8.min() - else: - return self._na_value - else: - min_stamp = i8.min() - return self._data._box_func(min_stamp) - except ValueError: + if i8[0] != iNaT: + return self._data._box_func(i8[0]) + + if self.hasnans: + if not skipna: + return self._na_value + i8 = i8[~self._isnan] + + if not len(i8): return self._na_value + min_stamp = i8.min() + return self._data._box_func(min_stamp) + def argmin(self, axis=None, skipna=True, *args, **kwargs): """ Returns the indices of the minimum values along an axis. @@ -294,23 +305,23 @@ def max(self, axis=None, skipna=True, *args, **kwargs): return self._na_value i8 = self.asi8 - try: + + if len(i8) and self.is_monotonic: # quick check - if len(i8) and self.is_monotonic: - if i8[-1] != iNaT: - return self._data._box_func(i8[-1]) - - if self.hasnans: - if skipna: - max_stamp = self[~self._isnan].asi8.max() - else: - return self._na_value - else: - max_stamp = i8.max() - return self._data._box_func(max_stamp) - except ValueError: + if i8[-1] != iNaT: + return self._data._box_func(i8[-1]) + + if self.hasnans: + if not skipna: + return self._na_value + i8 = i8[~self._isnan] + + if not len(i8): return self._na_value + max_stamp = i8.max() + return self._data._box_func(max_stamp) + def argmax(self, axis=None, skipna=True, *args, **kwargs): """ Returns the indices of the maximum values along an axis. @@ -369,7 +380,7 @@ def _format_with_header( @property def _formatter_func(self): - raise AbstractMethodError(self) + return self._data._formatter() def _format_attrs(self): """ @@ -384,6 +395,36 @@ def _format_attrs(self): attrs.append(("freq", freq)) return attrs + def _summary(self, name=None) -> str: + """ + Return a summarized representation. + + Parameters + ---------- + name : str + Name to use in the summary representation. + + Returns + ------- + str + Summarized representation of the index. 
+ """ + formatter = self._formatter_func + if len(self) > 0: + index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" + else: + index_summary = "" + + if name is None: + name = type(self).__name__ + result = f"{name}: {len(self)} entries{index_summary}" + if self.freq: + result += f"\nFreq: {self.freqstr}" + + # display as values, not quoted + result = result.replace("'", "") + return result + # -------------------------------------------------------------------- # Indexing Methods @@ -414,7 +455,7 @@ def _partial_date_slice( vals = self._data._ndarray unbox = self._data._unbox - if self.is_monotonic: + if self.is_monotonic_increasing: if len(self) and ( (t1 < self[0] and t2 < self[0]) or (t1 > self[-1] and t2 > self[-1]) @@ -456,68 +497,6 @@ def _partial_date_slice( __truediv__ = make_wrapped_arith_op("__truediv__") __rtruediv__ = make_wrapped_arith_op("__rtruediv__") - def isin(self, values, level=None): - """ - Compute boolean array of whether each index value is found in the - passed set of values. - - Parameters - ---------- - values : set or sequence of values - - Returns - ------- - is_contained : ndarray (boolean dtype) - """ - if level is not None: - self._validate_index_level(level) - - if not isinstance(values, type(self)): - try: - values = type(self)(values) - except ValueError: - return self.astype(object).isin(values) - - return algorithms.isin(self.asi8, values.asi8) - - @Appender(Index.where.__doc__) - def where(self, cond, other=None): - other = self._data._validate_setitem_value(other) - - result = np.where(cond, self._data._ndarray, other) - arr = self._data._from_backing_data(result) - return type(self)._simple_new(arr, name=self.name) - - def _summary(self, name=None) -> str: - """ - Return a summarized representation. - - Parameters - ---------- - name : str - Name to use in the summary representation. - - Returns - ------- - str - Summarized representation of the index. - """ - formatter = self._formatter_func - if len(self) > 0: - index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" - else: - index_summary = "" - - if name is None: - name = type(self).__name__ - result = f"{name}: {len(self)} entries{index_summary}" - if self.freq: - result += f"\nFreq: {self.freqstr}" - - # display as values, not quoted - result = result.replace("'", "") - return result - def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. 
@@ -662,15 +641,13 @@ def _with_freq(self, freq): arr = self._data._with_freq(freq) return type(self)._simple_new(arr, name=self.name) - def _shallow_copy(self, values=None, name: Label = lib.no_default): - name = self.name if name is lib.no_default else name - - if values is not None: - return self._simple_new(values, name=name) + @property + def _has_complex_internals(self) -> bool: + # used to avoid libreduction code paths, which raise or require conversion + return False - result = self._simple_new(self._data, name=name) - result._cache = self._cache - return result + def is_type_compatible(self, kind: str) -> bool: + return kind in self._data._infer_matches # -------------------------------------------------------------------- # Set Operation Methods @@ -745,15 +722,14 @@ def intersection(self, other, sort=False): start = right[0] if end < start: - # pandas\core\indexes\datetimelike.py:758: error: Unexpected - # keyword argument "freq" for "DatetimeTimedeltaMixin" [call-arg] - result = type(self)( - data=[], dtype=self.dtype, freq=self.freq # type: ignore[call-arg] - ) + result = self[:0] else: lslice = slice(*left.slice_locs(start, end)) left_chunk = left._values[lslice] - result = type(self)._simple_new(left_chunk) + # error: Argument 1 to "_simple_new" of "DatetimeIndexOpsMixin" has + # incompatible type "Union[ExtensionArray, Any]"; expected + # "Union[DatetimeArray, TimedeltaArray, PeriodArray]" [arg-type] + result = type(self)._simple_new(left_chunk) # type: ignore[arg-type] return self._wrap_setop_result(other, result) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9744eb0ecbb88..f6eeb121b1ac0 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -14,17 +14,14 @@ to_offset, ) from pandas._libs.tslibs.offsets import prefix_mapping -from pandas._typing import DtypeObj, Label +from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( DT64NS_DTYPE, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_float, - is_integer, is_scalar, ) from pandas.core.dtypes.missing import is_valid_nat_for_dtype @@ -220,6 +217,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _typ = "datetimeindex" + _data_cls = DatetimeArray _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True @@ -319,20 +317,6 @@ def __new__( subarr = cls._simple_new(dtarr, name=name) return subarr - @classmethod - def _simple_new(cls, values: DatetimeArray, name: Label = None): - assert isinstance(values, DatetimeArray), type(values) - - result = object.__new__(cls) - result._data = values - result.name = name - result._cache = {} - result._no_setting_name = False - # For groupby perf. See note in indexes/base about _index_data - result._index_data = values._data - result._reset_identity() - return result - # -------------------------------------------------------------------- @cache_readonly @@ -367,8 +351,6 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? 
""" - if not is_datetime64_any_dtype(dtype): - return False if self.tz is not None: # If we have tz, we can compare to tzaware return is_datetime64tz_dtype(dtype) @@ -387,7 +369,7 @@ def _formatter_func(self): from pandas.io.formats.format import get_format_datetime64 formatter = get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: f"'{formatter(x, tz=self.tz)}'" + return lambda x: f"'{formatter(x)}'" # -------------------------------------------------------------------- # Set Operation Methods @@ -733,12 +715,13 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): """ assert kind in ["loc", "getitem", None] - if is_float(label) or isinstance(label, time) or is_integer(label): - self._invalid_indexer("slice", label) - if isinstance(label, str): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) - parsed, reso = parsing.parse_time_string(label, freq) + try: + parsed, reso = parsing.parse_time_string(label, freq) + except parsing.DateParseError as err: + raise self._invalid_indexer("slice", label) from err + reso = Resolution.from_attrname(reso) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: @@ -752,6 +735,9 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): return lower if side == "left" else upper elif isinstance(label, (self._data._recognized_scalars, date)): self._deprecate_mismatched_indexing(label) + else: + raise self._invalid_indexer("slice", label) + return self._maybe_cast_for_get_loc(label) def _get_string_slice(self, key: str): @@ -803,14 +789,25 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): end is None or isinstance(end, str) ): mask = np.array(True) + deprecation_mask = np.array(True) if start is not None: start_casted = self._maybe_cast_slice_bound(start, "left", kind) mask = start_casted <= self + deprecation_mask = start_casted == self if end is not None: end_casted = self._maybe_cast_slice_bound(end, "right", kind) mask = (self <= end_casted) & mask - + deprecation_mask = (end_casted == self) | deprecation_mask + + if not deprecation_mask.any(): + warnings.warn( + "Value based partial slicing on non-monotonic DatetimeIndexes " + "with non-existing keys is deprecated and will raise a " + "KeyError in a future Version.", + FutureWarning, + stacklevel=5, + ) indexer = mask.nonzero()[0][::step] if len(indexer) == len(self): return slice(None) @@ -821,9 +818,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- - def is_type_compatible(self, typ) -> bool: - return typ == self.inferred_type or typ == "datetime" - @property def inferred_type(self) -> str: # b/c datetime is represented as microseconds since the epoch, make diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 4d09a97b18eed..3f146e273326c 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -1,10 +1,12 @@ """ Shared methods for Index subclasses backed by ExtensionArray. 
""" -from typing import List, TypeVar +from typing import List, Optional, TypeVar import numpy as np +from pandas._libs import lib +from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc @@ -211,6 +213,24 @@ class ExtensionIndex(Index): __le__ = _make_wrapped_comparison_op("__le__") __ge__ = _make_wrapped_comparison_op("__ge__") + @doc(Index._shallow_copy) + def _shallow_copy( + self, values: Optional[ExtensionArray] = None, name: Label = lib.no_default + ): + name = self.name if name is lib.no_default else name + + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache + return result + + @property + def _has_complex_internals(self) -> bool: + # used to avoid libreduction code paths, which raise or require conversion + return True + # --------------------------------------------------------------------- # NDarray-Like Methods @@ -228,15 +248,34 @@ def __getitem__(self, key): deprecate_ndim_indexing(result) return result + def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: + # overriding IndexOpsMixin improves performance GH#38083 + return self._data.searchsorted(value, side=side, sorter=sorter) + # --------------------------------------------------------------------- + def _check_indexing_method(self, method): + """ + Raise if we have a get_indexer `method` that is not supported or valid. + """ + # GH#37871 for now this is only for IntervalIndex and CategoricalIndex + if method is None: + return + + if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: + raise NotImplementedError( + f"method {method} not yet implemented for {type(self).__name__}" + ) + + raise ValueError("Invalid fill method") + def _get_engine_target(self) -> np.ndarray: return np.asarray(self._data) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) result = self._data.repeat(repeats, axis=axis) - return self._shallow_copy(result) + return type(self)._simple_new(result, name=self.name) def insert(self, loc: int, item): # ExtensionIndex subclasses must override Index.insert @@ -343,16 +382,19 @@ def insert(self, loc: int, item): new_arr = arr._from_backing_data(new_vals) return type(self)._simple_new(new_arr, name=self.name) + @doc(Index.where) + def where(self, cond, other=None): + res_values = self._data.where(cond, other) + return type(self)._simple_new(res_values, name=self.name) + def putmask(self, mask, value): + res_values = self._data.copy() try: - value = self._data._validate_setitem_value(value) + res_values.putmask(mask, value) except (TypeError, ValueError): return self.astype(object).putmask(mask, value) - new_values = self._data._ndarray.copy() - np.putmask(new_values, mask, value) - new_arr = self._data._from_backing_data(new_values) - return type(self)._simple_new(new_arr, name=self.name) + return type(self)._simple_new(res_values, name=self.name) def _wrap_joined_index(self: _T, joined: np.ndarray, other: _T) -> _T: name = get_op_result_name(self, other) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 2aec86c9cdfae..ed92b3dade6a0 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -130,19 +130,13 @@ def wrapped(self, other, sort=False): if op_name in ("difference",): result = result.astype(self.dtype) return result - elif self.closed != other.closed: - 
raise ValueError( - "can only do set operations between two IntervalIndex " - "objects that are closed on the same side" - ) - # GH 19016: ensure set op will not return a prohibited dtype - subtypes = [self.dtype.subtype, other.dtype.subtype] - common_subtype = find_common_type(subtypes) - if is_object_dtype(common_subtype): + if self._is_non_comparable_own_type(other): + # GH#19016: ensure set op will not return a prohibited dtype raise TypeError( - f"can only do {op_name} between two IntervalIndex " - "objects that have compatible dtypes" + "can only do set operations between two IntervalIndex " + "objects that are closed on the same side " + "and have compatible dtypes" ) return method(self, other, sort) @@ -239,7 +233,6 @@ def _simple_new(cls, array: IntervalArray, name: Label = None): result._data = array result.name = name result._cache = {} - result._no_setting_name = False result._reset_identity() return result @@ -327,19 +320,6 @@ def from_tuples( # -------------------------------------------------------------------- - @Appender(Index._shallow_copy.__doc__) - def _shallow_copy( - self, values: Optional[IntervalArray] = None, name: Label = lib.no_default - ): - name = self.name if name is lib.no_default else name - - if values is not None: - return self._simple_new(values, name=name) - - result = self._simple_new(self._data, name=name) - result._cache = self._cache - return result - @cache_readonly def _engine(self): left = self._maybe_convert_i8(self.left) @@ -380,11 +360,6 @@ def values(self) -> IntervalArray: """ return self._data - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result @@ -398,9 +373,7 @@ def __reduce__(self): def astype(self, dtype, copy: bool = True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self._values.astype(dtype, copy=copy) - if is_interval_dtype(new_values.dtype): - return self._shallow_copy(new_values) - return Index.astype(self, dtype, copy=copy) + return Index(new_values, dtype=new_values.dtype, name=self.name) @property def inferred_type(self) -> str: @@ -506,7 +479,7 @@ def _needs_i8_conversion(self, key) -> bool: """ Check if a given key needs i8 conversion. Conversion is necessary for Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An - Interval-like requires conversion if it's endpoints are one of the + Interval-like requires conversion if its endpoints are one of the aforementioned types. Assumes that any list-like data has already been cast to an Index. @@ -528,7 +501,7 @@ def _needs_i8_conversion(self, key) -> bool: def _maybe_convert_i8(self, key): """ - Maybe convert a given key to it's equivalent i8 value(s). Used as a + Maybe convert a given key to its equivalent i8 value(s). Used as a preprocessing step prior to IntervalTree queries (self._engine), which expects numeric data. @@ -567,7 +540,7 @@ def _maybe_convert_i8(self, key): # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) if key.hasnans: - # convert NaT from it's i8 value to np.nan so it's not viewed + # convert NaT from its i8 value to np.nan so it's not viewed # as a valid value, maybe causing errors (e.g. 
is_overlapping) key_i8 = key_i8.where(~key._isnan) @@ -582,17 +555,6 @@ def _maybe_convert_i8(self, key): return key_i8 - def _check_method(self, method): - if method is None: - return - - if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: - raise NotImplementedError( - f"method {method} not yet implemented for IntervalIndex" - ) - - raise ValueError("Invalid fill method") - def _searchsorted_monotonic(self, label, side, exclude_label=False): if not self.is_non_overlapping_monotonic: raise KeyError( @@ -663,7 +625,7 @@ def get_loc( >>> index.get_loc(pd.Interval(0, 1)) 0 """ - self._check_method(method) + self._check_indexing_method(method) if not is_scalar(key): raise InvalidIndexError(key) @@ -714,7 +676,7 @@ def get_indexer( tolerance: Optional[Any] = None, ) -> np.ndarray: - self._check_method(method) + self._check_indexing_method(method) if self.is_overlapping: raise InvalidIndexError( @@ -729,11 +691,8 @@ def get_indexer( if self.equals(target_as_index): return np.arange(len(self), dtype="intp") - # different closed or incompatible subtype -> no matches - common_subtype = find_common_type( - [self.dtype.subtype, target_as_index.dtype.subtype] - ) - if self.closed != target_as_index.closed or is_object_dtype(common_subtype): + if self._is_non_comparable_own_type(target_as_index): + # different closed or incompatible subtype -> no matches return np.repeat(np.intp(-1), len(target_as_index)) # non-overlapping -> at most one match per interval in target_as_index @@ -753,17 +712,7 @@ def get_indexer( indexer = self._engine.get_indexer(target_as_index.values) else: # heterogeneous scalar index: defer elementwise to get_loc - # (non-overlapping so get_loc guarantees scalar of KeyError) - indexer = [] - for key in target_as_index: - try: - loc = self.get_loc(key) - except KeyError: - loc = -1 - except InvalidIndexError as err: - # i.e. non-scalar key - raise TypeError(key) from err - indexer.append(loc) + return self._get_indexer_pointwise(target_as_index)[0] return ensure_platform_int(indexer) @@ -775,10 +724,8 @@ def get_indexer_non_unique( # check that target_as_index IntervalIndex is compatible if isinstance(target_as_index, IntervalIndex): - common_subtype = find_common_type( - [self.dtype.subtype, target_as_index.dtype.subtype] - ) - if self.closed != target_as_index.closed or is_object_dtype(common_subtype): + + if self._is_non_comparable_own_type(target_as_index): # different closed or incompatible subtype -> no matches return ( np.repeat(-1, len(target_as_index)), @@ -789,18 +736,8 @@ def get_indexer_non_unique( target_as_index, IntervalIndex ): # target_as_index might contain intervals: defer elementwise to get_loc - indexer, missing = [], [] - for i, key in enumerate(target_as_index): - try: - locs = self.get_loc(key) - if isinstance(locs, slice): - locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") - locs = np.array(locs, ndmin=1) - except KeyError: - missing.append(i) - locs = np.array([-1]) - indexer.append(locs) - indexer = np.concatenate(indexer) + return self._get_indexer_pointwise(target_as_index) + else: target_as_index = self._maybe_convert_i8(target_as_index) indexer, missing = self._engine.get_indexer_non_unique( @@ -809,6 +746,30 @@ def get_indexer_non_unique( return ensure_platform_int(indexer), ensure_platform_int(missing) + def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]: + """ + pointwise implementation for get_indexer and get_indexer_non_unique. 
+ """ + indexer, missing = [], [] + for i, key in enumerate(target): + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + # Only needed for get_indexer_non_unique + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + except InvalidIndexError as err: + # i.e. non-scalar key + raise TypeError(key) from err + + indexer.append(locs) + + indexer = np.concatenate(indexer) + return ensure_platform_int(indexer), ensure_platform_int(missing) + @property def _index_as_unique(self): return not self.is_overlapping @@ -845,10 +806,20 @@ def _convert_list_indexer(self, keyarr): # we have missing values if (locs == -1).any(): - raise KeyError + raise KeyError(keyarr[locs == -1].tolist()) return locs + def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool: + # different closed or incompatible subtype -> no matches + + # TODO: once closed is part of IntervalDtype, we can just define + # is_comparable_dtype GH#19371 + if self.closed != other.closed: + return True + common_subtype = find_common_type([self.dtype.subtype, other.dtype.subtype]) + return is_object_dtype(common_subtype) + # -------------------------------------------------------------------- @cache_readonly @@ -867,6 +838,22 @@ def mid(self): def length(self): return Index(self._data.length, copy=False) + def putmask(self, mask, value): + arr = self._data.copy() + try: + value_left, value_right = arr._validate_setitem_value(value) + except (ValueError, TypeError): + return self.astype(object).putmask(mask, value) + + if isinstance(self._data._left, np.ndarray): + np.putmask(arr._left, mask, value_left) + np.putmask(arr._right, mask, value_right) + else: + # TODO: special case not needed with __array_function__ + arr._left.putmask(mask, value_left) + arr._right.putmask(mask, value_right) + return type(self)._simple_new(arr, name=self.name) + @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: @@ -885,8 +872,8 @@ def delete(self, loc): """ new_left = self.left.delete(loc) new_right = self.right.delete(loc) - result = IntervalArray.from_arrays(new_left, new_right, closed=self.closed) - return self._shallow_copy(result) + result = self._data._shallow_copy(new_left, new_right) + return type(self)._simple_new(result, name=self.name) def insert(self, loc, item): """ @@ -907,8 +894,8 @@ def insert(self, loc, item): new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) - result = IntervalArray.from_arrays(new_left, new_right, closed=self.closed) - return self._shallow_copy(result) + result = self._data._shallow_copy(new_left, new_right) + return type(self)._simple_new(result, name=self.name) # -------------------------------------------------------------------- # Rendering Methods @@ -966,11 +953,6 @@ def _format_space(self) -> str: space = " " * (len(type(self).__name__) + 1) return f"\n{space}" - # -------------------------------------------------------------------- - - def argsort(self, *args, **kwargs) -> np.ndarray: - return np.lexsort((self.right, self.left)) - # -------------------------------------------------------------------- # Set Operations diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5790c6db6405f..9b4b459d9a122 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -893,6 +893,15 @@ def set_levels(self, levels, level=None, inplace=None, verify_integrity=True): def 
nlevels(self) -> int: """ Integer number of levels in this MultiIndex. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.nlevels + 3 """ return len(self._levels) @@ -900,6 +909,15 @@ def nlevels(self) -> int: def levshape(self): """ A tuple with the length of each level. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.levshape + (1, 1, 1) """ return tuple(len(x) for x in self.levels) @@ -1045,7 +1063,7 @@ def set_codes(self, codes, level=None, inplace=None, verify_integrity=True): def _engine(self): # Calculate the number of bits needed to represent labels in each # level, as log2 of their sizes (including -1 for NaN): - sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) + sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels])) # Sum bit counts, starting from the _right_.... lev_bits = np.cumsum(sizes[::-1])[::-1] @@ -1065,34 +1083,19 @@ def _engine(self): @property def _constructor(self): - return MultiIndex.from_tuples + return type(self).from_tuples @doc(Index._shallow_copy) - def _shallow_copy( - self, - values=None, - name=lib.no_default, - levels=None, - codes=None, - sortorder=None, - names=lib.no_default, - ): - if names is not lib.no_default and name is not lib.no_default: - raise TypeError("Can only provide one of `names` and `name`") - elif names is lib.no_default: - names = name if name is not lib.no_default else self.names + def _shallow_copy(self, values=None, name=lib.no_default): + names = name if name is not lib.no_default else self.names if values is not None: - assert levels is None and codes is None - return MultiIndex.from_tuples(values, sortorder=sortorder, names=names) - - levels = levels if levels is not None else self.levels - codes = codes if codes is not None else self.codes + return type(self).from_tuples(values, sortorder=None, names=names) - result = MultiIndex( - levels=levels, - codes=codes, - sortorder=sortorder, + result = type(self)( + levels=self.levels, + codes=self.codes, + sortorder=None, names=names, verify_integrity=False, ) @@ -1100,18 +1103,6 @@ def _shallow_copy( result._cache.pop("levels", None) # GH32669 return result - def symmetric_difference(self, other, result_name=None, sort=None): - # On equal symmetric_difference MultiIndexes the difference is empty. 
- # Therefore, an empty MultiIndex is returned GH13490 - tups = Index.symmetric_difference(self, other, result_name, sort) - if len(tups) == 0: - return MultiIndex( - levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - names=tups.name, - ) - return type(self).from_tuples(tups, names=tups.name) - # -------------------------------------------------------------------- def copy( @@ -1177,12 +1168,18 @@ def copy( if codes is None: codes = deepcopy(self.codes) - new_index = self._shallow_copy( + levels = levels if levels is not None else self.levels + codes = codes if codes is not None else self.codes + + new_index = type(self)( levels=levels, codes=codes, - names=names, sortorder=self.sortorder, + names=names, + verify_integrity=False, ) + new_index._cache = self._cache.copy() + new_index._cache.pop("levels", None) # GH32669 if dtype: warnings.warn( @@ -1220,10 +1217,10 @@ def dtype(self) -> np.dtype: def _is_memory_usage_qualified(self) -> bool: """ return a boolean if we need a qualified .info display """ - def f(l): - return "mixed" in l or "string" in l or "unicode" in l + def f(level): + return "mixed" in level or "string" in level or "unicode" in level - return any(f(l) for l in self._inferred_type_levels) + return any(f(level) for level in self._inferred_type_levels) @doc(Index.memory_usage) def memory_usage(self, deep: bool = False) -> int: @@ -1457,7 +1454,22 @@ def _set_names(self, names, level=None, validate=True): self._reset_cache() names = property( - fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" + fset=_set_names, + fget=_get_names, + doc=""" + Names of levels in MultiIndex. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays( + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + >>> mi + MultiIndex([(1, 3, 5), + (2, 4, 6)], + names=['x', 'y', 'z']) + >>> mi.names + FrozenList(['x', 'y', 'z']) + """, ) # -------------------------------------------------------------------- @@ -1701,6 +1713,32 @@ def to_frame(self, index=True, name=None): -------- DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) + >>> mi + MultiIndex([('a', 'c'), + ('b', 'd')], + ) + + >>> df = mi.to_frame() + >>> df + 0 1 + a c a c + b d b d + + >>> df = mi.to_frame(index=False) + >>> df + 0 1 + 0 a c + 1 b d + + >>> df = mi.to_frame(name=['x', 'y']) + >>> df + x y + a c a c + b d b d """ from pandas import DataFrame @@ -2105,7 +2143,7 @@ def drop(self, codes, level=None, errors="raise"): Parameters ---------- codes : array-like - Must be a list of tuples + Must be a list of tuples when level is not specified level : int or level name, default None errors : str, default 'raise' @@ -2156,10 +2194,17 @@ def _drop_from_level(self, codes, level, errors="raise"): i = self._get_level_number(level) index = self.levels[i] values = index.get_indexer(codes) - + # If nan should be dropped it will equal -1 here. 
We have to check which values + # are not nan and equal -1, this means they are missing in the index + nan_codes = isna(codes) + values[(np.equal(nan_codes, False)) & (values == -1)] = -2 + if index.shape[0] == self.shape[0]: + values[np.equal(nan_codes, True)] = -2 + + not_found = codes[values == -2] + if len(not_found) != 0 and errors != "ignore": + raise KeyError(f"labels {not_found} not found in level") mask = ~algos.isin(self.codes[i], values) - if mask.all() and errors != "ignore": - raise KeyError(f"labels {codes} not found in level") return self[mask] @@ -2234,6 +2279,24 @@ def reorder_levels(self, order): Returns ------- MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y']) + >>> mi + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.reorder_levels(order=[1, 0]) + MultiIndex([(3, 1), + (4, 2)], + names=['y', 'x']) + + >>> mi.reorder_levels(order=['y', 'x']) + MultiIndex([(3, 1), + (4, 2)], + names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: @@ -2251,7 +2314,7 @@ def reorder_levels(self, order): def _get_codes_for_sorting(self): """ - we categorizing our codes by using the + we are categorizing our codes by using the available categories (all, not just observed) excluding any missing ones (-1); this is in preparation for sorting, where we need to disambiguate that -1 is not @@ -2292,6 +2355,34 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Resulting index. indexer : np.ndarray Indices of output values in original index. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) + >>> mi + MultiIndex([(0, 2), + (0, 1)], + ) + + >>> mi.sortlevel() + (MultiIndex([(0, 1), + (0, 2)], + ), array([1, 0])) + + >>> mi.sortlevel(sort_remaining=False) + (MultiIndex([(0, 2), + (0, 1)], + ), array([0, 1])) + + >>> mi.sortlevel(1) + (MultiIndex([(0, 1), + (0, 2)], + ), array([1, 0])) + + >>> mi.sortlevel(1, ascending=False) + (MultiIndex([(0, 2), + (0, 1)], + ), array([0, 1])) """ if isinstance(level, (str, int)): level = [level] @@ -2676,9 +2767,17 @@ def _partial_tup_index(self, tup, side="left"): return start + section.searchsorted(loc, side=side) idx = self._get_loc_single_level_index(lev, lab) - if k < n - 1: + if isinstance(idx, slice) and k < n - 1: + # Get start and end value from slice, necessary when a non-integer + # interval is given as input GH#37707 + start = idx.start + end = idx.stop + elif k < n - 1: end = start + section.searchsorted(idx, side="right") start = start + section.searchsorted(idx, side="left") + elif isinstance(idx, slice): + idx = idx.start + return start + section.searchsorted(idx, side=side) else: return start + section.searchsorted(idx, side=side) @@ -3014,6 +3113,8 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): start = 0 if key.stop is not None: stop = level_index.get_loc(key.stop) + elif isinstance(start, slice): + stop = len(level_index) else: stop = len(level_index) - 1 step = key.step @@ -3048,22 +3149,27 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): else: - code = self._get_loc_single_level_index(level_index, key) + idx = self._get_loc_single_level_index(level_index, key) if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted - locs = np.array(level_codes == code, dtype=bool, copy=False) + locs = np.array(level_codes == idx, dtype=bool, copy=False) if not locs.any(): # The label is present in self.levels[level] but 
unused: raise KeyError(key) return locs - i = level_codes.searchsorted(code, side="left") - j = level_codes.searchsorted(code, side="right") - if i == j: + if isinstance(idx, slice): + start = idx.start + end = idx.stop + else: + start = level_codes.searchsorted(idx, side="left") + end = level_codes.searchsorted(idx, side="right") + + if start == end: # The label is present in self.levels[level] but unused: raise KeyError(key) - return slice(i, j) + return slice(start, end) def get_locs(self, seq): """ @@ -3128,19 +3234,26 @@ def _convert_to_indexer(r) -> Int64Index: r = r.nonzero()[0] return Int64Index(r) - def _update_indexer(idxr: Optional[Index], indexer: Optional[Index]) -> Index: + def _update_indexer( + idxr: Optional[Index], indexer: Optional[Index], key + ) -> Index: if indexer is None: indexer = Index(np.arange(n)) if idxr is None: return indexer - return indexer.intersection(idxr) + indexer_intersection = indexer.intersection(idxr) + if indexer_intersection.empty and not idxr.empty and not indexer.empty: + raise KeyError(key) + return indexer_intersection for i, k in enumerate(seq): if com.is_bool_indexer(k): # a boolean indexer, must be the same length! k = np.asarray(k) - indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer) + indexer = _update_indexer( + _convert_to_indexer(k), indexer=indexer, key=seq + ) elif is_list_like(k): # a collection of labels to include from this level (these @@ -3152,7 +3265,7 @@ def _update_indexer(idxr: Optional[Index], indexer: Optional[Index]) -> Index: self._get_level_indexer(x, level=i, indexer=indexer) ) indexers = (idxrs if indexers is None else indexers).union( - idxrs + idxrs, sort=False ) except KeyError: @@ -3160,14 +3273,14 @@ def _update_indexer(idxr: Optional[Index], indexer: Optional[Index]) -> Index: continue if indexers is not None: - indexer = _update_indexer(indexers, indexer=indexer) + indexer = _update_indexer(indexers, indexer=indexer, key=seq) else: # no matches we are done return np.array([], dtype=np.int64) elif com.is_null_slice(k): # empty slice - indexer = _update_indexer(None, indexer=indexer) + indexer = _update_indexer(None, indexer=indexer, key=seq) elif isinstance(k, slice): @@ -3177,6 +3290,7 @@ def _update_indexer(idxr: Optional[Index], indexer: Optional[Index]) -> Index: self._get_level_indexer(k, level=i, indexer=indexer) ), indexer=indexer, + key=seq, ) else: # a single label @@ -3185,6 +3299,7 @@ def _update_indexer(idxr: Optional[Index], indexer: Optional[Index]) -> Index: self.get_loc_level(k, level=i, drop_level=False)[0] ), indexer=indexer, + key=seq, ) # empty indexer @@ -3237,6 +3352,9 @@ def _reorder_indexer( # order they appears in a list-like sequence # This mapping is then use to reorder the indexer for i, k in enumerate(seq): + if is_scalar(k): + # GH#34603 we want to treat a scalar the same as an all equal list + k = [k] if com.is_bool_indexer(k): new_order = np.arange(n)[indexer] elif is_list_like(k): @@ -3250,6 +3368,9 @@ def _reorder_indexer( key_order_map[level_indexer] = np.arange(len(level_indexer)) new_order = key_order_map[self.codes[i][indexer]] + elif isinstance(k, slice) and k.start is None and k.stop is None: + # slice(None) should not determine order GH#31330 + new_order = np.ones((n,))[indexer] else: # For all other case, use the same order as the level new_order = np.arange(n)[indexer] @@ -3308,21 +3429,19 @@ def equals(self, other: object) -> bool: if not isinstance(other, Index): return False + if len(self) != len(other): + return False + if not isinstance(other, 
MultiIndex): # d-level MultiIndex can equal d-tuple Index if not is_object_dtype(other.dtype): # other cannot contain tuples, so cannot match self return False - elif len(self) != len(other): - return False return array_equivalent(self._values, other._values) if self.nlevels != other.nlevels: return False - if len(self) != len(other): - return False - for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] @@ -3610,6 +3729,18 @@ def _convert_can_do_setop(self, other): return other, result_names + def symmetric_difference(self, other, result_name=None, sort=None): + # On equal symmetric_difference MultiIndexes the difference is empty. + # Therefore, an empty MultiIndex is returned GH13490 + tups = Index.symmetric_difference(self, other, result_name, sort) + if len(tups) == 0: + return type(self)( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + names=tups.name, + ) + return type(self).from_tuples(tups, names=tups.name) + # -------------------------------------------------------------------- @doc(Index.astype) @@ -3627,7 +3758,7 @@ def astype(self, dtype, copy=True): return self._shallow_copy() return self - def _validate_insert_value(self, item): + def _validate_fill_value(self, item): if not isinstance(item, tuple): # Pad the key with empty strings if lower levels of the key # aren't specified: @@ -3650,7 +3781,7 @@ def insert(self, loc: int, item): ------- new_index : Index """ - item = self._validate_insert_value(item) + item = self._validate_fill_value(item) new_levels = [] new_codes = [] diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 9eb8a8b719d41..12f61fc44582d 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,11 +1,11 @@ -import operator from typing import Any +import warnings import numpy as np from pandas._libs import index as libindex, lib from pandas._typing import Dtype, Label -from pandas.util._decorators import cache_readonly, doc +from pandas.util._decorators import doc from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -26,7 +26,6 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas.core import algorithms import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name @@ -121,8 +120,14 @@ def _validate_fill_value(self, value): # force conversion to object # so we don't lose the bools raise TypeError - if isinstance(value, str): + elif isinstance(value, str) or lib.is_complex(value): raise TypeError + elif is_scalar(value) and isna(value): + if is_valid_nat_for_dtype(value, self.dtype): + value = self._na_value + else: + # NaT, np.datetime64("NaT"), np.timedelta64("NaT") + raise TypeError return value @@ -161,13 +166,10 @@ def _is_all_dates(self) -> bool: @doc(Index.insert) def insert(self, loc: int, item): - # treat NA values as nans: - if is_scalar(item) and isna(item): - if is_valid_nat_for_dtype(item, self.dtype): - item = self._na_value - else: - # NaT, np.datetime64("NaT"), np.timedelta64("NaT") - return self.astype(object).insert(loc, item) + try: + item = self._validate_fill_value(item) + except TypeError: + return self.astype(object).insert(loc, item) return super().insert(loc, item) @@ -188,18 +190,6 @@ def _union(self, other, sort): else: return super()._union(other, sort) - def _cmp_method(self, other, op): - if self.is_(other): # fastpath - if op in 
{operator.eq, operator.le, operator.ge}: - arr = np.ones(len(self), dtype=bool) - if self._can_hold_na: - arr[self.isna()] = False - return arr - elif op in {operator.ne, operator.lt, operator.gt}: - return np.zeros(len(self), dtype=bool) - - return super()._cmp_method(other, op) - _num_index_shared_docs[ "class_descr" @@ -243,6 +233,20 @@ class IntegerIndex(NumericIndex): """ _default_dtype: np.dtype + _can_hold_na = False + + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented with matching signed-ness. + """ + if data.dtype.kind != cls._default_dtype.kind: + if not np.array_equal(data, subarr): + raise TypeError("Unsafe NumPy casting, you must explicitly cast") + + def _can_union_without_object_cast(self, other) -> bool: + # See GH#26778, further casting may occur in NumericIndex._union + return other.dtype == "f8" or other.dtype == self.dtype def __contains__(self, key) -> bool: """ @@ -266,6 +270,11 @@ def inferred_type(self) -> str: @property def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak + warnings.warn( + "Index.asi8 is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=2, + ) return self._values.view(self._default_dtype) @@ -273,23 +282,9 @@ class Int64Index(IntegerIndex): __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args _typ = "int64index" - _can_hold_na = False _engine_type = libindex.Int64Engine _default_dtype = np.dtype(np.int64) - @classmethod - def _assert_safe_casting(cls, data, subarr): - """ - Ensure incoming data can be represented as ints. - """ - if not issubclass(data.dtype.type, np.signedinteger): - if not np.array_equal(data, subarr): - raise TypeError("Unsafe NumPy casting, you must explicitly cast") - - def _can_union_without_object_cast(self, other) -> bool: - # See GH#26778, further casting may occur in NumericIndex._union - return other.dtype == "f8" or other.dtype == self.dtype - _uint64_descr_args = dict( klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra="" @@ -300,7 +295,6 @@ class UInt64Index(IntegerIndex): __doc__ = _num_index_shared_docs["class_descr"] % _uint64_descr_args _typ = "uint64index" - _can_hold_na = False _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) @@ -319,21 +313,6 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=dtype) - # ---------------------------------------------------------------- - - @classmethod - def _assert_safe_casting(cls, data, subarr): - """ - Ensure incoming data can be represented as uints. 
- """ - if not issubclass(data.dtype.type, np.unsignedinteger): - if not np.array_equal(data, subarr): - raise TypeError("Unsafe NumPy casting, you must explicitly cast") - - def _can_union_without_object_cast(self, other) -> bool: - # See GH#26778, further casting may occur in NumericIndex._union - return other.dtype == "f8" or other.dtype == self.dtype - _float64_descr_args = dict( klass="Float64Index", dtype="float64", ltype="float", extra="" @@ -345,7 +324,7 @@ class Float64Index(NumericIndex): _typ = "float64index" _engine_type = libindex.Float64Engine - _default_dtype = np.float64 + _default_dtype = np.dtype(np.float64) @property def inferred_type(self) -> str: @@ -424,16 +403,6 @@ def __contains__(self, other: Any) -> bool: return is_float(other) and np.isnan(other) and self.hasnans - @cache_readonly - def is_unique(self) -> bool: - return super().is_unique and self._nan_idxs.size < 2 - - @doc(Index.isin) - def isin(self, values, level=None): - if level is not None: - self._validate_index_level(level) - return algorithms.isin(np.array(self), values) - def _can_union_without_object_cast(self, other) -> bool: # See GH#26778, further casting may occur in NumericIndex._union return is_numeric_dtype(other.dtype) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 44c20ad0de848..5dff07ee4c6dd 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,13 +1,13 @@ from datetime import datetime, timedelta -from typing import Any +from typing import Any, cast +import warnings import numpy as np -from pandas._libs import index as libindex -from pandas._libs.lib import no_default +from pandas._libs import index as libindex, lib from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick from pandas._libs.tslibs.parsing import DateParseError, parse_time_string -from pandas._typing import DtypeObj, Label +from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -65,7 +65,7 @@ def _new_PeriodIndex(cls, **d): wrap=True, ) @inherit_names(["is_leap_year", "_format_native_types"], PeriodArray) -class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): +class PeriodIndex(DatetimeIndexOpsMixin): """ Immutable ndarray holding ordinal values indicating regular periods in time. @@ -146,6 +146,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): _data: PeriodArray freq: BaseOffset + _data_cls = PeriodArray _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True @@ -244,49 +245,12 @@ def __new__( return cls._simple_new(data, name=name) - @classmethod - def _simple_new(cls, values: PeriodArray, name: Label = None): - """ - Create a new PeriodIndex. - - Parameters - ---------- - values : PeriodArray - Values that can be converted to a PeriodArray without inference - or coercion. - """ - assert isinstance(values, PeriodArray), type(values) - - result = object.__new__(cls) - result._data = values - # For groupby perf. 
See note in indexes/base about _index_data - result._index_data = values._data - result.name = name - result._cache = {} - result._reset_identity() - return result - # ------------------------------------------------------------------------ # Data @property def values(self) -> np.ndarray: - return np.asarray(self) - - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - - def _shallow_copy(self, values=None, name: Label = no_default): - name = name if name is not no_default else self.name - - if values is not None: - return self._simple_new(values, name=name) - - result = self._simple_new(self._data, name=name) - result._cache = self._cache - return result + return np.asarray(self, dtype=object) def _maybe_convert_timedelta(self, other): """ @@ -339,10 +303,6 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return self.astype(object)._values - @property - def _formatter_func(self): - return self.array._formatter(boxed=False) - # ------------------------------------------------------------------------ # Indexing @@ -417,15 +377,26 @@ def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray: return super().asof_locs(where, mask) @doc(Index.astype) - def astype(self, dtype, copy: bool = True, how="start"): + def astype(self, dtype, copy: bool = True, how=lib.no_default): dtype = pandas_dtype(dtype) + if how is not lib.no_default: + # GH#37982 + warnings.warn( + "The 'how' keyword in PeriodIndex.astype is deprecated and " + "will be removed in a future version. " + "Use index.to_timestamp(how=how) instead", + FutureWarning, + stacklevel=2, + ) + else: + how = "start" + if is_datetime64_any_dtype(dtype): # 'how' is index-specific, isn't part of the EA interface. tz = getattr(dtype, "tz", None) return self.to_timestamp(how=how).tz_localize(tz) - # TODO: should probably raise on `how` here, so we don't ignore it. 
return super().astype(dtype, copy=copy) @property @@ -465,8 +436,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) ) # _assert_can_do_setop ensures we have matching dtype - result = Int64Index.join( - self, + result = super().join( other, how=how, level=level, @@ -608,10 +578,9 @@ def _maybe_cast_slice_bound(self, label, side: str, kind: str): return bounds[0 if side == "left" else 1] except ValueError as err: # string cannot be parsed as datetime-like - # TODO: we need tests for this case - raise KeyError(label) from err + raise self._invalid_indexer("slice", label) from err elif is_integer(label) or is_float(label): - self._invalid_indexer("slice", label) + raise self._invalid_indexer("slice", label) return label @@ -694,7 +663,10 @@ def difference(self, other, sort=None): if self.equals(other): # pass an empty PeriodArray with the appropriate dtype - return type(self)._simple_new(self._data[:0], name=self.name) + + # TODO: overload DatetimeLikeArrayMixin.__getitem__ + values = cast(PeriodArray, self._data[:0]) + return type(self)._simple_new(values, name=self.name) if is_object_dtype(other): return self.astype(object).difference(other).astype(self.dtype) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 4b8207331838e..669bf115df104 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, List +from typing import Any, List, Optional, Tuple import warnings import numpy as np @@ -29,7 +29,7 @@ from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name -from pandas.core.indexes.numeric import Int64Index +from pandas.core.indexes.numeric import Float64Index, Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer _empty_range = range(0) @@ -397,6 +397,8 @@ def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name if values is not None: + if values.dtype.kind == "f": + return Float64Index(values, name=name) return Int64Index._simple_new(values, name=name) result = self._simple_new(self._range, name=name) @@ -459,6 +461,16 @@ def argsort(self, *args, **kwargs) -> np.ndarray: else: return np.arange(len(self) - 1, -1, -1) + def factorize( + self, sort: bool = False, na_sentinel: Optional[int] = -1 + ) -> Tuple[np.ndarray, "RangeIndex"]: + codes = np.arange(len(self), dtype=np.intp) + uniques = self + if sort and self.step < 0: + codes = codes[::-1] + uniques = uniques[::-1] + return codes, uniques + def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. @@ -658,13 +670,17 @@ def difference(self, other, sort=None): if not isinstance(overlap, RangeIndex): # We wont end up with RangeIndex, so fall back return super().difference(other, sort=sort) + if overlap.step != first.step: + # In some cases we might be able to get a RangeIndex back, + # but not worth the effort. 
+ return super().difference(other, sort=sort) if overlap[0] == first.start: # The difference is everything after the intersection new_rng = range(overlap[-1] + first.step, first.stop, first.step) - elif overlap[-1] == first.stop: + elif overlap[-1] == first[-1]: # The difference is everything before the intersection - new_rng = range(first.start, overlap[0] - first.step, first.step) + new_rng = range(first.start, overlap[0], first.step) else: # The difference is not range-like return super().difference(other, sort=sort) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index cf5fa4bbb3d75..fcab3e1f6a0a4 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -2,7 +2,7 @@ from pandas._libs import index as libindex, lib from pandas._libs.tslibs import Timedelta, to_offset -from pandas._typing import DtypeObj, Label +from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError from pandas.util._decorators import doc @@ -103,6 +103,7 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): _typ = "timedeltaindex" + _data_cls = TimedeltaArray _engine_type = libindex.TimedeltaEngine _comparables = ["name", "freq"] @@ -156,29 +157,6 @@ def __new__( ) return cls._simple_new(tdarr, name=name) - @classmethod - def _simple_new(cls, values: TimedeltaArray, name: Label = None): - assert isinstance(values, TimedeltaArray) - - result = object.__new__(cls) - result._data = values - result._name = name - result._cache = {} - # For groupby perf. See note in indexes/base about _index_data - result._index_data = values._data - - result._reset_identity() - return result - - # ------------------------------------------------------------------- - # Rendering Methods - - @property - def _formatter_func(self): - from pandas.io.formats.format import get_format_timedelta64 - - return get_format_timedelta64(self, box=True) - # ------------------------------------------------------------------- @doc(Index.astype) @@ -245,15 +223,12 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): else: return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") elif not isinstance(label, self._data._recognized_scalars): - self._invalid_indexer("slice", label) + raise self._invalid_indexer("slice", label) return label # ------------------------------------------------------------------- - def is_type_compatible(self, typ) -> bool: - return typ == self.inferred_type or typ == "timedelta" - @property def inferred_type(self) -> str: return "timedelta64" diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e0bf43d3a0140..6aa031af64833 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -667,6 +667,9 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): if k not in self.obj: if value is None: self.obj[k] = np.nan + elif is_array_like(value) and value.ndim == 2: + # GH#37964 have to select columnwise in case of array + self.obj[k] = value[:, i] elif is_list_like(value): self.obj[k] = value[i] else: @@ -681,7 +684,7 @@ def __setitem__(self, key, value): self._has_valid_setitem_indexer(key) iloc = self if self.name == "iloc" else self.obj.iloc - iloc._setitem_with_indexer(indexer, value) + iloc._setitem_with_indexer(indexer, value, self.name) def _validate_key(self, key, axis: int): """ @@ -1018,7 +1021,7 @@ def _multi_take(self, tup: Tuple): def _getitem_iterable(self, key, axis: int): """ - Index current object with an an iterable collection of keys. 
+ Index current object with an iterable collection of keys. Parameters ---------- @@ -1246,9 +1249,7 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): indexer, keyarr = ax._convert_listlike_indexer(key) # We only act on all found values: if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer( - keyarr, indexer, axis, raise_missing=raise_missing - ) + # _validate_read_indexer is a no-op if no -1s, so skip return ax[indexer], indexer if ax._index_as_unique: @@ -1309,21 +1310,15 @@ def _validate_read_indexer( not_found = list(set(key) - set(ax)) raise KeyError(f"{not_found} not in index") - # we skip the warning on Categorical - # as this check is actually done (check for - # non-missing values), but a bit later in the - # code, so we want to avoid warning & then - # just raising - if not ax.is_categorical(): - not_found = key[missing_mask] - - with option_context("display.max_seq_items", 10, "display.width", 80): - raise KeyError( - "Passing list-likes to .loc or [] with any missing labels " - "is no longer supported. " - f"The following labels were missing: {not_found}. " - "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 - ) + not_found = key[missing_mask] + + with option_context("display.max_seq_items", 10, "display.width", 80): + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported. " + f"The following labels were missing: {not_found}. " + "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) @doc(IndexingMixin.iloc) @@ -1525,7 +1520,7 @@ def _get_setitem_indexer(self, key): # ------------------------------------------------------------------- - def _setitem_with_indexer(self, indexer, value): + def _setitem_with_indexer(self, indexer, value, name="iloc"): """ _setitem_with_indexer is for setting values on a Series/DataFrame using positional indexers. @@ -1601,7 +1596,7 @@ def _setitem_with_indexer(self, indexer, value): new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes ) - self._setitem_with_indexer(new_indexer, value) + self._setitem_with_indexer(new_indexer, value, name) return @@ -1632,11 +1627,11 @@ def _setitem_with_indexer(self, indexer, value): # align and set the values if take_split_path: # We have to operate column-wise - self._setitem_with_indexer_split_path(indexer, value) + self._setitem_with_indexer_split_path(indexer, value, name) else: - self._setitem_single_block(indexer, value) + self._setitem_single_block(indexer, value, name) - def _setitem_with_indexer_split_path(self, indexer, value): + def _setitem_with_indexer_split_path(self, indexer, value, name: str): """ Setitem column-wise. 
""" @@ -1647,81 +1642,82 @@ def _setitem_with_indexer_split_path(self, indexer, value): indexer = _tuplify(self.ndim, indexer) if len(indexer) > self.ndim: raise IndexError("too many indices for array") + if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: + raise ValueError(r"Cannot set values with ndim > 2") - if isinstance(value, ABCSeries): + if isinstance(value, ABCSeries) and name != "iloc": value = self._align_series(indexer, value) # Ensure we have something we can iterate over - ilocs = self._ensure_iterable_column_indexer(indexer[1]) + info_axis = indexer[1] + ilocs = self._ensure_iterable_column_indexer(info_axis) - plane_indexer = indexer[:1] - lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) + pi = indexer[0] + lplane_indexer = length_of_indexer(pi, self.obj.index) # lplane_indexer gives the expected length of obj[indexer[0]] - if len(ilocs) == 1: - # We can operate on a single column - - # require that we are setting the right number of values that - # we are indexing - if is_list_like_indexer(value) and 0 != lplane_indexer != len(value): - # Exclude zero-len for e.g. boolean masking that is all-false - raise ValueError( - "cannot set using a multi-index " - "selection indexer with a different " - "length than the value" - ) - # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: - # we have an equal len Frame if isinstance(value, ABCDataFrame): - self._setitem_with_indexer_frame_value(indexer, value) + self._setitem_with_indexer_frame_value(indexer, value, name) - # we have an equal len ndarray/convertible to our ilocs - # hasattr first, to avoid coercing to ndarray without reason. - # But we may be relying on the ndarray coercion to check ndim. - # Why not just convert to an ndarray earlier on if needed? elif np.ndim(value) == 2: self._setitem_with_indexer_2d_value(indexer, value) - elif ( - len(ilocs) == 1 - and lplane_indexer == len(value) - and not is_scalar(plane_indexer[0]) - ): - # we have an equal len list/ndarray - # We only get here with len(ilocs) == 1 - self._setitem_single_column(ilocs[0], value, plane_indexer) + elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): + # We are setting multiple rows in a single column. + self._setitem_single_column(ilocs[0], value, pi) + + elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): + # We are trying to set N values into M entries of a single + # column, which is invalid for N != M + # Exclude zero-len for e.g. boolean masking that is all-false + + if len(value) == 1 and not is_integer(info_axis): + # This is a case like df.iloc[:3, [1]] = [0] + # where we treat as df.iloc[:3, 1] = 0 + return self._setitem_with_indexer((pi, info_axis[0]), value[0]) + + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) elif lplane_indexer == 0 and len(value) == len(self.obj.index): # We get here in one case via .loc with a all-False mask pass + elif len(ilocs) == len(value): + # We are setting multiple columns in a single row. + for loc, v in zip(ilocs, value): + self._setitem_single_column(loc, v, pi) + + elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: + # This is a setitem-with-expansion, see + # test_loc_setitem_empty_append_expands_rows_mixed_dtype + # e.g. 
df = DataFrame(columns=["x", "y"]) + # df["x"] = df["x"].astype(np.int64) + # df.loc[:, "x"] = [1, 2, 3] + self._setitem_single_column(ilocs[0], value, pi) + else: - # per-label values - if len(ilocs) != len(value): - raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" - ) + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) - for loc, v in zip(ilocs, value): - self._setitem_single_column(loc, v, plane_indexer) else: - if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: - raise ValueError(r"Cannot set values with ndim > 2") - # scalar value for loc in ilocs: - self._setitem_single_column(loc, value, plane_indexer) + self._setitem_single_column(loc, value, pi) def _setitem_with_indexer_2d_value(self, indexer, value): # We get here with np.ndim(value) == 2, excluding DataFrame, # which goes through _setitem_with_indexer_frame_value - plane_indexer = indexer[:1] + pi = indexer[0] ilocs = self._ensure_iterable_column_indexer(indexer[1]) @@ -1734,19 +1730,25 @@ def _setitem_with_indexer_2d_value(self, indexer, value): for i, loc in enumerate(ilocs): # setting with a list, re-coerces - self._setitem_single_column(loc, value[:, i].tolist(), plane_indexer) + self._setitem_single_column(loc, value[:, i].tolist(), pi) - def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame"): + def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame", name: str): ilocs = self._ensure_iterable_column_indexer(indexer[1]) sub_indexer = list(indexer) - plane_indexer = indexer[:1] + pi = indexer[0] multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex) unique_cols = value.columns.is_unique - if not unique_cols and value.columns.equals(self.obj.columns): + # We do not want to align the value in case of iloc GH#37728 + if name == "iloc": + for i, loc in enumerate(ilocs): + val = value.iloc[:, i] + self._setitem_single_column(loc, val, pi) + + elif not unique_cols and value.columns.equals(self.obj.columns): # We assume we are already aligned, see # test_iloc_setitem_frame_duplicate_columns_multiple_blocks for loc in ilocs: @@ -1761,7 +1763,7 @@ def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame"): else: val = np.nan - self._setitem_single_column(loc, val, plane_indexer) + self._setitem_single_column(loc, val, pi) elif not unique_cols: raise ValueError("Setting with non-unique columns is not allowed.") @@ -1777,10 +1779,18 @@ def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame"): else: val = np.nan - self._setitem_single_column(loc, val, plane_indexer) + self._setitem_single_column(loc, val, pi) def _setitem_single_column(self, loc: int, value, plane_indexer): - # positional setting on column loc + """ + + Parameters + ---------- + loc : int + Indexer for column position + plane_indexer : int, slice, listlike[int] + The indexer we use for setitem along axis=0. 
+ """ pi = plane_indexer ser = self.obj._ixs(loc, axis=1) @@ -1790,21 +1800,18 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # which means essentially reassign to the columns of a # multi-dim object # GH#6149 (null slice), GH#10408 (full bounds) - if isinstance(pi, tuple) and all( - com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) - for idx in pi - ): + if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): ser = value else: # set the item, possibly having a dtype change ser = ser.copy() - ser._mgr = ser._mgr.setitem(indexer=pi, value=value) + ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value) ser._maybe_update_cacher(clear=True) # reset the sliced object if unique self.obj._iset_item(loc, ser) - def _setitem_single_block(self, indexer, value): + def _setitem_single_block(self, indexer, value, name: str): """ _setitem_with_indexer for the case when we have a single Block. """ @@ -1832,14 +1839,13 @@ def _setitem_single_block(self, indexer, value): return indexer = maybe_convert_ix(*indexer) - - if isinstance(value, (ABCSeries, dict)): + if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): # TODO(EA): ExtensionBlock.setitem this causes issues with # setting for extensionarrays that store dicts. Need to decide # if it's worth supporting that. value = self._align_series(indexer, Series(value)) - elif isinstance(value, ABCDataFrame): + elif isinstance(value, ABCDataFrame) and name != "iloc": value = self._align_frame(indexer, value) # check for chained assignment @@ -1871,7 +1877,8 @@ def _setitem_with_indexer_missing(self, indexer, value): if index.is_unique: new_indexer = index.get_indexer([new_index[-1]]) if (new_indexer != -1).any(): - return self._setitem_with_indexer(new_indexer, value) + # We get only here with loc, so can hard code + return self._setitem_with_indexer(new_indexer, value, "loc") # this preserves dtype of the value new_values = Series([value])._values @@ -1942,7 +1949,7 @@ def _align_series(self, indexer, ser: "Series", multiindex_indexer: bool = False to the locations selected by `indexer` """ if isinstance(indexer, (slice, np.ndarray, list, Index)): - indexer = tuple([indexer]) + indexer = (indexer,) if isinstance(indexer, tuple): @@ -2015,7 +2022,7 @@ def ravel(i): raise ValueError("Incompatible indexer with Series") - def _align_frame(self, indexer, df: ABCDataFrame): + def _align_frame(self, indexer, df: "DataFrame"): is_frame = self.ndim == 2 if isinstance(indexer, tuple): @@ -2081,7 +2088,7 @@ def __getitem__(self, key): # we could have a convertible item here (e.g. Timestamp) if not is_list_like_indexer(key): - key = tuple([key]) + key = (key,) else: raise ValueError("Invalid call for scalar access (getting)!") @@ -2207,9 +2214,10 @@ def convert_to_index_sliceable(obj: "DataFrame", key): try: res = idx._get_string_slice(key) warnings.warn( - "Indexing on datetimelike rows with `frame[string]` is " - "deprecated and will be removed in a future version. " - "Use `frame.loc[string]` instead.", + "Indexing a DataFrame with a datetimelike index using a single " + "string to slice the rows, like `frame[string]`, is deprecated " + "and will be removed in a future version. 
Use `frame.loc[string]` " + "instead.", FutureWarning, stacklevel=3, ) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 92f6bb6f1cbdd..74b5a184df95d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -32,6 +32,7 @@ TD64NS_DTYPE, is_bool_dtype, is_categorical_dtype, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, @@ -123,7 +124,16 @@ def _simple_new( obj._mgr_locs = placement return obj - def __init__(self, values, placement, ndim=None): + def __init__(self, values, placement, ndim: int): + """ + Parameters + ---------- + values : np.ndarray or ExtensionArray + placement : BlockPlacement (or castable) + ndim : int + 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame + """ + # TODO(EA2D): ndim will be unnecessary with 2D EAs self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement self.values = self._maybe_coerce_values(values) @@ -464,7 +474,9 @@ def _split(self) -> List["Block"]: new_blocks.append(nb) return new_blocks - def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: + def split_and_operate( + self, mask, f, inplace: bool, ignore_failures: bool = False + ) -> List["Block"]: """ split the block per-column, and apply the callable f per-column, return a new block for each. Handle @@ -474,7 +486,8 @@ def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: ---------- mask : 2-d boolean mask f : callable accepting (1d-mask, 1d values, indexer) - inplace : boolean + inplace : bool + ignore_failures : bool, default False Returns ------- @@ -513,8 +526,16 @@ def make_a_block(nv, ref_loc): v = new_values[i] # need a new block - if m.any(): - nv = f(m, v, i) + if m.any() or m.size == 0: + # Apply our function; we may ignore_failures if this is a + # reduction that is dropping nuisance columns GH#37827 + try: + nv = f(m, v, i) + except TypeError: + if ignore_failures: + continue + else: + raise else: nv = v if inplace else v.copy() @@ -780,10 +801,9 @@ def replace( regex=regex, ) - blocks = self.putmask(mask, value, inplace=inplace) - blocks = extend_blocks( - [b.convert(numeric=False, copy=not inplace) for b in blocks] - ) + blk = self if inplace else self.copy() + blk._putmask_simple(mask, value) + blocks = blk.convert(numeric=False, copy=not inplace) return blocks def _replace_regex( @@ -841,7 +861,15 @@ def _replace_list( """ See BlockManager._replace_list docstring. 
""" - src_len = len(src_list) - 1 + # Exclude anything that we know we won't contain + pairs = [ + (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) + ] + if not len(pairs): + # shortcut, nothing to replace + return [self] if inplace else [self.copy()] + + src_len = len(pairs) - 1 def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: """ @@ -854,15 +882,19 @@ def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: s = maybe_box_datetimelike(s) return compare_or_regex_search(self.values, s, regex, mask) - # Calculate the mask once, prior to the call of comp - # in order to avoid repeating the same computations - mask = ~isna(self.values) + if self.is_object: + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + mask = ~isna(self.values) + masks = [comp(s[0], mask, regex) for s in pairs] + else: + # GH#38086 faster if we know we dont need to check for regex + masks = [missing.mask_missing(self.values, s[0]) for s in pairs] - masks = [comp(s, mask, regex) for s in src_list] masks = [_extract_bool_array(x) for x in masks] rb = [self if inplace else self.copy()] - for i, (src, dest) in enumerate(zip(src_list, dest_list)): + for i, (src, dest) in enumerate(pairs): new_rb: List["Block"] = [] for blk in rb: m = masks[i] @@ -1017,10 +1049,15 @@ def _putmask_simple(self, mask: np.ndarray, value: Any): if lib.is_scalar(value) and isinstance(values, np.ndarray): value = convert_scalar_for_putitemlike(value, values.dtype) - if is_list_like(value) and len(value) == len(values): - values[mask] = value[mask] + if self.is_extension or (self.is_object and not lib.is_scalar(value)): + # GH#19266 using np.putmask gives unexpected results with listlike value + if is_list_like(value) and len(value) == len(values): + values[mask] = value[mask] + else: + values[mask] = value else: - values[mask] = value + # GH#37833 np.putmask is more performant than __setitem__ + np.putmask(values, mask, value) def putmask( self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False @@ -1173,39 +1210,15 @@ def coerce_to_target_dtype(self, other): # don't coerce float/complex to int return self - elif ( - self.is_datetime - or is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - ): - - # not a datetime - if not ( - (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)) - and self.is_datetime - ): - return self.astype(object) - - # don't upcast timezone with different timezone or no timezone - mytz = getattr(self.dtype, "tz", None) - othertz = getattr(dtype, "tz", None) - - if not tz_compare(mytz, othertz): - return self.astype(object) - - raise AssertionError( - f"possible recursion in coerce_to_target_dtype: {self} {other}" - ) + elif self.is_datetime or is_datetime64_any_dtype(dtype): + # The is_dtype_equal check above ensures that at most one of + # these two conditions hold, so we must cast to object. + return self.astype(object) elif self.is_timedelta or is_timedelta64_dtype(dtype): - - # not a timedelta - if not (is_timedelta64_dtype(dtype) and self.is_timedelta): - return self.astype(object) - - raise AssertionError( - f"possible recursion in coerce_to_target_dtype: {self} {other}" - ) + # The is_dtype_equal check above ensures that at most one of + # these two conditions hold, so we must cast to object. 
+            return self.astype(object)
 
         try:
             return self.astype(dtype)
@@ -1436,6 +1449,7 @@ def where(
             if values.ndim - 1 == other.ndim and axis == 1:
                 other = other.reshape(tuple(other.shape + (1,)))
             elif transpose and values.ndim == self.ndim - 1:
+                # TODO(EA2D): not necessary with 2D EAs
                 cond = cond.T
 
         if not hasattr(cond, "shape"):
@@ -1653,7 +1667,7 @@ class ExtensionBlock(Block):
 
     values: ExtensionArray
 
-    def __init__(self, values, placement, ndim=None):
+    def __init__(self, values, placement, ndim: int):
         """
         Initialize a non-consolidatable block.
 
@@ -2030,6 +2044,16 @@ class ObjectValuesExtensionBlock(ExtensionBlock):
     def external_values(self):
         return self.values.astype(object)
 
+    def _can_hold_element(self, element: Any) -> bool:
+        if is_valid_nat_for_dtype(element, self.dtype):
+            return True
+        if isinstance(element, list) and len(element) == 0:
+            return True
+        tipo = maybe_infer_dtype_type(element)
+        if tipo is not None:
+            return issubclass(tipo.type, self.dtype.type)
+        return isinstance(element, self.dtype.type)
+
 
 class NumericBlock(Block):
     __slots__ = ()
@@ -2169,7 +2193,9 @@ def diff(self, n: int, axis: int = 0) -> List["Block"]:
         values = self.array_values().reshape(self.shape)
 
         new_values = values - values.shift(n, axis=axis)
-        return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]
+        return [
+            TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer, ndim=self.ndim)
+        ]
 
     def shift(self, periods, axis=0, fill_value=None):
         # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs
@@ -2402,9 +2428,8 @@ def _can_hold_element(self, element: Any) -> bool:
         return is_valid_nat_for_dtype(element, self.dtype)
 
     def fillna(self, value, **kwargs):
-
-        # allow filling with integers to be
-        # interpreted as nanoseconds
+        # TODO(EA2D): if we operated on array_values, TDA.fillna would handle
+        # raising here.
if is_integer(value): # Deprecation GH#24694, GH#19233 raise TypeError( @@ -2459,7 +2484,9 @@ def mask_func(mask, values, inplace): values = values.reshape(1, -1) return func(values) - return self.split_and_operate(None, mask_func, False) + return self.split_and_operate( + None, mask_func, False, ignore_failures=ignore_failures + ) try: res = func(values) @@ -2565,7 +2592,7 @@ def _replace_list( regex: bool = False, ) -> List["Block"]: if len(algos.unique(dest_list)) == 1: - # We got likely here by tiling value inside NDFrame.replace, + # We likely got here by tiling value inside NDFrame.replace, # so un-tile here return self.replace(src_list, dest_list[0], inplace, regex) return super()._replace_list(src_list, dest_list, inplace, regex) @@ -2619,6 +2646,7 @@ def get_block_type(values, dtype=None): elif is_interval_dtype(dtype) or is_period_dtype(dtype): cls = ObjectValuesExtensionBlock elif is_extension_array_dtype(values.dtype): + # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock elif issubclass(vtype, np.floating): cls = FloatBlock diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 205af5354d333..06de1972b4c9a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -82,6 +82,7 @@ def concatenate_block_managers( b = make_block( _concatenate_join_units(join_units, concat_axis, copy=copy), placement=placement, + ndim=len(axes), ) blocks.append(b) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index bcafa2c2fdca7..eefd1a604f894 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -225,7 +225,8 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # TODO: What about re-joining object columns? block_values = [ - make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) + make_block(dvals_list[n], placement=[n], ndim=2) + for n in range(len(dvals_list)) ] else: @@ -369,7 +370,7 @@ def extract_index(data) -> Index: index = Index([]) elif len(data) > 0: raw_lengths = [] - indexes = [] + indexes: List[Union[List[Label], Index]] = [] have_raw_arrays = False have_series = False diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 155d88d6ec2d9..4cd7cc56144d9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -33,7 +33,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import array_equals, isna import pandas.core.algorithms as algos @@ -442,6 +442,7 @@ def apply( def quantile( self, axis: int = 0, + consolidate: bool = True, transposed: bool = False, interpolation="linear", qs=None, @@ -455,6 +456,8 @@ def quantile( Parameters ---------- axis: reduction axis, default 0 + consolidate: bool, default True. 
Join together blocks having same + dtype transposed: bool, default False we are holding transposed data interpolation : type of interpolation, default 'linear' @@ -469,6 +472,9 @@ def quantile( # simplify some of the code here and in the blocks assert self.ndim >= 2 + if consolidate: + self._consolidate_inplace() + def get_axe(block, qs, axes): # Because Series dispatches to DataFrame, we will always have # block.ndim == 2 @@ -1432,7 +1438,7 @@ def _make_na_block(self, placement, fill_value=None): dtype, fill_value = infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) - return make_block(block_values, placement=placement) + return make_block(block_values, placement=placement, ndim=block_values.ndim) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ @@ -1544,7 +1550,7 @@ def __init__( ) self.axes = [axis] - self.blocks = tuple([block]) + self.blocks = (block,) @classmethod def from_blocks( @@ -1655,7 +1661,9 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. blocks = [ - make_block(values=blocks[0], placement=slice(0, len(axes[0]))) + make_block( + values=blocks[0], placement=slice(0, len(axes[0])), ndim=2 + ) ] mgr = BlockManager(blocks, axes) @@ -1675,8 +1683,11 @@ def create_block_manager_from_arrays( assert isinstance(axes, list) assert all(isinstance(x, Index) for x in axes) + # ensure we dont have any PandasArrays when we call get_block_type + # Note: just calling extract_array breaks tests that patch PandasArray._typ. + arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays] try: - blocks = form_blocks(arrays, names, axes) + blocks = _form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() return mgr @@ -1708,7 +1719,7 @@ def construction_error(tot_items, block_shape, axes, e=None): # ----------------------------------------------------------------------- -def form_blocks(arrays, names: Index, axes) -> List[Block]: +def _form_blocks(arrays, names: Index, axes) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? 
items_dict: DefaultDict[str, List] = defaultdict(list) @@ -1755,7 +1766,7 @@ def form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=i) + make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) for i, _, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1770,15 +1781,14 @@ def form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ - make_block(array, klass=CategoricalBlock, placement=i) + make_block(array, klass=CategoricalBlock, placement=i, ndim=2) for i, _, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) if len(items_dict["ExtensionBlock"]): - external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=i) + make_block(array, klass=ExtensionBlock, placement=i, ndim=2) for i, _, array in items_dict["ExtensionBlock"] ] @@ -1786,7 +1796,7 @@ def form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ - make_block(array, klass=ObjectValuesExtensionBlock, placement=i) + make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) for i, _, array in items_dict["ObjectValuesExtensionBlock"] ] @@ -1799,7 +1809,7 @@ def form_blocks(arrays, names: Index, axes) -> List[Block]: block_values = np.empty(shape, dtype=object) block_values.fill(np.nan) - na_block = make_block(block_values, placement=extra_locs) + na_block = make_block(block_values, placement=extra_locs, ndim=2) blocks.append(na_block) return blocks @@ -1816,7 +1826,7 @@ def _simple_blockify(tuples, dtype) -> List[Block]: if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) - block = make_block(values, placement=placement) + block = make_block(values, placement=placement, ndim=2) return [block] @@ -1830,7 +1840,7 @@ def _multi_blockify(tuples, dtype=None): values, placement = _stack_arrays(list(tup_block), dtype) - block = make_block(values, placement=placement) + block = make_block(values, placement=placement, ndim=2) new_blocks.append(block) return new_blocks @@ -1921,7 +1931,7 @@ def _merge_blocks( new_values = new_values[argsort] new_mgr_locs = new_mgr_locs[argsort] - return [make_block(new_values, placement=new_mgr_locs)] + return [make_block(new_values, placement=new_mgr_locs, ndim=2)] # can't consolidate --> no merge return blocks diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d38974839394d..80c4cd5b44a92 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1646,7 +1646,7 @@ def nanpercentile( interpolation=interpolation, ) - # Note: we have to do do `astype` and not view because in general we + # Note: we have to do `astype` and not view because in general we # have float result at this point, not i8 return result.astype(values.dtype) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 8142fc3e695a3..c855687552e82 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -27,7 +27,7 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndex, ABCSeries +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core.ops import missing @@ -40,13 +40,11 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) - if 
isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): - # Note: these checks can be for ABCIndex and not ABCIndexClass - # because that is the only object-dtype class. + if isinstance(y, (np.ndarray, ABCSeries, ABCIndexClass)): if not is_object_dtype(y.dtype): y = y.astype(np.object_) - if isinstance(y, (ABCSeries, ABCIndex)): + if isinstance(y, (ABCSeries, ABCIndexClass)): y = y._values if x.shape != y.shape: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fccedd75c4531..e5589b0dae837 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -950,7 +950,7 @@ def quantile(self, q=0.5, **kwargs): # downsample methods -for method in ["sum", "prod"]: +for method in ["sum", "prod", "min", "max", "first", "last"]: def f(self, _method=method, min_count=0, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) @@ -961,7 +961,7 @@ def f(self, _method=method, min_count=0, *args, **kwargs): # downsample methods -for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]: +for method in ["mean", "sem", "median", "ohlc"]: def g(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 77b1076920f20..4a2629daf63d7 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -3,16 +3,27 @@ """ from collections import abc -from typing import TYPE_CHECKING, Iterable, List, Mapping, Type, Union, cast, overload +from typing import ( + TYPE_CHECKING, + Iterable, + List, + Mapping, + Optional, + Type, + Union, + cast, + overload, +) import numpy as np -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Label +from pandas._typing import FrameOrSeriesUnion, Label from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna +import pandas.core.algorithms as algos from pandas.core.arrays.categorical import ( factorize_from_iterable, factorize_from_iterables, @@ -295,7 +306,7 @@ class _Concatenator: def __init__( self, - objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], + objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], axis=0, join: str = "outer", keys=None, @@ -366,7 +377,7 @@ def __init__( # get the sample # want the highest ndim that we have, and must be non-empty # unless all objs are empty - sample = None + sample: Optional["NDFrame"] = None if len(ndims) > 1: max_ndim = max(ndims) for obj in objs: @@ -436,6 +447,8 @@ def __init__( # to line up if self._is_frame and axis == 1: name = 0 + # mypy needs to know sample is not an NDFrame + sample = cast("FrameOrSeriesUnion", sample) obj = sample._constructor({name: obj}) self.objs.append(obj) @@ -501,6 +514,13 @@ def get_result(self): # 1-ax to convert BlockManager axis to DataFrame axis obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): + # We have to remove the duplicates from obj_labels + # in new labels to make them unique, otherwise we would + # duplicate or duplicates again + if not obj_labels.is_unique: + new_labels = algos.make_duplicates_of_left_unique_in_right( + np.asarray(obj_labels), np.asarray(new_labels) + ) indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._mgr, indexers)) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 918a894a27916..3b755c40721fb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -5,6 +5,7 @@ import 
copy import datetime from functools import partial +import hashlib import string from typing import TYPE_CHECKING, Optional, Tuple, cast import warnings @@ -643,6 +644,17 @@ def __init__( self._validate_specification() + cross_col = None + if self.how == "cross": + ( + self.left, + self.right, + self.how, + cross_col, + ) = self._create_cross_configuration(self.left, self.right) + self.left_on = self.right_on = [cross_col] + self._cross = cross_col + # note this function has side effects ( self.left_join_keys, @@ -690,8 +702,14 @@ def get_result(self): self._maybe_restore_index_levels(result) + self._maybe_drop_cross_column(result, self._cross) + return result.__finalize__(self, method="merge") + def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]): + if cross_col is not None: + result.drop(columns=cross_col, inplace=True) + def _indicator_pre_merge( self, left: "DataFrame", right: "DataFrame" ) -> Tuple["DataFrame", "DataFrame"]: @@ -1200,9 +1218,50 @@ def _maybe_coerce_merge_keys(self): typ = rk.categories.dtype if rk_is_cat else object self.right = self.right.assign(**{name: self.right[name].astype(typ)}) + def _create_cross_configuration( + self, left, right + ) -> Tuple["DataFrame", "DataFrame", str, str]: + """ + Creates the configuration to dispatch the cross operation to inner join, + e.g. adding a join column and resetting parameters. Join column is added + to a new object, no inplace modification + + Parameters + ---------- + left: DataFrame + right DataFrame + + Returns + ------- + a tuple (left, right, how, cross_col) representing the adjusted + DataFrames with cross_col, the merge operation set to inner and the column + to join over. + """ + cross_col = f"_cross_{hashlib.md5().hexdigest()}" + how = "inner" + return ( + left.assign(**{cross_col: 1}), + right.assign(**{cross_col: 1}), + how, + cross_col, + ) + def _validate_specification(self): + if self.how == "cross": + if ( + self.left_index + or self.right_index + or self.right_on is not None + or self.left_on is not None + or self.on is not None + ): + raise MergeError( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + return # Hm, any way to make this logic less complicated?? 
- if self.on is None and self.left_on is None and self.right_on is None: + elif self.on is None and self.left_on is None and self.right_on is None: if self.left_index and self.right_index: self.left_on, self.right_on = (), () @@ -1266,7 +1325,7 @@ def _validate_specification(self): 'of levels in the index of "left"' ) self.left_on = [None] * n - if len(self.right_on) != len(self.left_on): + if self.how != "cross" and len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") def _validate(self, validate: str): @@ -1358,12 +1417,14 @@ def get_join_indexers( lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) - if how == "left": + if how in ("left", "right"): kwargs["sort"] = sort join_func = { "inner": libjoin.inner_join, "left": libjoin.left_outer_join, - "right": _right_outer_join, + "right": lambda x, y, count, **kwargs: libjoin.left_outer_join( + y, x, count, **kwargs + )[::-1], "outer": libjoin.full_outer_join, }[how] @@ -1883,11 +1944,6 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = return left_ax, None, right_indexer -def _right_outer_join(x, y, max_groups): - right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) - return left_indexer, right_indexer - - def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" ) -> Tuple[np.ndarray, np.ndarray, int]: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 18ebe14763797..c197e142fecbc 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -137,7 +137,7 @@ def _indexer_and_to_sort(self): @cache_readonly def sorted_labels(self): indexer, to_sort = self._indexer_and_to_sort - return [l.take(indexer) for l in to_sort] + return [line.take(indexer) for line in to_sort] def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: indexer, _ = self._indexer_and_to_sort @@ -399,6 +399,7 @@ def _unstack_multiple(data, clocs, fill_value=None): def unstack(obj, level, fill_value=None): + if isinstance(level, (tuple, list)): if len(level) != 1: # _unstack_multiple only handles MultiIndexes, @@ -416,6 +417,13 @@ def unstack(obj, level, fill_value=None): return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) + elif not isinstance(obj.index, MultiIndex): + # GH 36113 + # Give nicer error messages when unstack a Series whose + # Index is not a MultiIndex. + raise ValueError( + f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" + ) else: if is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) @@ -513,7 +521,7 @@ def factorize(index): verify_integrity=False, ) - if frame._is_homogeneous_type: + if not frame.empty and frame._is_homogeneous_type: # For homogeneous EAs, frame._values will coerce to object. So # we concatenate instead. 
dtypes = list(frame.dtypes._values) diff --git a/pandas/core/series.py b/pandas/core/series.py index 800da18142825..d493ac0a8c051 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -176,6 +176,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): """ _typ = "series" + _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) _name: Label _metadata: List[str] = ["name"] @@ -367,7 +368,7 @@ def _init_dict(self, data, index=None, dtype=None): values = na_value_for_dtype(dtype) keys = index else: - keys, values = tuple([]), [] + keys, values = tuple(), [] # Input is now list-like, so rely on "standard" construction: @@ -683,81 +684,6 @@ def view(self, dtype=None) -> "Series": # NDArray Compat _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) - def __array_ufunc__( - self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any - ): - # TODO: handle DataFrame - cls = type(self) - - # for binary ops, use our custom dunder methods - result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - - # Determine if we should defer. - no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) - - for item in inputs: - higher_priority = ( - hasattr(item, "__array_priority__") - and item.__array_priority__ > self.__array_priority__ - ) - has_array_ufunc = ( - hasattr(item, "__array_ufunc__") - and type(item).__array_ufunc__ not in no_defer - and not isinstance(item, self._HANDLED_TYPES) - ) - if higher_priority or has_array_ufunc: - return NotImplemented - - # align all the inputs. - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - types = tuple(type(x) for x in inputs) - # TODO: dataframe - alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] - - if len(alignable) > 1: - # This triggers alignment. - # At the moment, there aren't any ufuncs with more than two inputs - # so this ends up just being x1.index | x2.index, but we write - # it to handle *args. - index = alignable[0].index - for s in alignable[1:]: - index = index.union(s.index) - inputs = tuple( - x.reindex(index) if issubclass(t, Series) else x - for x, t in zip(inputs, types) - ) - else: - index = self.index - - inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) - result = getattr(ufunc, method)(*inputs, **kwargs) - - name = names[0] if len(set(names)) == 1 else None - - def construct_return(result): - if lib.is_scalar(result): - return result - elif result.ndim > 1: - # e.g. np.subtract.outer - if method == "outer": - # GH#27198 - raise NotImplementedError - return result - return self._constructor(result, index=index, name=name, copy=False) - - if type(result) is tuple: - # multiple return values - return tuple(construct_return(x) for x in result) - elif method == "at": - # no return value - return None - else: - return construct_return(result) - def __array__(self, dtype=None) -> np.ndarray: """ Return the values as a NumPy array. @@ -1015,7 +941,7 @@ def __setitem__(self, key, value): # positional setter values[key] = value else: - # GH#12862 adding an new key to the Series + # GH#12862 adding a new key to the Series self.loc[key] = value except TypeError as err: @@ -1428,6 +1354,7 @@ def to_string( @doc( klass=_shared_doc_kwargs["klass"], + storage_options=generic._shared_docs["storage_options"], examples=dedent( """ Examples @@ -1466,14 +1393,7 @@ def to_markdown( Add index (row) labels. .. 
versionadded:: 1.1.0 - - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. + {storage_options} .. versionadded:: 1.2.0 @@ -4697,7 +4617,7 @@ def isin(self, values) -> "Series": 5 False Name: animal, dtype: bool """ - result = algorithms.isin(self, values) + result = algorithms.isin(self._values, values) return self._constructor(result, index=self.index).__finalize__( self, method="isin" ) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index cc918c27b5c2e..9de9d1f434a12 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -324,4 +324,67 @@ 0 0.000000 1.000000 1 1.000000 2.718282 2 1.414214 7.389056 + +You can call transform on a GroupBy object: + +>>> df = pd.DataFrame({{ +... "Date": [ +... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05", +... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"], +... "Data": [5, 8, 6, 1, 50, 100, 60, 120], +... }}) +>>> df + Date Data +0 2015-05-08 5 +1 2015-05-07 8 +2 2015-05-06 6 +3 2015-05-05 1 +4 2015-05-08 50 +5 2015-05-07 100 +6 2015-05-06 60 +7 2015-05-05 120 +>>> df.groupby('Date')['Data'].transform('sum') +0 55 +1 108 +2 66 +3 121 +4 55 +5 108 +6 66 +7 121 +Name: Data, dtype: int64 + +>>> df = pd.DataFrame({{ +... "c": [1, 1, 1, 2, 2, 2, 2], +... "type": ["m", "n", "o", "m", "m", "n", "n"] +... }}) +>>> df + c type +0 1 m +1 1 n +2 1 o +3 2 m +4 2 m +5 2 n +6 2 n +>>> df['size'] = df.groupby('c')['type'].transform(len) +>>> df + c type size +0 1 m 3 +1 1 n 3 +2 1 o 3 +3 2 m 4 +4 2 m 4 +5 2 n 4 +6 2 n 4 """ + +_shared_docs[ + "storage_options" +] = """storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a non-fsspec URL. 
+ See the fsspec and backend storage implementation docs for the set of + allowed keys and values.""" diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2a0da8b0fb35c..729f517c789a7 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -610,7 +610,7 @@ def compress_group_index(group_index, sort: bool = True): if sort and len(obs_group_ids) > 0: obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) - return comp_ids, obs_group_ids + return ensure_int64(comp_ids), ensure_int64(obs_group_ids) def _reorder_by_uniques(uniques, labels): diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7d6a2bf1d776d..9d16beba669ca 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -157,11 +157,10 @@ def __init__(self, data): array = data.array self._array = array + self._index = self._name = None if isinstance(data, ABCSeries): self._index = data.index self._name = data.name - else: - self._index = self._name = None # ._values.categories works for both Series/Index self._parent = data._values.categories if self._is_categorical else data diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 32ca83787c4c1..4af32b219d380 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -10,6 +10,7 @@ is_number, is_numeric_dtype, is_scalar, + needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -123,8 +124,9 @@ def to_numeric(arg, errors="raise", downcast=None): values = arg.values elif isinstance(arg, ABCIndexClass): is_index = True - values = arg.asi8 - if values is None: + if needs_i8_conversion(arg.dtype): + values = arg.asi8 + else: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype="O") diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index e8faebd6b2542..6a9fd7a542a44 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -66,6 +66,11 @@ def to_timedelta(arg, unit=None, errors="raise"): to_datetime : Convert argument to datetime. convert_dtypes : Convert dtypes. + Notes + ----- + If the precision is higher than nanoseconds, the precision of the duration is + truncated to nanoseconds for string inputs. 
+ Examples -------- Parsing a single string to a Timedelta: diff --git a/pandas/core/window/__init__.py b/pandas/core/window/__init__.py index 304c61ac0e489..b3d0820fee4da 100644 --- a/pandas/core/window/__init__.py +++ b/pandas/core/window/__init__.py @@ -1,3 +1,6 @@ -from pandas.core.window.ewm import ExponentialMovingWindow # noqa:F401 +from pandas.core.window.ewm import ( # noqa:F401 + ExponentialMovingWindow, + ExponentialMovingWindowGroupby, +) from pandas.core.window.expanding import Expanding, ExpandingGroupby # noqa:F401 from pandas.core.window.rolling import Rolling, RollingGroupby, Window # noqa:F401 diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 938f1846230cb..6ebf610587d30 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -1,5 +1,6 @@ """Common utility functions for rolling operations""" from collections import defaultdict +from typing import cast import warnings import numpy as np @@ -109,6 +110,9 @@ def dataframe_from_int_dict(data, frame_template): # set the index and reorder if arg2.columns.nlevels > 1: + # mypy needs to know columns is a MultiIndex, Index doesn't + # have levels attribute + arg2.columns = cast(MultiIndex, arg2.columns) result.index = MultiIndex.from_product( arg2.columns.levels + [result_index] ) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index b601bacec35f1..f8237a436f436 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -14,8 +14,20 @@ from pandas.core.dtypes.common import is_datetime64_ns_dtype import pandas.core.common as common -from pandas.core.window.common import _doc_template, _shared_docs, zsqrt -from pandas.core.window.rolling import BaseWindow, flex_binary_moment +from pandas.core.util.numba_ import maybe_use_numba +from pandas.core.window.common import ( + _doc_template, + _shared_docs, + flex_binary_moment, + zsqrt, +) +from pandas.core.window.indexers import ( + BaseIndexer, + ExponentialMovingWindowIndexer, + GroupbyIndexer, +) +from pandas.core.window.numba_ import generate_numba_groupby_ewma_func +from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby, dispatch if TYPE_CHECKING: from pandas import Series @@ -219,14 +231,16 @@ def __init__( ignore_na: bool = False, axis: int = 0, times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, + **kwargs, ): - self.com: Optional[float] self.obj = obj self.min_periods = max(int(min_periods), 1) self.adjust = adjust self.ignore_na = ignore_na self.axis = axis self.on = None + self.center = False + self.closed = None if times is not None: if isinstance(times, str): times = self._selected_obj[times] @@ -245,7 +259,7 @@ def __init__( if common.count_not_none(com, span, alpha) > 0: self.com = get_center_of_mass(com, span, None, alpha) else: - self.com = None + self.com = 0.0 else: if halflife is not None and isinstance(halflife, (str, datetime.timedelta)): raise ValueError( @@ -260,6 +274,12 @@ def __init__( def _constructor(self): return ExponentialMovingWindow + def _get_window_indexer(self) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds + """ + return ExponentialMovingWindowIndexer() + _agg_see_also_doc = dedent( """ See Also @@ -299,27 +319,6 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - def _apply(self, func): - """ - Rolling statistical measure using supplied function. Designed to be - used with passed-in Cython array-based functions. 
- - Parameters - ---------- - func : str/callable to apply - - Returns - ------- - y : same type as input argument - """ - - def homogeneous_func(values: np.ndarray): - if values.size == 0: - return values.copy() - return np.apply_along_axis(func, self.axis, values) - - return self._apply_blockwise(homogeneous_func) - @Substitution(name="ewm", func_name="mean") @Appender(_doc_template) def mean(self, *args, **kwargs): @@ -336,7 +335,6 @@ def mean(self, *args, **kwargs): window_func = self._get_roll_func("ewma_time") window_func = partial( window_func, - minp=self.min_periods, times=self.times, halflife=self.halflife, ) @@ -347,7 +345,6 @@ def mean(self, *args, **kwargs): com=self.com, adjust=self.adjust, ignore_na=self.ignore_na, - minp=self.min_periods, ) return self._apply(window_func) @@ -371,13 +368,19 @@ def var(self, bias: bool = False, *args, **kwargs): Exponential weighted moving variance. """ nv.validate_window_func("var", args, kwargs) + window_func = self._get_roll_func("ewmcov") + window_func = partial( + window_func, + com=self.com, + adjust=self.adjust, + ignore_na=self.ignore_na, + bias=bias, + ) - def f(arg): - return window_aggregations.ewmcov( - arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias - ) + def var_func(values, begin, end, min_periods): + return window_func(values, begin, end, min_periods, values) - return self._apply(f) + return self._apply(var_func) @Substitution(name="ewm", func_name="cov") @Appender(_doc_template) @@ -419,11 +422,13 @@ def _get_cov(X, Y): Y = self._shallow_copy(Y) cov = window_aggregations.ewmcov( X._prep_values(), + np.array([0], dtype=np.int64), + np.array([0], dtype=np.int64), + self.min_periods, Y._prep_values(), self.com, self.adjust, self.ignore_na, - self.min_periods, bias, ) return wrap_result(X, cov) @@ -470,7 +475,15 @@ def _get_corr(X, Y): def _cov(x, y): return window_aggregations.ewmcov( - x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1 + x, + np.array([0], dtype=np.int64), + np.array([0], dtype=np.int64), + self.min_periods, + y, + self.com, + self.adjust, + self.ignore_na, + 1, ) x_values = X._prep_values() @@ -485,3 +498,78 @@ def _cov(x, y): return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) + + +class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow): + """ + Provide an exponential moving window groupby implementation. + """ + + def _get_window_indexer(self) -> GroupbyIndexer: + """ + Return an indexer class that will compute the window start and end bounds + + Returns + ------- + GroupbyIndexer + """ + window_indexer = GroupbyIndexer( + groupby_indicies=self._groupby.indices, + window_indexer=ExponentialMovingWindowIndexer, + ) + return window_indexer + + var = dispatch("var", bias=False) + std = dispatch("std", bias=False) + cov = dispatch("cov", other=None, pairwise=None, bias=False) + corr = dispatch("corr", other=None, pairwise=None) + + def mean(self, engine=None, engine_kwargs=None): + """ + Parameters + ---------- + engine : str, default None + * ``'cython'`` : Runs mean through C-extensions from cython. + * ``'numba'`` : Runs mean through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. 
versionadded:: 1.2.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.2.0 + + Returns + ------- + Series or DataFrame + Return type is determined by the caller. + """ + if maybe_use_numba(engine): + groupby_ewma_func = generate_numba_groupby_ewma_func( + engine_kwargs, + self.com, + self.adjust, + self.ignore_na, + ) + return self._apply( + groupby_ewma_func, + numba_cache_key=(lambda x: x, "groupby_ewma"), + ) + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + + def f(x): + x = self._shallow_copy(x, groupby=self._groupby) + return x.mean() + + return self._groupby.apply(f) + else: + raise ValueError("engine must be either 'numba' or 'cython'") diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index a8229257bb7bb..a3b9695d777d9 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -344,3 +344,18 @@ def get_window_bounds( start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) return start, end + + +class ExponentialMovingWindowIndexer(BaseIndexer): + """Calculate ewm window bounds (the entire window)""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index c4858b6e5a4ab..274586e1745b5 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -72,3 +72,92 @@ def roll_apply( return result return roll_apply + + +def generate_numba_groupby_ewma_func( + engine_kwargs: Optional[Dict[str, bool]], + com: float, + adjust: bool, + ignore_na: bool, +): + """ + Generate a numba jitted groupby ewma function specified by values + from engine_kwargs. 
+ + Parameters + ---------- + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + com : float + adjust : bool + ignore_na : bool + + Returns + ------- + Numba function + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + + cache_key = (lambda x: x, "groupby_ewma") + if cache_key in NUMBA_FUNC_CACHE: + return NUMBA_FUNC_CACHE[cache_key] + + numba = import_optional_dependency("numba") + if parallel: + loop_range = numba.prange + else: + loop_range = range + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def groupby_ewma( + values: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + minimum_periods: int, + ) -> np.ndarray: + result = np.empty(len(values)) + alpha = 1.0 / (1.0 + com) + for i in loop_range(len(begin)): + start = begin[i] + stop = end[i] + window = values[start:stop] + sub_result = np.empty(len(window)) + + old_wt_factor = 1.0 - alpha + new_wt = 1.0 if adjust else alpha + + weighted_avg = window[0] + nobs = int(not np.isnan(weighted_avg)) + sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan + old_wt = 1.0 + + for j in range(1, len(window)): + cur = window[j] + is_observation = not np.isnan(cur) + nobs += is_observation + if not np.isnan(weighted_avg): + + if is_observation or not ignore_na: + + old_wt *= old_wt_factor + if is_observation: + + # avoid numerical errors on constant series + if weighted_avg != cur: + weighted_avg = ( + (old_wt * weighted_avg) + (new_wt * cur) + ) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1.0 + elif is_observation: + weighted_avg = cur + + sub_result[j] = weighted_avg if nobs >= minimum_periods else np.nan + + result[start:stop] = sub_result + + return result + + return groupby_ewma diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f65452cb2f17f..51a1e2102c273 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -337,6 +337,13 @@ def _get_roll_func(self, func_name: str) -> Callable[..., Any]: ) return window_func + @property + def _index_array(self): + # TODO: why do we get here with e.g. MultiIndex? + if needs_i8_conversion(self._on.dtype): + return self._on.asi8 + return None + def _get_window_indexer(self) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds @@ -345,7 +352,7 @@ def _get_window_indexer(self) -> BaseIndexer: return self.window if self.is_freq_type: return VariableWindowIndexer( - index_array=self._on.asi8, window_size=self.window + index_array=self._index_array, window_size=self.window ) return FixedWindowIndexer(window_size=self.window) @@ -405,7 +412,7 @@ def _apply( self, func: Callable[..., Any], name: Optional[str] = None, - use_numba_cache: bool = False, + numba_cache_key: Optional[Tuple[Callable, str]] = None, **kwargs, ): """ @@ -417,9 +424,8 @@ def _apply( ---------- func : callable function to apply name : str, - use_numba_cache : bool - whether to cache a numba compiled function. 
Only available for numba - enabled methods (so far only apply) + numba_cache_key : tuple + caching key to be used to store a compiled numba func **kwargs additional arguments for rolling function and window function @@ -456,8 +462,8 @@ def calc(x): result = calc(values) result = np.asarray(result) - if use_numba_cache: - NUMBA_FUNC_CACHE[(kwargs["original_func"], "rolling_apply")] = func + if numba_cache_key is not None: + NUMBA_FUNC_CACHE[numba_cache_key] = func return result @@ -715,7 +721,7 @@ def aggregate(self, func, *args, **kwargs): ) -def _dispatch(name: str, *args, **kwargs): +def dispatch(name: str, *args, **kwargs): """ Dispatch to groupby apply. """ @@ -746,20 +752,20 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - corr = _dispatch("corr", other=None, pairwise=None) - cov = _dispatch("cov", other=None, pairwise=None) + corr = dispatch("corr", other=None, pairwise=None) + cov = dispatch("cov", other=None, pairwise=None) def _apply( self, func: Callable[..., Any], name: Optional[str] = None, - use_numba_cache: bool = False, + numba_cache_key: Optional[Tuple[Callable, str]] = None, **kwargs, ) -> FrameOrSeries: result = super()._apply( func, name, - use_numba_cache, + numba_cache_key, **kwargs, ) # Reconstruct the resulting MultiIndex from tuples @@ -1038,7 +1044,7 @@ def _apply( self, func: Callable[[np.ndarray, int, int], np.ndarray], name: Optional[str] = None, - use_numba_cache: bool = False, + numba_cache_key: Optional[Tuple[Callable, str]] = None, **kwargs, ): """ @@ -1050,9 +1056,8 @@ def _apply( ---------- func : callable function to apply name : str, - use_numba_cache : bool - whether to cache a numba compiled function. Only available for numba - enabled methods (so far only apply) + use_numba_cache : tuple + unused **kwargs additional arguments for scipy windows if necessary @@ -1292,10 +1297,12 @@ def apply( if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") + numba_cache_key = None if maybe_use_numba(engine): if raw is False: raise ValueError("raw must be `True` when using the numba engine") apply_func = generate_numba_apply_func(args, kwargs, func, engine_kwargs) + numba_cache_key = (func, "rolling_apply") elif engine in ("cython", None): if engine_kwargs is not None: raise ValueError("cython engine does not accept engine_kwargs") @@ -1305,10 +1312,7 @@ def apply( return self._apply( apply_func, - use_numba_cache=maybe_use_numba(engine), - original_func=func, - args=args, - kwargs=kwargs, + numba_cache_key=numba_cache_key, ) def _generate_cython_apply_func( @@ -2143,7 +2147,7 @@ def _get_window_indexer(self) -> GroupbyIndexer: """ rolling_indexer: Type[BaseIndexer] indexer_kwargs: Optional[Dict[str, Any]] = None - index_array = self._on.asi8 + index_array = self._index_array window = self.window if isinstance(self.window, BaseIndexer): rolling_indexer = type(self.window) diff --git a/pandas/io/common.py b/pandas/io/common.py index 695c1671abd61..8ec0a869c7042 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -468,8 +468,11 @@ def infer_compression( ------ ValueError on invalid compression specified. """ + if compression is None: + return None + # Infer compression - if compression in ("infer", None): + if compression == "infer": # Convert all path types (e.g. 
pathlib.Path) to strings filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 425b1da33dbb9..c519baa4c21da 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -316,33 +316,36 @@ def read_excel( "an ExcelFile - ExcelFile already has the engine set" ) - data = io.parse( - sheet_name=sheet_name, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - squeeze=squeeze, - dtype=dtype, - converters=converters, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - verbose=verbose, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, - ) - if should_close: - io.close() + try: + data = io.parse( + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + dtype=dtype, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + keep_default_na=keep_default_na, + na_filter=na_filter, + verbose=verbose, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + ) + finally: + # make sure to close opened file handles + if should_close: + io.close() return data diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index f9a08bf862644..0bea19bec2cdd 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -182,7 +182,7 @@ def _process_style(self, style: Dict[str, Any]) -> str: Returns ------- style_key : str - Unique style key for for later reference in sheet + Unique style key for later reference in sheet """ from odf.style import ( ParagraphProperties, diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 9e63976bf8cf9..422677771b4d0 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -4,12 +4,15 @@ from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc from pandas import DataFrame, Int64Index, RangeIndex +from pandas.core import generic from pandas.io.common import get_handle +@doc(storage_options=generic._shared_docs["storage_options"]) def to_feather( df: DataFrame, path: FilePathOrBuffer[AnyStr], @@ -23,13 +26,7 @@ def to_feather( ---------- df : DataFrame path : string file path, or file-like object - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. + {storage_options} .. 
versionadded:: 1.2.0 @@ -83,6 +80,7 @@ def to_feather( feather.write_feather(df, handles.handle, **kwargs) +@doc(storage_options=generic._shared_docs["storage_options"]) def read_feather( path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None ): @@ -111,13 +109,7 @@ def read_feather( Whether to parallelize reading using multiple threads. .. versionadded:: 0.24.0 - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. + {storage_options} .. versionadded:: 1.2.0 diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index ab9c9fe995008..ea291bcbfa44c 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -78,7 +78,7 @@ def check_main(): def in_ipython_frontend(): """ - Check if we're inside an an IPython zmq frontend. + Check if we're inside an IPython zmq frontend. Returns ------- diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index cbe2ed1ed838d..fbda78a1842ca 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -144,7 +144,7 @@ def _initialize_columns(self, cols: Optional[Sequence[Label]]) -> Sequence[Label self.obj = self.obj.loc[:, cols] # update columns to include possible multiplicity of dupes - # and make sure sure cols is just a list of labels + # and make sure cols is just a list of labels new_cols = self.obj.columns if isinstance(new_cols, ABCIndexClass): return new_cols._format_native_types(**self._number_format) diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index c6179f5c034c7..bded853f383e0 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -5,18 +5,20 @@ from functools import reduce import itertools import re -from typing import Callable, Dict, Mapping, Optional, Sequence, Union +from typing import Callable, Dict, Iterable, Mapping, Optional, Sequence, Union, cast import warnings import numpy as np +from pandas._libs.lib import is_list_like from pandas._typing import Label, StorageOptions +from pandas.util._decorators import doc from pandas.core.dtypes import missing from pandas.core.dtypes.common import is_float, is_scalar -from pandas.core.dtypes.generic import ABCIndex from pandas import DataFrame, Index, MultiIndex, PeriodIndex +from pandas.core import generic import pandas.core.common as com from pandas.io.formats.css import CSSResolver, CSSWarning @@ -29,7 +31,13 @@ class ExcelCell: __slots__ = __fields__ def __init__( - self, row: int, col: int, val, style=None, mergestart=None, mergeend=None + self, + row: int, + col: int, + val, + style=None, + mergestart: Optional[int] = None, + mergeend: Optional[int] = None, ): self.row = row self.col = col @@ -423,7 +431,7 @@ class ExcelFormatter: Format string for floating point numbers cols : sequence, optional Columns to write - header : boolean or list of string, default True + header : boolean or sequence of str, default True Write out column names. 
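Editorial note: the ``to_feather``/``read_feather`` hunks above (and ``ExcelFormatter.write`` later in this patch) replace the copy-pasted ``storage_options`` paragraph with a shared template injected through pandas' ``@doc`` decorator. A minimal sketch of that substitution pattern, using a made-up shared dictionary rather than ``pandas.core.generic._shared_docs``::

    from pandas.util._decorators import doc

    _shared_docs = {
        "storage_options": (
            "storage_options : dict, optional\n"
            "    Extra options forwarded to fsspec for remote URLs."
        )
    }

    @doc(storage_options=_shared_docs["storage_options"])
    def read_something(path, storage_options=None):
        """
        Read data from ``path``.

        Parameters
        ----------
        path : str
        {storage_options}
        """

    # The rendered docstring now contains the shared paragraph in place of
    # the {storage_options} placeholder.
    print(read_something.__doc__)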
If a list of string is given it is assumed to be aliases for the column names index : boolean, default True @@ -522,7 +530,7 @@ def _format_value(self, val): ) return val - def _format_header_mi(self): + def _format_header_mi(self) -> Iterable[ExcelCell]: if self.columns.nlevels > 1: if not self.index: raise NotImplementedError( @@ -530,8 +538,7 @@ def _format_header_mi(self): "index ('index'=False) is not yet implemented." ) - has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) - if not (has_aliases or self.header): + if not (self._has_aliases or self.header): return columns = self.columns @@ -547,28 +554,30 @@ def _format_header_mi(self): if self.merge_cells: # Format multi-index as a merged cells. - for lnum in range(len(level_lengths)): - name = columns.names[lnum] - yield ExcelCell(lnum, coloffset, name, self.header_style) + for lnum, name in enumerate(columns.names): + yield ExcelCell( + row=lnum, + col=coloffset, + val=name, + style=self.header_style, + ) for lnum, (spans, levels, level_codes) in enumerate( zip(level_lengths, columns.levels, columns.codes) ): values = levels.take(level_codes) - for i in spans: - if spans[i] > 1: - yield ExcelCell( - lnum, - coloffset + i + 1, - values[i], - self.header_style, - lnum, - coloffset + i + spans[i], - ) - else: - yield ExcelCell( - lnum, coloffset + i + 1, values[i], self.header_style - ) + for i, span_val in spans.items(): + spans_multiple_cells = span_val > 1 + yield ExcelCell( + row=lnum, + col=coloffset + i + 1, + val=values[i], + style=self.header_style, + mergestart=lnum if spans_multiple_cells else None, + mergeend=( + coloffset + i + span_val if spans_multiple_cells else None + ), + ) else: # Format in legacy format with dots to indicate levels. for i, values in enumerate(zip(*level_strs)): @@ -577,9 +586,8 @@ def _format_header_mi(self): self.rowcounter = lnum - def _format_header_regular(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) - if has_aliases or self.header: + def _format_header_regular(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: coloffset = 0 if self.index: @@ -588,17 +596,11 @@ def _format_header_regular(self): coloffset = len(self.df.index[0]) colnames = self.columns - if has_aliases: - # pandas\io\formats\excel.py:593: error: Argument 1 to "len" - # has incompatible type "Union[Sequence[Optional[Hashable]], - # bool]"; expected "Sized" [arg-type] - if len(self.header) != len(self.columns): # type: ignore[arg-type] - # pandas\io\formats\excel.py:602: error: Argument 1 to - # "len" has incompatible type - # "Union[Sequence[Optional[Hashable]], bool]"; expected - # "Sized" [arg-type] + if self._has_aliases: + self.header = cast(Sequence, self.header) + if len(self.header) != len(self.columns): raise ValueError( - f"Writing {len(self.columns)} cols " # type: ignore[arg-type] + f"Writing {len(self.columns)} cols " f"but got {len(self.header)} aliases" ) else: @@ -609,7 +611,7 @@ def _format_header_regular(self): self.rowcounter, colindex + coloffset, colname, self.header_style ) - def _format_header(self): + def _format_header(self) -> Iterable[ExcelCell]: if isinstance(self.columns, MultiIndex): gen = self._format_header_mi() else: @@ -631,15 +633,14 @@ def _format_header(self): self.rowcounter += 1 return itertools.chain(gen, gen2) - def _format_body(self): + def _format_body(self) -> Iterable[ExcelCell]: if isinstance(self.df.index, MultiIndex): return self._format_hierarchical_rows() else: return self._format_regular_rows() - def 
_format_regular_rows(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) - if has_aliases or self.header: + def _format_regular_rows(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: self.rowcounter += 1 # output index and index_label? @@ -676,9 +677,8 @@ def _format_regular_rows(self): yield from self._generate_body(coloffset) - def _format_hierarchical_rows(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) - if has_aliases or self.header: + def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: self.rowcounter += 1 gcolidx = 0 @@ -721,23 +721,20 @@ def _format_hierarchical_rows(self): fill_value=levels._na_value, ) - for i in spans: - if spans[i] > 1: - yield ExcelCell( - self.rowcounter + i, - gcolidx, - values[i], - self.header_style, - self.rowcounter + i + spans[i] - 1, - gcolidx, - ) - else: - yield ExcelCell( - self.rowcounter + i, - gcolidx, - values[i], - self.header_style, - ) + for i, span_val in spans.items(): + spans_multiple_cells = span_val > 1 + yield ExcelCell( + row=self.rowcounter + i, + col=gcolidx, + val=values[i], + style=self.header_style, + mergestart=( + self.rowcounter + i + span_val - 1 + if spans_multiple_cells + else None + ), + mergeend=gcolidx if spans_multiple_cells else None, + ) gcolidx += 1 else: @@ -745,16 +742,21 @@ def _format_hierarchical_rows(self): for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): yield ExcelCell( - self.rowcounter + idx, - gcolidx, - indexcolval, - self.header_style, + row=self.rowcounter + idx, + col=gcolidx, + val=indexcolval, + style=self.header_style, ) gcolidx += 1 yield from self._generate_body(gcolidx) - def _generate_body(self, coloffset: int): + @property + def _has_aliases(self) -> bool: + """Whether the aliases for column names are present.""" + return is_list_like(self.header) + + def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: if self.styler is None: styles = None else: @@ -771,11 +773,12 @@ def _generate_body(self, coloffset: int): xlstyle = self.style_converter(";".join(styles[i, colidx])) yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) - def get_formatted_cells(self): + def get_formatted_cells(self) -> Iterable[ExcelCell]: for cell in itertools.chain(self._format_header(), self._format_body()): cell.val = self._format_value(cell.val) yield cell + @doc(storage_options=generic._shared_docs["storage_options"]) def write( self, writer, @@ -802,10 +805,7 @@ def write( write engine to use if writer is a path - you can also set this via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and ``io.excel.xlsm.writer``. - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". + {storage_options} .. 
versionadded:: 1.2.0 """ @@ -818,6 +818,7 @@ def write( f"Max sheet size is: {self.max_rows}, {self.max_cols}" ) + formatted_cells = self.get_formatted_cells() if isinstance(writer, ExcelWriter): need_save = False else: @@ -829,13 +830,15 @@ def write( ) need_save = True - formatted_cells = self.get_formatted_cells() - writer.write_cells( - formatted_cells, - sheet_name, - startrow=startrow, - startcol=startcol, - freeze_panes=freeze_panes, - ) - if need_save: - writer.save() + try: + writer.write_cells( + formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + ) + finally: + # make sure to close opened file handles + if need_save: + writer.close() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2fae18bd76657..db34b882a3c35 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -5,7 +5,6 @@ from contextlib import contextmanager from csv import QUOTE_NONE, QUOTE_NONNUMERIC -from datetime import tzinfo import decimal from functools import partial from io import StringIO @@ -36,7 +35,6 @@ from pandas._libs import lib from pandas._libs.missing import NA -from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( @@ -831,7 +829,7 @@ def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: dtypes = self.frame.dtypes._values # if we have a Float level, they don't use leading space at all - restrict_formatting = any(l.is_floating for l in columns.levels) + restrict_formatting = any(level.is_floating for level in columns.levels) need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) def space_format(x, y): @@ -1529,11 +1527,9 @@ def _format_strings(self) -> List[str]: if self.formatter is not None and callable(self.formatter): return [self.formatter(x) for x in values] - fmt_values = format_array_from_datetime( - values.asi8.ravel(), - format=get_format_datetime64_from_values(values, self.date_format), - na_rep=self.nat_rep, - ).reshape(values.shape) + fmt_values = values._data._format_native_types( + na_rep=self.nat_rep, date_format=self.date_format + ) return fmt_values.tolist() @@ -1541,7 +1537,9 @@ class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: values = extract_array(self.values, extract_numpy=True) - formatter = values._formatter(boxed=True) + formatter = self.formatter + if formatter is None: + formatter = values._formatter(boxed=True) if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo @@ -1557,7 +1555,9 @@ def _format_strings(self) -> List[str]: digits=self.digits, space=self.space, justify=self.justify, + decimal=self.decimal, leading_space=self.leading_space, + quoting=self.quoting, ) return fmt_values @@ -1653,30 +1653,21 @@ def is_dates_only( return False -def _format_datetime64( - x: Union[NaTType, Timestamp], tz: Optional[tzinfo] = None, nat_rep: str = "NaT" -) -> str: - if x is None or (is_scalar(x) and isna(x)): +def _format_datetime64(x: Union[NaTType, Timestamp], nat_rep: str = "NaT") -> str: + if x is NaT: return nat_rep - if tz is not None or not isinstance(x, Timestamp): - if getattr(x, "tzinfo", None) is not None: - x = Timestamp(x).tz_convert(tz) - else: - x = Timestamp(x).tz_localize(tz) - return str(x) def _format_datetime64_dateonly( - x: Union[NaTType, Timestamp], nat_rep: str = "NaT", date_format: None = 
None + x: Union[NaTType, Timestamp], + nat_rep: str = "NaT", + date_format: Optional[str] = None, ) -> str: - if x is None or (is_scalar(x) and isna(x)): + if x is NaT: return nat_rep - if not isinstance(x, Timestamp): - x = Timestamp(x) - if date_format: return x.strftime(date_format) else: @@ -1684,15 +1675,15 @@ def _format_datetime64_dateonly( def get_format_datetime64( - is_dates_only: bool, nat_rep: str = "NaT", date_format: None = None + is_dates_only: bool, nat_rep: str = "NaT", date_format: Optional[str] = None ) -> Callable: if is_dates_only: - return lambda x, tz=None: _format_datetime64_dateonly( + return lambda x: _format_datetime64_dateonly( x, nat_rep=nat_rep, date_format=date_format ) else: - return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) + return lambda x: _format_datetime64(x, nat_rep=nat_rep) def get_format_datetime64_from_values( diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 891b3ea7af0e2..98bd159c567b1 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,10 +1,20 @@ from abc import ABC, abstractmethod import sys -from typing import IO, TYPE_CHECKING, Iterator, List, Mapping, Optional, Sequence, Union +from typing import ( + IO, + TYPE_CHECKING, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Union, +) from pandas._config import get_option -from pandas._typing import Dtype, FrameOrSeries +from pandas._typing import Dtype, FrameOrSeriesUnion from pandas.core.indexes.api import Index @@ -13,7 +23,6 @@ if TYPE_CHECKING: from pandas.core.frame import DataFrame - from pandas.core.series import Series def _put_str(s: Union[str, Dtype], space: int) -> str: @@ -83,11 +92,12 @@ def _initialize_memory_usage( class BaseInfo(ABC): - """Base class for DataFrameInfo and SeriesInfo. + """ + Base class for DataFrameInfo and SeriesInfo. Parameters ---------- - data : FrameOrSeries + data : DataFrame or Series Either dataframe or series. memory_usage : bool or str, optional If "deep", introspect the data deeply by interrogating object dtypes @@ -95,18 +105,20 @@ class BaseInfo(ABC): values. """ - def __init__( - self, - data: FrameOrSeries, - memory_usage: Optional[Union[bool, str]] = None, - ): - self.data = data - self.memory_usage = _initialize_memory_usage(memory_usage) + data: FrameOrSeriesUnion + memory_usage: Union[bool, str] @property @abstractmethod - def ids(self) -> Index: - """Column names or index names.""" + def dtypes(self) -> Iterable[Dtype]: + """ + Dtypes. + + Returns + ------- + dtypes : sequence + Dtype of each of the DataFrame's columns (or one series column). + """ @property @abstractmethod @@ -120,30 +132,15 @@ def non_null_counts(self) -> Sequence[int]: @property @abstractmethod - def dtypes(self) -> "Series": - """Dtypes. - - Returns - ------- - dtypes : Series - Dtype of each of the DataFrame's columns. - """ - return self.data.dtypes - - @property def memory_usage_bytes(self) -> int: - """Memory usage in bytes. + """ + Memory usage in bytes. Returns ------- memory_usage_bytes : int Object's total memory usage in bytes. """ - if self.memory_usage == "deep": - deep = True - else: - deep = False - return self.data.memory_usage(index=True, deep=deep).sum() @property def memory_usage_string(self) -> str: @@ -165,49 +162,8 @@ def size_qualifier(self) -> str: size_qualifier = "+" return size_qualifier - -class DataFrameInfo(BaseInfo): - """Class storing dataframe-specific info.""" - - @property - def ids(self) -> Index: - """Column names. 
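Editorial note: earlier in this hunk the scalar datetime formatters drop the broader ``is_scalar``/``isna`` checks in favour of an identity test against ``NaT``. That works because, per the type annotations, the per-element formatter only receives ``Timestamp`` or ``NaT``, and ``NaT`` is a singleton. A quick illustrative check::

    import pandas as pd
    from pandas import NaT

    x = pd.to_datetime(["2020-01-01", None])[1]
    x is NaT      # True: NaT is a singleton, so the identity check suffices here
    pd.isna(x)    # also True, but covers None, np.nan, pd.NA, ... as well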
- - Returns - ------- - ids : Index - DataFrame's column names. - """ - return self.data.columns - - @property - def dtypes(self) -> "Series": - """Dtypes. - - Returns - ------- - dtypes : Series - Dtype of each of the DataFrame's columns. - """ - return self.data.dtypes - - @property - def dtype_counts(self) -> Mapping[str, int]: - """Mapping dtype - number of counts.""" - # groupby dtype.name to collect e.g. Categorical columns - return self.dtypes.value_counts().groupby(lambda x: x.name).sum() - - @property - def non_null_counts(self) -> Sequence[int]: - """Sequence of non-null counts for all columns.""" - return self.data.count() - - @property - def col_count(self) -> int: - """Number of columns to be summarized.""" - return len(self.ids) - - def to_buffer( + @abstractmethod + def render( self, *, buf: Optional[IO[str]], @@ -220,6 +176,7 @@ def to_buffer( This method prints information about a %(klass)s including the index dtype%(type_sub)s, non-null values and memory usage. + %(version_added_sub)s\ Parameters ---------- @@ -246,12 +203,7 @@ def to_buffer( consume the same memory amount for corresponding dtypes. With deep memory introspection, a real memory usage calculation is performed at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. + %(show_counts_sub)s Returns ------- @@ -266,7 +218,76 @@ def to_buffer( -------- %(examples_sub)s """ - printer = InfoPrinter( + + +class DataFrameInfo(BaseInfo): + """ + Class storing dataframe-specific info. + """ + + def __init__( + self, + data: "DataFrame", + memory_usage: Optional[Union[bool, str]] = None, + ): + self.data: "DataFrame" = data + self.memory_usage = _initialize_memory_usage(memory_usage) + + @property + def dtype_counts(self) -> Mapping[str, int]: + return _get_dataframe_dtype_counts(self.data) + + @property + def dtypes(self) -> Iterable[Dtype]: + """ + Dtypes. + + Returns + ------- + dtypes + Dtype of each of the DataFrame's columns. + """ + return self.data.dtypes + + @property + def ids(self) -> Index: + """ + Column names. + + Returns + ------- + ids : Index + DataFrame's column names. + """ + return self.data.columns + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return len(self.ids) + + @property + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns or column (if series).""" + return self.data.count() + + @property + def memory_usage_bytes(self) -> int: + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep).sum() + + def render( + self, + *, + buf: Optional[IO[str]], + max_cols: Optional[int], + verbose: Optional[bool], + show_counts: Optional[bool], + ) -> None: + printer = DataFrameInfoPrinter( info=self, max_cols=max_cols, verbose=verbose, @@ -275,8 +296,27 @@ def to_buffer( printer.to_buffer(buf) -class InfoPrinter: - """Class for printing dataframe or series info. +class InfoPrinterAbstract: + """ + Class for printing dataframe or series info. 
+ """ + + def to_buffer(self, buf: Optional[IO[str]] = None) -> None: + """Save dataframe info into buffer.""" + table_builder = self._create_table_builder() + lines = table_builder.get_lines() + if buf is None: # pragma: no cover + buf = sys.stdout + fmt.buffer_put_lines(buf, lines) + + @abstractmethod + def _create_table_builder(self) -> "TableBuilderAbstract": + """Create instance of table builder.""" + + +class DataFrameInfoPrinter(InfoPrinterAbstract): + """ + Class for printing dataframe info. Parameters ---------- @@ -334,14 +374,6 @@ def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: else: return show_counts - def to_buffer(self, buf: Optional[IO[str]] = None) -> None: - """Save dataframe info into buffer.""" - table_builder = self._create_table_builder() - lines = table_builder.get_lines() - if buf is None: # pragma: no cover - buf = sys.stdout - fmt.buffer_put_lines(buf, lines) - def _create_table_builder(self) -> "DataFrameTableBuilder": """ Create instance of table builder based on verbosity and display settings. @@ -364,26 +396,73 @@ def _create_table_builder(self) -> "DataFrameTableBuilder": class TableBuilderAbstract(ABC): - """Abstract builder for info table. - - Parameters - ---------- - info : BaseInfo - Instance of DataFrameInfo or SeriesInfo. + """ + Abstract builder for info table. """ _lines: List[str] - - def __init__(self, *, info): - self.info = info + info: BaseInfo @abstractmethod def get_lines(self) -> List[str]: """Product in a form of list of lines (strings).""" + @property + def data(self) -> FrameOrSeriesUnion: + return self.info.data + + @property + def dtypes(self) -> Iterable[Dtype]: + """Dtypes of each of the DataFrame's columns.""" + return self.info.dtypes + + @property + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + return self.info.dtype_counts + + @property + def display_memory_usage(self) -> bool: + """Whether to display memory usage.""" + return bool(self.info.memory_usage) + + @property + def memory_usage_string(self) -> str: + """Memory usage string with proper size qualifier.""" + return self.info.memory_usage_string + + @property + def non_null_counts(self) -> Sequence[int]: + return self.info.non_null_counts + + def add_object_type_line(self) -> None: + """Add line with string representation of dataframe to the table.""" + self._lines.append(str(type(self.data))) + + def add_index_range_line(self) -> None: + """Add line with range of indices to the table.""" + self._lines.append(self.data.index._summary()) + + def add_dtypes_line(self) -> None: + """Add summary line with dtypes present in dataframe.""" + collected_dtypes = [ + f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) + ] + self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") + class DataFrameTableBuilder(TableBuilderAbstract): - """Abstract builder for dataframe info table.""" + """ + Abstract builder for dataframe info table. + + Parameters + ---------- + info : DataFrameInfo. + Instance of DataFrameInfo. 
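Editorial note: ``add_dtypes_line`` above renders the ``dtype_counts`` mapping into the familiar ``dtypes: float64(1), int64(1)`` footer of ``DataFrame.info()``. The mapping itself is just a grouped value count over the column dtypes, e.g.::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5], "c": pd.Categorical(["x", "y"])})

    # Same expression as _get_dataframe_dtype_counts later in this patch;
    # grouping by dtype.name collapses e.g. distinct Categorical dtypes into one bucket.
    counts = df.dtypes.value_counts().groupby(lambda x: x.name).sum()
    # category    1
    # float64     1
    # int64       1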
+ """ + + def __init__(self, *, info: DataFrameInfo): + self.info: DataFrameInfo = info def get_lines(self) -> List[str]: self._lines = [] @@ -399,144 +478,62 @@ def _fill_empty_info(self) -> None: self.add_index_range_line() self._lines.append(f"Empty {type(self.data).__name__}") + @abstractmethod def _fill_non_empty_info(self) -> None: """Add lines to the info table, pertaining to non-empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self.add_columns_summary_line() - self.add_header_line() - self.add_separator_line() - self.add_body_lines() - self.add_dtypes_line() - if self.display_memory_usage: - self.add_memory_usage_line() @property def data(self) -> "DataFrame": """DataFrame.""" return self.info.data - @property - def dtype_counts(self) -> Mapping[str, int]: - """Mapping dtype - number of counts.""" - return self.info.dtype_counts - - @property - def non_null_counts(self) -> Sequence[int]: - return self.info.non_null_counts - - @property - def display_memory_usage(self) -> bool: - """Whether to display memory usage.""" - return self.info.memory_usage - - @property - def memory_usage_string(self) -> str: - """Memory usage string with proper size qualifier.""" - return self.info.memory_usage_string - @property def ids(self) -> Index: """Dataframe columns.""" return self.info.ids - @property - def dtypes(self) -> "Series": - """Dtypes of each of the DataFrame's columns.""" - return self.info.dtypes - @property def col_count(self) -> int: """Number of dataframe columns to be summarized.""" return self.info.col_count - def add_object_type_line(self) -> None: - """Add line with string representation of dataframe to the table.""" - self._lines.append(str(type(self.data))) - - def add_index_range_line(self) -> None: - """Add line with range of indices to the table.""" - self._lines.append(self.data.index._summary()) - - @abstractmethod - def add_columns_summary_line(self) -> None: - """Add line with columns summary to the table.""" - - @abstractmethod - def add_header_line(self) -> None: - """Add header line to the table.""" - - @abstractmethod - def add_separator_line(self) -> None: - """Add separator line between header and body of the table.""" - - @abstractmethod - def add_body_lines(self) -> None: - """Add content of the table body.""" - - def add_dtypes_line(self) -> None: - """Add summary line with dtypes present in dataframe.""" - collected_dtypes = [ - f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) - ] - self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") - def add_memory_usage_line(self) -> None: """Add line containing memory usage.""" self._lines.append(f"memory usage: {self.memory_usage_string}") class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): - """Info table builder for non-verbose output.""" + """ + Dataframe info table builder for non-verbose output. 
+ """ + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() def add_columns_summary_line(self) -> None: self._lines.append(self.ids._summary(name="Columns")) - def add_header_line(self) -> None: - """No header in non-verbose output.""" - - def add_separator_line(self) -> None: - """No separator in non-verbose output.""" - def add_body_lines(self) -> None: - """No body in non-verbose output.""" - - -class DataFrameTableBuilderVerbose(DataFrameTableBuilder): - """Info table builder for verbose output.""" - - SPACING = " " * 2 +class TableBuilderVerboseMixin(TableBuilderAbstract): + """ + Mixin for verbose info output. + """ - def __init__( - self, - *, - info: DataFrameInfo, - with_counts: bool, - ): - super().__init__(info=info) - self.with_counts = with_counts - self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) - self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + SPACING: str = " " * 2 + strrows: Sequence[Sequence[str]] + gross_column_widths: Sequence[int] + with_counts: bool @property + @abstractmethod def headers(self) -> Sequence[str]: """Headers names of the columns in verbose table.""" - if self.with_counts: - return [" # ", "Column", "Non-Null Count", "Dtype"] - return [" # ", "Column", "Dtype"] - - def _gen_rows(self) -> Iterator[Sequence[str]]: - """Generator function yielding rows content. - - Each element represents a row comprising a sequence of strings. - """ - if self.with_counts: - return self._gen_rows_with_counts() - else: - return self._gen_rows_without_counts() - - def add_columns_summary_line(self) -> None: - self._lines.append(f"Data columns (total {self.col_count} columns):") @property def header_column_widths(self) -> Sequence[int]: @@ -556,6 +553,25 @@ def _get_body_column_widths(self) -> Sequence[int]: strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) return [max(len(x) for x in col) for col in strcols] + def _gen_rows(self) -> Iterator[Sequence[str]]: + """ + Generator function yielding rows content. + + Each element represents a row comprising a sequence of strings. + """ + if self.with_counts: + return self._gen_rows_with_counts() + else: + return self._gen_rows_without_counts() + + @abstractmethod + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + + @abstractmethod + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + def add_header_line(self) -> None: header_line = self.SPACING.join( [ @@ -586,6 +602,55 @@ def add_body_lines(self) -> None: ) self._lines.append(body_line) + def _gen_non_null_counts(self) -> Iterator[str]: + """Iterator with string representation of non-null counts.""" + for count in self.non_null_counts: + yield f"{count} non-null" + + def _gen_dtypes(self) -> Iterator[str]: + """Iterator with string representation of column dtypes.""" + for dtype in self.dtypes: + yield pprint_thing(dtype) + + +class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): + """ + Dataframe info table builder for verbose output. 
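Editorial note: the two builders above correspond to the two shapes of ``DataFrame.info()`` output; roughly (exact output depends on display options)::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", None]})

    # Verbose: per-column table built by DataFrameTableBuilderVerbose
    # (header, separator and one body line per column).
    df.info(verbose=True)

    # Non-verbose: compact summary built by DataFrameTableBuilderNonVerbose
    # (column range, dtypes and memory usage lines only).
    df.info(verbose=False)

    # "deep" introspection feeds memory_usage_bytes on the info object.
    df.info(memory_usage="deep")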
+ """ + + def __init__( + self, + *, + info: DataFrameInfo, + with_counts: bool, + ): + self.info = info + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return [" # ", "Column", "Non-Null Count", "Dtype"] + return [" # ", "Column", "Dtype"] + + def add_columns_summary_line(self) -> None: + self._lines.append(f"Data columns (total {self.col_count} columns):") + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: """Iterator with string representation of body data without counts.""" yield from zip( @@ -613,12 +678,10 @@ def _gen_columns(self) -> Iterator[str]: for col in self.ids: yield pprint_thing(col) - def _gen_dtypes(self) -> Iterator[str]: - """Iterator with string representation of column dtypes.""" - for dtype in self.dtypes: - yield pprint_thing(dtype) - def _gen_non_null_counts(self) -> Iterator[str]: - """Iterator with string representation of non-null counts.""" - for count in self.non_null_counts: - yield f"{count} non-null" +def _get_dataframe_dtype_counts(df: "DataFrame") -> Mapping[str, int]: + """ + Create mapping between datatypes and their number of occurences. + """ + # groupby dtype.name to collect e.g. Categorical columns + return df.dtypes.value_counts().groupby(lambda x: x.name).sum() diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 72b07000146b2..ac453839792f3 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -308,7 +308,7 @@ def format_object_summary( name : name, optional defaults to the class name of the obj indent_for_name : bool, default True - Whether subsequent lines should be be indented to + Whether subsequent lines should be indented to align with the name. line_break_each_value : bool, default False If True, inserts a line break for each value of ``obj``. diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 2f3416cbf2d87..0eeff44d0f74c 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1,7 +1,6 @@ """ Module for applying conditional formatting to DataFrames and Series. """ - from collections import defaultdict from contextlib import contextmanager import copy @@ -33,6 +32,7 @@ import pandas as pd from pandas.api.types import is_dict_like, is_list_like +from pandas.core import generic import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame @@ -204,7 +204,11 @@ def _repr_html_(self) -> str: """ return self.render() - @doc(NDFrame.to_excel, klass="Styler") + @doc( + NDFrame.to_excel, + klass="Styler", + storage_options=generic._shared_docs["storage_options"], + ) def to_excel( self, excel_writer, @@ -561,7 +565,6 @@ def set_td_classes(self, classes: DataFrame) -> "Styler": ' 1' ' ' '' - """ classes = classes.reindex_like(self.data) @@ -900,7 +903,7 @@ def set_table_attributes(self, attributes: str) -> "Styler": Set the table attributes. 
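Editorial note: for reference, ``Styler.set_table_attributes`` (whose docstring is touched just above) simply injects extra attributes into the opening ``<table>`` tag of the rendered HTML, alongside the automatically generated ``id``. A small usage sketch::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    styler = df.style.set_table_attributes('class="pure-table"')
    html = styler.render()  # opening tag becomes <table class="pure-table" id="...">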
These are the items that show up in the opening ```` tag - in addition to to automatic (by default) id. + in addition to automatic (by default) id. Parameters ---------- @@ -987,20 +990,46 @@ def set_caption(self, caption: str) -> "Styler": self.caption = caption return self - def set_table_styles(self, table_styles) -> "Styler": + def set_table_styles(self, table_styles, axis=0, overwrite=True) -> "Styler": """ Set the table styles on a Styler. These are placed in a ``