From 7cd25a227259b540444480b89170e22399f70737 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 13 Jun 2022 16:03:21 -0400 Subject: [PATCH 001/136] add-recommended-dependencies-as-extras_require-in-setup.cfg See issue #47335. https://github.com/pandas-dev/pandas/issues/47335 recommended dependencies should have package mgmt facilitated through pandas. This will make mgmt of pandas in production docker environments a lot simpler. --- setup.cfg | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.cfg b/setup.cfg index d3c4fe0cb35ce..e174b75df1ee0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,6 +50,12 @@ test = hypothesis>=5.5.3 pytest>=6.0 pytest-xdist>=1.31 +# optional extras for recommended dependencies +# see: doc/source/getting_started/install.rst +numexpr = + numexpr>=2.7.1 +bottleneck = + bottleneck>=1.3.1 [build_ext] inplace = True From eab49c777de80178493e9cba967cbdeecb07ec61 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 13 Jun 2022 16:12:03 -0400 Subject: [PATCH 002/136] Update v1.4.3.rst --- doc/source/whatsnew/v1.4.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index ca8b8ca15ec47..551e665eef943 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -44,7 +44,7 @@ Bug fixes Other ~~~~~ - The minimum version of Cython needed to compile pandas is now ``0.29.30`` (:issue:`41935`) -- +- Recommended dependencies `numexpr` & `bottleneck` can be managed as extras in a requirements/setup file pandas[numexpr,bottleneck]>=1.4.3 (:issue:`47335`) .. 
--------------------------------------------------------------------------- From 645f0467cadc17aa42ba6b70f48613aedc6357a9 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 13 Jun 2022 16:28:53 -0400 Subject: [PATCH 003/136] double backtick code in rst --- doc/source/whatsnew/v1.4.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 551e665eef943..d80d28a23be60 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -44,7 +44,7 @@ Bug fixes Other ~~~~~ - The minimum version of Cython needed to compile pandas is now ``0.29.30`` (:issue:`41935`) -- Recommended dependencies `numexpr` & `bottleneck` can be managed as extras in a requirements/setup file pandas[numexpr,bottleneck]>=1.4.3 (:issue:`47335`) +- Recommended dependencies ``numexpr`` & ``bottleneck`` can be managed as extras in a requirements/setup file eg. ``pandas[numexpr,bottleneck]>=1.4.3`` (:issue:`47335`) .. --------------------------------------------------------------------------- From 897d98d1bcdb5d5832ff8982c8fa55141c326f23 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 16 Jun 2022 14:31:02 -0400 Subject: [PATCH 004/136] rebundle under extras_require `recommended` --- doc/source/whatsnew/v1.4.3.rst | 2 +- setup.cfg | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index d80d28a23be60..10b60e5629917 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -44,7 +44,7 @@ Bug fixes Other ~~~~~ - The minimum version of Cython needed to compile pandas is now ``0.29.30`` (:issue:`41935`) -- Recommended dependencies ``numexpr`` & ``bottleneck`` can be managed as extras in a requirements/setup file eg. ``pandas[numexpr,bottleneck]>=1.4.3`` (:issue:`47335`) +- Recommended dependencies ``numexpr`` & ``bottleneck`` can be managed as extras in a requirements/setup file eg. 
``pandas[recommended]>=1.4.3`` (:issue:`47335`) .. --------------------------------------------------------------------------- diff --git a/setup.cfg b/setup.cfg index e174b75df1ee0..e18cec6efd56f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,9 +52,8 @@ test = pytest-xdist>=1.31 # optional extras for recommended dependencies # see: doc/source/getting_started/install.rst -numexpr = +recommended = numexpr>=2.7.1 -bottleneck = bottleneck>=1.3.1 [build_ext] From 3cbcba626d33094bcd39aaa50c585ee4ef7201b6 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Tue, 12 Jul 2022 16:47:59 -0400 Subject: [PATCH 005/136] [options.extras_require] bundled dependencies by featureset see #39164 for discussion. --- setup.cfg | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index e18cec6efd56f..6593f53de7603 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,9 +53,51 @@ test = # optional extras for recommended dependencies # see: doc/source/getting_started/install.rst recommended = - numexpr>=2.7.1 bottleneck>=1.3.1 - + numba>=0.50.1 + numexpr>=2.7.1 +computation = + numba>=0.50.1 + scipy >=1.4.1 + xarray>=0.15.1 +s3 = + boto3>=1.22.7 + s3fs>=0.4.0 +gcp = + gcsfs>=0.6.0 + pandas-gbq>=0.14.0 +azure = + adlfs >=0.6.0 +excel = + openpyxl >=3.0.3 + pyxlsb >=1.0.6 + xlrd >=2.0.1 + xlwt >=1.3.0 + xlsxwriter>=1.2.2 +parquet = + pyarrow>=1.0.1 +feather = + pyarrow>=1.0.1 +hdf5 = + blosc>=1.20.1 + PyTables>=3.6.1 +sql-postgressql = + SQLAlchemy>=1.4.0 + psycopg2>=2.8.4 +ssql-mysql = + SQLAlchemy>=1.4.0 + pymysql>=0.10.1 +sql-other = + SQLAlchemy>=1.4.0 +html = + BeautifulSoup4>=4.8.2 + html5lib>=1.1 + lxml>=4.5.0 +plot = + matplotlib>=3.3.2 +table = + jinja2>=2.11 + tabulate>=0.8.7 [build_ext] inplace = True From 80842b1b53fda987576de391bc703e7ef0f3967c Mon Sep 17 00:00:00 2001 From: Jonathan Date: Tue, 12 Jul 2022 16:48:14 -0400 Subject: [PATCH 006/136] note: [options.extras_require] bundled dependencies by featureset 
--- doc/source/whatsnew/v1.4.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 10b60e5629917..7e590b743661b 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -44,7 +44,7 @@ Bug fixes Other ~~~~~ - The minimum version of Cython needed to compile pandas is now ``0.29.30`` (:issue:`41935`) -- Recommended dependencies ``numexpr`` & ``bottleneck`` can be managed as extras in a requirements/setup file eg. ``pandas[recommended]>=1.4.3`` (:issue:`47335`) +- Optional Pandas dependencies can be managed now be managed as extras in a requirements/setup file eg. ``pandas[recommended, s3]>=1.4.3``. Available optional dependencies are ``[recommended, computation, s3, gcp, azure, excel, parquet, feather,hdf5, sql-postgressql, ssql-mysql, sql-other, html, plot, table]`` (:issue:`39164`) .. --------------------------------------------------------------------------- From 0027238c91d987a0f511e83fa567e74574ad8393 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Tue, 12 Jul 2022 17:10:31 -0400 Subject: [PATCH 007/136] Update setup.cfg rollback numba from recommended. This would necessitate and update to documentation that requires broad agreement from pd-dev-core team that would slow down the overall PR --- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 19db82634f2a3..50715397df9b7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -59,7 +59,6 @@ test = # see: doc/source/getting_started/install.rst recommended = bottleneck>=1.3.1 - numba>=0.50.1 numexpr>=2.7.1 computation = numba>=0.50.1 From 9f2beee88b72008e83ba6e8c8a20b874e266fbbe Mon Sep 17 00:00:00 2001 From: Jonathan Date: Tue, 12 Jul 2022 17:18:11 -0400 Subject: [PATCH 008/136] add adlfs for azure to `access data in cloud` see #39164 for discussion. 
0.6.0 might be an overly restrictive version, but it is compatible --- doc/source/getting_started/install.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 39c9db2c883b8..94244ea9634ac 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -400,6 +400,7 @@ fsspec 0.7.4 Handling files aside from simple lo gcsfs 0.6.0 Google Cloud Storage access pandas-gbq 0.14.0 Google Big Query access s3fs 0.4.0 Amazon S3 access +adlfs 0.6.0 Microsoft Azure access ========================= ================== ============================================================= Clipboard From 331423d801383817ae9ff361a5057414971bdce9 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Tue, 12 Jul 2022 18:04:59 -0400 Subject: [PATCH 009/136] fix extras_require: PyTables is actually `tables` on PyPi --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 50715397df9b7..760013e7554d6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,7 +84,7 @@ feather = pyarrow>=1.0.1 hdf5 = blosc>=1.20.1 - PyTables>=3.6.1 + tables>=3.6.1 sql-postgressql = SQLAlchemy>=1.4.0 psycopg2>=2.8.4 @@ -94,7 +94,7 @@ ssql-mysql = sql-other = SQLAlchemy>=1.4.0 html = - BeautifulSoup4>=4.8.2 + beautifulsoup4>=4.8.2 html5lib>=1.1 lxml>=4.5.0 plot = From afc89a0ec1e1b65e325728db733567cc4fc0bbac Mon Sep 17 00:00:00 2001 From: Jonathan Date: Tue, 12 Jul 2022 18:26:15 -0400 Subject: [PATCH 010/136] Update setup.cfg --- setup.cfg | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.cfg b/setup.cfg index 760013e7554d6..59e8440784a94 100644 --- a/setup.cfg +++ b/setup.cfg @@ -73,10 +73,10 @@ gcp = azure = adlfs >=0.6.0 excel = - openpyxl >=3.0.3 - pyxlsb >=1.0.6 - xlrd >=2.0.1 - xlwt >=1.3.0 + openpyxl>=3.0.3 + pyxlsb>=1.0.6 + xlrd>=2.0.1 + xlwt>=1.3.0 xlsxwriter>=1.2.2 parquet = pyarrow>=1.0.1 From 
1a86f0c976110da9107b9933ab4a6091f387ca50 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Wed, 13 Jul 2022 12:35:22 -0400 Subject: [PATCH 011/136] add `all` option to [options.extras_require] --- setup.cfg | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/setup.cfg b/setup.cfg index 59e8440784a94..47a45e90064a1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -102,6 +102,38 @@ plot = table = jinja2>=2.11 tabulate>=0.8.7 +# `all` supersets above options and individual compression libraries +all = + adlfs >=0.6.0 + beautifulsoup4>=4.8.2 + blosc>=1.20.1 + bottleneck>=1.3.1 + boto3>=1.22.7 + brotli>=0.7.0 + gcsfs>=0.6.0 + html5lib>=1.1 + jinja2>=2.11 + lxml>=4.5.0 + matplotlib>=3.3.2 + numba>=0.50.1 + numexpr>=2.7.1 + openpyxl>=3.0.3 + pandas-gbq>=0.14.0 + psycopg2>=2.8.4 + pyarrow>=1.0.1 + pymysql>=0.10.1 + python-snappy>=0.6.0 + pyxlsb>=1.0.6 + scipy >=1.4.1 + s3fs>=0.4.0 + SQLAlchemy>=1.4.0 + tables>=3.6.1 + tabulate>=0.8.7 + xarray>=0.15.1 + xlrd>=2.0.1 + xlwt>=1.3.0 + xlsxwriter>=1.2.2 + zstandard>=0.15.2 [build_ext] inplace = True From d0d87ead9312161245c421fe9853ef522b64e3f5 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 14 Jul 2022 15:39:40 -0400 Subject: [PATCH 012/136] moved changelog to 1.4.4 as 1.4.3 released while this PR was stalled --- doc/source/whatsnew/v1.4.3.rst | 1 - doc/source/whatsnew/v1.4.4.rst | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 294cead312071..70b451a231453 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -61,7 +61,6 @@ Bug fixes Other ~~~~~ - The minimum version of Cython needed to compile pandas is now ``0.29.30`` (:issue:`41935`) -- Optional Pandas dependencies can be managed now be managed as extras in a requirements/setup file eg. ``pandas[recommended, s3]>=1.4.3``. 
Available optional dependencies are ``[recommended, computation, s3, gcp, azure, excel, parquet, feather,hdf5, sql-postgressql, ssql-mysql, sql-other, html, plot, table]`` (:issue:`39164`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 6ee140f59e096..2990bd27f8e82 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -32,8 +32,7 @@ Bug fixes Other ~~~~~ -- -- +- Optional Pandas dependencies can be managed now be managed as extras in a requirements/setup file eg. ``pandas[recommended, s3]>=1.4.3``. Available optional dependencies are ``[all, recommended, computation, s3, gcp, azure, excel, parquet, feather,hdf5, sql-postgressql, ssql-mysql, sql-other, html, plot, table]`` (:issue:`39164`) .. --------------------------------------------------------------------------- From 98f4b12b3e3023cdfd4b243051574d4b45dba11b Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 14 Jul 2022 16:58:05 -0400 Subject: [PATCH 013/136] Updated to 1.5.0 compliance --- doc/source/whatsnew/v1.4.4.rst | 3 +- doc/source/whatsnew/v1.5.0.rst | 12 ++++++ setup.cfg | 68 +++++++++++++++++----------------- 3 files changed, 48 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 2990bd27f8e82..6ee140f59e096 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -32,7 +32,8 @@ Bug fixes Other ~~~~~ -- Optional Pandas dependencies can be managed now be managed as extras in a requirements/setup file eg. ``pandas[recommended, s3]>=1.4.3``. Available optional dependencies are ``[all, recommended, computation, s3, gcp, azure, excel, parquet, feather,hdf5, sql-postgressql, ssql-mysql, sql-other, html, plot, table]`` (:issue:`39164`) +- +- .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a6408b940119d..7c4300f993c47 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -245,6 +245,18 @@ and attributes without holding entire tree in memory (:issue:`45442`). .. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk .. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse +.. _whatsnew_150.enhancements.Optional_dependency_management: + +Optional dependencies version management +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Optional Pandas dependencies can be managed as extras in a requirements/setup file, for example: + +.. code-block:: python + + pandas[recommended, s3]>=1.5.0 + +Available optional dependencies are ``[all, recommended, computation, s3, gcp, azure, excel, parquet, feather,hdf5, sql-postgressql, ssql-mysql, sql-other, html, plot, table]`` (:issue:`39164`) + .. 
_whatsnew_150.enhancements.other: Other enhancements diff --git a/setup.cfg b/setup.cfg index 47a45e90064a1..155bb4c0ff0e5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,26 +58,26 @@ test = # optional extras for recommended dependencies # see: doc/source/getting_started/install.rst recommended = - bottleneck>=1.3.1 + bottleneck>=1.3.2 numexpr>=2.7.1 computation = - numba>=0.50.1 - scipy >=1.4.1 - xarray>=0.15.1 + numba>=0.53.0 + scipy >=1.7.1 + xarray>=0.19.0 s3 = boto3>=1.22.7 s3fs>=0.4.0 gcp = - gcsfs>=0.6.0 - pandas-gbq>=0.14.0 + gcsfs>=2021.05.0 + pandas-gbq>=0.15.0 azure = adlfs >=0.6.0 excel = - openpyxl>=3.0.3 - pyxlsb>=1.0.6 + openpyxl>=3.0.7 + pyxlsb>=1.0.8 xlrd>=2.0.1 xlwt>=1.3.0 - xlsxwriter>=1.2.2 + xlsxwriter>=1.4.3 parquet = pyarrow>=1.0.1 feather = @@ -86,53 +86,53 @@ hdf5 = blosc>=1.20.1 tables>=3.6.1 sql-postgressql = - SQLAlchemy>=1.4.0 - psycopg2>=2.8.4 + SQLAlchemy>=1.4.16 + psycopg2>=2.8.6 ssql-mysql = - SQLAlchemy>=1.4.0 - pymysql>=0.10.1 + SQLAlchemy>=1.4.16 + pymysql>=1.0.2 sql-other = - SQLAlchemy>=1.4.0 + SQLAlchemy>=1.4.16 html = - beautifulsoup4>=4.8.2 + beautifulsoup4>=4.9.3 html5lib>=1.1 - lxml>=4.5.0 + lxml>=4.6.3 plot = matplotlib>=3.3.2 table = - jinja2>=2.11 - tabulate>=0.8.7 + jinja2>=3.0.0 + tabulate>=0.8.9 # `all` supersets above options and individual compression libraries all = adlfs >=0.6.0 - beautifulsoup4>=4.8.2 - blosc>=1.20.1 + beautifulsoup4>=4.9.3 + blosc>=1.21.0 bottleneck>=1.3.1 boto3>=1.22.7 brotli>=0.7.0 - gcsfs>=0.6.0 + gcsfs>=2021.05.0 html5lib>=1.1 - jinja2>=2.11 - lxml>=4.5.0 + jinja2>=3.0.0 + lxml>=4.6.3 matplotlib>=3.3.2 - numba>=0.50.1 + numba>=0.53.0 numexpr>=2.7.1 - openpyxl>=3.0.3 - pandas-gbq>=0.14.0 - psycopg2>=2.8.4 + openpyxl>=3.0.7 + pandas-gbq>=0.15.0 + psycopg2>=2.8.6 pyarrow>=1.0.1 - pymysql>=0.10.1 + pymysql>=1.0.2 python-snappy>=0.6.0 - pyxlsb>=1.0.6 - scipy >=1.4.1 + pyxlsb>=1.0.8 + scipy >=1.7.1 s3fs>=0.4.0 - SQLAlchemy>=1.4.0 + SQLAlchemy>=1.4.16 tables>=3.6.1 - tabulate>=0.8.7 - xarray>=0.15.1 
+ tabulate>=0.8.9 + xarray>=0.19.0 xlrd>=2.0.1 xlwt>=1.3.0 - xlsxwriter>=1.2.2 + xlsxwriter>=1.4.3 zstandard>=0.15.2 [build_ext] inplace = True From aab95469e4c5fdc5076ab34524a4304fc55ef886 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 18 Jul 2022 18:39:32 -0400 Subject: [PATCH 014/136] simplify sql option names --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 155bb4c0ff0e5..969613f35d706 100644 --- a/setup.cfg +++ b/setup.cfg @@ -85,10 +85,10 @@ feather = hdf5 = blosc>=1.20.1 tables>=3.6.1 -sql-postgressql = +postgressql = SQLAlchemy>=1.4.16 psycopg2>=2.8.6 -ssql-mysql = +mysql = SQLAlchemy>=1.4.16 pymysql>=1.0.2 sql-other = From eb461ac392e7ed93cda190576e3eb4b02199f89e Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 18 Jul 2022 18:52:54 -0400 Subject: [PATCH 015/136] extras rename: recommended -> performance --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 969613f35d706..18ca10e680995 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,8 +57,9 @@ test = pytest-xdist>=1.31 # optional extras for recommended dependencies # see: doc/source/getting_started/install.rst -recommended = +performance = bottleneck>=1.3.2 + numba>=0.53.0 numexpr>=2.7.1 computation = numba>=0.53.0 From 1a277b5662d1ccdf15d39aaf79b4524deead68e6 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 18 Jul 2022 18:54:05 -0400 Subject: [PATCH 016/136] remove azure support is currently unofficial as of 1.5.0 --- setup.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 18ca10e680995..99a4e166d83ce 100644 --- a/setup.cfg +++ b/setup.cfg @@ -71,8 +71,6 @@ s3 = gcp = gcsfs>=2021.05.0 pandas-gbq>=0.15.0 -azure = - adlfs >=0.6.0 excel = openpyxl>=3.0.7 pyxlsb>=1.0.8 From f39af49f4ca6051fb078b5ca644bb822d8f3e87f Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 18 Jul 2022 19:25:00 -0400 Subject: [PATCH 017/136] align with 
actions-38-minimum_versions.yaml add specific installs and, where required, missing install documentation for - odfpy - pyreadstat - compression options --- doc/source/getting_started/install.rst | 1 + setup.cfg | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 5d9bfd97030b5..2d77f025296e4 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -373,6 +373,7 @@ zlib Compression for HDF5 fastparquet 0.4.0 Parquet reading / writing pyarrow 1.0.1 Parquet, ORC, and feather reading / writing pyreadstat 1.1.2 SPSS files (.sav) reading +odfpy 1.4.1 Open document format (.odf, .ods, .odt) reading / writing ========================= ================== ============================================================= .. _install.warn_orc: diff --git a/setup.cfg b/setup.cfg index 99a4e166d83ce..100c552fea86a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,6 +72,7 @@ gcp = gcsfs>=2021.05.0 pandas-gbq>=0.15.0 excel = + odfpy=1.4.1 openpyxl>=3.0.7 pyxlsb>=1.0.8 xlrd>=2.0.1 @@ -84,6 +85,8 @@ feather = hdf5 = blosc>=1.20.1 tables>=3.6.1 +spss = + pyreadstat=1.1.2 postgressql = SQLAlchemy>=1.4.16 psycopg2>=2.8.6 @@ -101,6 +104,10 @@ plot = table = jinja2>=3.0.0 tabulate>=0.8.9 +compression = + brotlipy=0.7.0 + python-snappy=0.6.0 + zstandard=0.15.2 # `all` supersets above options and individual compression libraries all = adlfs >=0.6.0 @@ -116,11 +123,13 @@ all = matplotlib>=3.3.2 numba>=0.53.0 numexpr>=2.7.1 + odfpy=1.4.1 openpyxl>=3.0.7 pandas-gbq>=0.15.0 psycopg2>=2.8.6 pyarrow>=1.0.1 pymysql>=1.0.2 + pyreadstat=1.1.2 python-snappy>=0.6.0 pyxlsb>=1.0.8 scipy >=1.7.1 From f2238be0515affb679ebdb517c8c4e6407b2898a Mon Sep 17 00:00:00 2001 From: JMBurley Date: Mon, 18 Jul 2022 19:26:30 -0400 Subject: [PATCH 018/136] Pandas -> pandas in doc Co-authored-by: Matthew Roeschke --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 729898813cf6e..ee7632229a637 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -249,7 +249,7 @@ and attributes without holding entire tree in memory (:issue:`45442`). Optional dependencies version management ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Optional Pandas dependencies can be managed as extras in a requirements/setup file, for example: +Optional pandas dependencies can be managed as extras in a requirements/setup file, for example: .. code-block:: python From 4c10b076e45dd8c70bf012a86fa23ae03af99df3 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 18 Jul 2022 19:46:51 -0400 Subject: [PATCH 019/136] extras rename: s3 -> aws see https://github.com/pandas-dev/pandas/pull/47336#discussion_r923930271 --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 100c552fea86a..5969365e225fb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,7 +65,7 @@ computation = numba>=0.53.0 scipy >=1.7.1 xarray>=0.19.0 -s3 = +aws = boto3>=1.22.7 s3fs>=0.4.0 gcp = @@ -142,6 +142,7 @@ all = xlwt>=1.3.0 xlsxwriter>=1.4.3 zstandard>=0.15.2 + [build_ext] inplace = True From ba22552f647898f9f6becde2c2d720e6608271e9 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 18 Jul 2022 19:49:09 -0400 Subject: [PATCH 020/136] extras rename: table -> output_formatting to be more general in case of future changes --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 5969365e225fb..c3e5ac04a9b64 100644 --- a/setup.cfg +++ b/setup.cfg @@ -101,7 +101,7 @@ html = lxml>=4.6.3 plot = matplotlib>=3.3.2 -table = +output_formatting = jinja2>=3.0.0 tabulate>=0.8.9 compression = From 601b3af910b506ea8de90b7b14559de4e0597bbe Mon Sep 17 00:00:00 2001 From: Jonathan Date: Mon, 18 Jul 2022 19:54:30 -0400 Subject: [PATCH 021/136] bug: `>=` not `=` --- setup.cfg | 15 
+++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/setup.cfg b/setup.cfg index c3e5ac04a9b64..06c9e1b4b181a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,7 +72,7 @@ gcp = gcsfs>=2021.05.0 pandas-gbq>=0.15.0 excel = - odfpy=1.4.1 + odfpy>=1.4.1 openpyxl>=3.0.7 pyxlsb>=1.0.8 xlrd>=2.0.1 @@ -86,7 +86,7 @@ hdf5 = blosc>=1.20.1 tables>=3.6.1 spss = - pyreadstat=1.1.2 + pyreadstat>=1.1.2 postgressql = SQLAlchemy>=1.4.16 psycopg2>=2.8.6 @@ -105,12 +105,11 @@ output_formatting = jinja2>=3.0.0 tabulate>=0.8.9 compression = - brotlipy=0.7.0 - python-snappy=0.6.0 - zstandard=0.15.2 + brotlipy>=0.7.0 + python-snappy>=0.6.0 + zstandard>=0.15.2 # `all` supersets above options and individual compression libraries all = - adlfs >=0.6.0 beautifulsoup4>=4.9.3 blosc>=1.21.0 bottleneck>=1.3.1 @@ -123,13 +122,13 @@ all = matplotlib>=3.3.2 numba>=0.53.0 numexpr>=2.7.1 - odfpy=1.4.1 + odfpy>=1.4.1 openpyxl>=3.0.7 pandas-gbq>=0.15.0 psycopg2>=2.8.6 pyarrow>=1.0.1 pymysql>=1.0.2 - pyreadstat=1.1.2 + pyreadstat>=1.1.2 python-snappy>=0.6.0 pyxlsb>=1.0.8 scipy >=1.7.1 From afda3dcfcb38270ad50b679bcbc9ac0e4e67e5c2 Mon Sep 17 00:00:00 2001 From: JMBurley Date: Tue, 19 Jul 2022 09:51:24 -0400 Subject: [PATCH 022/136] Apply suggestions from code review Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.5.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 650adab8cd1fe..6bd50e5ee1b71 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -245,7 +245,7 @@ and attributes without holding entire tree in memory (:issue:`45442`). .. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk .. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse -.. _whatsnew_150.enhancements.Optional_dependency_management: +.. 
_whatsnew_150.enhancements.optional_dependency_management: Optional dependencies version management ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -255,7 +255,7 @@ Optional pandas dependencies can be managed as extras in a requirements/setup fi pandas[recommended, s3]>=1.5.0 -Available optional dependencies are ``[all, recommended, computation, s3, gcp, azure, excel, parquet, feather,hdf5, sql-postgressql, ssql-mysql, sql-other, html, plot, table]`` (:issue:`39164`) +Available optional dependencies are ``[all, recommended, computation, s3, gcp, azure, excel, parquet, feather, hdf5, sql-postgressql, ssql-mysql, sql-other, html, plot, table]`` (:issue:`39164`) .. _whatsnew_150.enhancements.other: From ab41dfbd90432de0c724019e6d0eda4e34f3e6d5 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Tue, 19 Jul 2022 09:58:35 -0400 Subject: [PATCH 023/136] align 1.5.0.rst to latest extras_require updates --- doc/source/whatsnew/v1.5.0.rst | 4 ++-- setup.cfg | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6bd50e5ee1b71..4ccd0f037f7a8 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -248,14 +248,14 @@ and attributes without holding entire tree in memory (:issue:`45442`). .. _whatsnew_150.enhancements.optional_dependency_management: Optional dependencies version management -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Optional pandas dependencies can be managed as extras in a requirements/setup file, for example: .. 
code-block:: python pandas[recommended, s3]>=1.5.0 -Available optional dependencies are ``[all, recommended, computation, s3, gcp, azure, excel, parquet, feather, hdf5, sql-postgressql, ssql-mysql, sql-other, html, plot, table]`` (:issue:`39164`) +Available optional dependencies are ``[all, performance, computation, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql, sql-other, html, plot, output_formatting, compression, test]`` (:issue:`39164`) .. _whatsnew_150.enhancements.other: diff --git a/setup.cfg b/setup.cfg index 06c9e1b4b181a..acfdcb4c5c0b5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -87,7 +87,7 @@ hdf5 = tables>=3.6.1 spss = pyreadstat>=1.1.2 -postgressql = +postgresql = SQLAlchemy>=1.4.16 psycopg2>=2.8.6 mysql = @@ -108,7 +108,8 @@ compression = brotlipy>=0.7.0 python-snappy>=0.6.0 zstandard>=0.15.2 -# `all` supersets above options and individual compression libraries +# `all` supersets all the above options. +# Should be maintained as the complete set of pandas optional dependencies all = beautifulsoup4>=4.9.3 blosc>=1.21.0 From 9b9e7b631dd7b1498996ffa0e8805a4d94e640d1 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Tue, 19 Jul 2022 11:03:04 -0400 Subject: [PATCH 024/136] 1.5.0.rst example updated to use valid extras --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4ccd0f037f7a8..1ec1052c9dec7 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -253,7 +253,7 @@ Optional pandas dependencies can be managed as extras in a requirements/setup fi .. 
code-block:: python - pandas[recommended, s3]>=1.5.0 + pandas[performance, aws]>=1.5.0 Available optional dependencies are ``[all, performance, computation, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql, sql-other, html, plot, output_formatting, compression, test]`` (:issue:`39164`) From 096bef90e3ff6f5170c051a0e138bfa9f7d1bfef Mon Sep 17 00:00:00 2001 From: Jonathan Date: Wed, 20 Jul 2022 17:17:27 -0400 Subject: [PATCH 025/136] add optional dep mgmt instructions to install.rst --- doc/source/getting_started/install.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 2d77f025296e4..6e00cf7d89baa 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -270,6 +270,12 @@ For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while optional dependency is not installed, pandas will raise an ``ImportError`` when the method requiring that dependency is called. +Optional pandas dependencies can be managed as extras (e.g.,``pandas[performance, aws]>=1.5.0``) +in a requirements.txt, setup or pyproject.toml file. 
+Available optional dependencies are ``[all, performance, computation, aws, +gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql, sql-other, html, +plot, output_formatting, compression, test]`` + Visualization ^^^^^^^^^^^^^ From 4f7d9d178fc9222fb371305c8f817abda6b1b2a1 Mon Sep 17 00:00:00 2001 From: JMBurley Date: Thu, 21 Jul 2022 14:06:56 -0400 Subject: [PATCH 026/136] lint scipy optional import Co-authored-by: Matthew Roeschke --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index acfdcb4c5c0b5..4a1d5d03e8287 100644 --- a/setup.cfg +++ b/setup.cfg @@ -63,7 +63,7 @@ performance = numexpr>=2.7.1 computation = numba>=0.53.0 - scipy >=1.7.1 + scipy>=1.7.1 xarray>=0.19.0 aws = boto3>=1.22.7 @@ -132,7 +132,7 @@ all = pyreadstat>=1.1.2 python-snappy>=0.6.0 pyxlsb>=1.0.8 - scipy >=1.7.1 + scipy>=1.7.1 s3fs>=0.4.0 SQLAlchemy>=1.4.16 tables>=3.6.1 From cf01608e3abc39aa6b65647da786c374bbeeb13f Mon Sep 17 00:00:00 2001 From: JMBurley Date: Thu, 21 Jul 2022 14:27:02 -0400 Subject: [PATCH 027/136] Apply suggestions from code review --- setup.cfg | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.cfg b/setup.cfg index 4a1d5d03e8287..1658b8443980f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,14 +108,14 @@ compression = brotlipy>=0.7.0 python-snappy>=0.6.0 zstandard>=0.15.2 -# `all` supersets all the above options. -# Should be maintained as the complete set of pandas optional dependencies +# `all` supersets all the above options except for `test`. 
+# Should be kept as the complete set of pandas optional dependencies for general use all = beautifulsoup4>=4.9.3 blosc>=1.21.0 bottleneck>=1.3.1 boto3>=1.22.7 - brotli>=0.7.0 + brotlipy>=0.7.0 gcsfs>=2021.05.0 html5lib>=1.1 jinja2>=3.0.0 @@ -139,8 +139,8 @@ all = tabulate>=0.8.9 xarray>=0.19.0 xlrd>=2.0.1 - xlwt>=1.3.0 xlsxwriter>=1.4.3 + xlwt>=1.3.0 zstandard>=0.15.2 [build_ext] From b081521a1af3985728b12301c146e35e2ea77a3f Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 21 Jul 2022 15:02:31 -0400 Subject: [PATCH 028/136] detailed extras guidance in install.rst - updated numbas to a full recommended dependency with a promotional bullet point like bottleneck and numexpr - clarified the extra to use for each set of optional dependencies - made xml an optional extra, because is does have usage outside of read_html. --- doc/source/getting_started/install.rst | 33 ++++++++++++++++++++++---- doc/source/whatsnew/v1.5.0.rst | 2 +- setup.cfg | 2 ++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 6e00cf7d89baa..a59b6d1d0c389 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -245,6 +245,9 @@ Package Minimum support Recommended dependencies ~~~~~~~~~~~~~~~~~~~~~~~~ +pandas recommends the following optional dependencies for performance gains. Can be managed via the +``performance`` extra when defining the pandas requirement. + * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. If installed, must be Version 2.7.3 or higher. @@ -253,6 +256,10 @@ Recommended dependencies evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed, must be Version 1.3.2 or higher. +* `numba `__: Alternative execution engine for rolling operations. 
+``numba`` is a JIT compiler that translates Python functions to optimized machine code using the +LLVM compiler library. If installed, must be Version 0.53.1 or higher. + .. note:: You are highly encouraged to install these libraries, as they provide speed improvements, especially @@ -273,12 +280,14 @@ the method requiring that dependency is called. Optional pandas dependencies can be managed as extras (e.g.,``pandas[performance, aws]>=1.5.0``) in a requirements.txt, setup or pyproject.toml file. Available optional dependencies are ``[all, performance, computation, aws, -gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql, sql-other, html, +gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql, sql-other, html, xml, plot, output_formatting, compression, test]`` Visualization ^^^^^^^^^^^^^ +Can be managed with the ``plot, output_formatting`` extras, depending on the required functionality + ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= @@ -290,18 +299,20 @@ tabulate 0.8.9 Printing in Markdown-friendly forma Computation ^^^^^^^^^^^ +Can be managed with the ``computation`` extra. + ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= SciPy 1.7.1 Miscellaneous statistical functions -numba 0.53.1 Alternative execution engine for rolling operations - (see :ref:`Enhancing Performance `) xarray 0.19.0 pandas-like API for N-dimensional data ========================= ================== ============================================================= Excel files ^^^^^^^^^^^ +Can be managed with the ``excel`` extra. 
+ ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= @@ -315,6 +326,8 @@ pyxlsb 1.0.8 Reading for xlsb files HTML ^^^^ +Can be managed with the ``html`` extra. + ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= @@ -350,15 +363,19 @@ top-level :func:`~pandas.read_html` function: XML ^^^ +Can be managed with the ``xml`` extra. + ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -lxml 4.5.0 XML parser for read_xml and tree builder for to_xml +lxml 4.6.3 XML parser for read_xml and tree builder for to_xml ========================= ================== ============================================================= SQL databases ^^^^^^^^^^^^^ +Can be managed with the ``postgresql, mysql, sql-other`` extras, depending on required sql compatibility. + ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= @@ -370,6 +387,8 @@ pymysql 1.0.2 MySQL engine for sqlalchemy Other data sources ^^^^^^^^^^^^^^^^^^ +Can be managed with the ``hdf5, parquet, feather, spss, excel`` extras, depending on required compatibility. 
+ ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= @@ -400,6 +419,8 @@ odfpy 1.4.1 Open document format (.odf, .ods, . Access data in the cloud ^^^^^^^^^^^^^^^^^^^^^^^^ +Can be managed with the ``aws, gcp`` extras, depending on required compatibility. + ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= @@ -412,6 +433,8 @@ s3fs 2021.05.0 Amazon S3 access Clipboard ^^^^^^^^^ +Must be manually managed, depending on operating system. + ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= @@ -425,6 +448,8 @@ xsel Clipboard I/O on linux Compression ^^^^^^^^^^^ +Can be managed with the ``compression`` extra. 
+ ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index ce02a7b6df195..c6973e126e53a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -255,7 +255,7 @@ Optional pandas dependencies can be managed as extras in a requirements/setup fi pandas[performance, aws]>=1.5.0 -Available optional dependencies are ``[all, performance, computation, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql, sql-other, html, plot, output_formatting, compression, test]`` (:issue:`39164`) +Available optional dependencies are ``[all, performance, computation, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql, sql-other, html, xml, plot, output_formatting, compression, test]`` (:issue:`39164`) .. _whatsnew_150.enhancements.other: diff --git a/setup.cfg b/setup.cfg index 1658b8443980f..c5dcd9ddde69a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -99,6 +99,8 @@ html = beautifulsoup4>=4.9.3 html5lib>=1.1 lxml>=4.6.3 +xml = + lxml>=4.6.3 plot = matplotlib>=3.3.2 output_formatting = From 87006bbe251b755e5b07b7f3c581152afd998456 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 21 Jul 2022 15:09:58 -0400 Subject: [PATCH 029/136] _optional.py note to keep track of setup.cfg --- pandas/compat/_optional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index ad6c6fb839f10..9413b9c51817f 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -7,7 +7,7 @@ from pandas.util.version import Version -# Update install.rst when updating versions! +# Update install.rst & setup.cfg when updating versions! 
VERSIONS = { "bs4": "4.9.3", From 404b215464e67a54a7dd1826725faaa876677584 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 21 Jul 2022 15:59:39 -0400 Subject: [PATCH 030/136] bug: indent after bullet in install.rst --- doc/source/getting_started/install.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index a59b6d1d0c389..61647fd6714fa 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -257,8 +257,8 @@ pandas recommends the following optional dependencies for performance gains. Can must be Version 1.3.2 or higher. * `numba `__: Alternative execution engine for rolling operations. -``numba`` is a JIT compiler that translates Python functions to optimized machine code using the -LLVM compiler library. If installed, must be Version 0.53.1 or higher. + ``numba`` is a JIT compiler that translates Python functions to optimized machine code using the + LLVM compiler library. If installed, must be Version 0.53.1 or higher. .. note:: From e99e97de653d342ccb53712601aea71e74952f79 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 21 Jul 2022 16:46:50 -0400 Subject: [PATCH 031/136] remove numba from computation extra. 
--- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index c5dcd9ddde69a..b5dec5600b24a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -62,7 +62,6 @@ performance = numba>=0.53.0 numexpr>=2.7.1 computation = - numba>=0.53.0 scipy>=1.7.1 xarray>=0.19.0 aws = From 99f31250cdbc8e7d3b64dd00b965a29cf84b321d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 24 Aug 2022 20:32:50 +0200 Subject: [PATCH 032/136] Backport PR #48197 on branch 1.5.x (DOC: Cleanup 1.5 whatsnew) (#48228) Backport PR #48197: DOC: Cleanup 1.5 whatsnew Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v1.5.0.rst | 88 +++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index cbe40cc6a2ea2..9de855dea407d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -154,8 +154,6 @@ from_dummies Added new function :func:`~pandas.from_dummies` to convert a dummy coded :class:`DataFrame` into a categorical :class:`DataFrame`. -Example:: - .. 
ipython:: python import pandas as pd @@ -308,7 +306,7 @@ Other enhancements - :meth:`DataFrame.rolling` and :meth:`Series.rolling` now support a ``step`` parameter with fixed-length windows (:issue:`15354`) - Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`) - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`) -- :class:`Series` and :class:`DataFrame` with ``IntegerDtype`` now supports bitwise operations (:issue:`34463`) +- :class:`Series` and :class:`DataFrame` with :class:`IntegerDtype` now supports bitwise operations (:issue:`34463`) - Add ``milliseconds`` field support for :class:`.DateOffset` (:issue:`43371`) - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`) - :func:`concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`) @@ -319,7 +317,7 @@ Other enhancements - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) -- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, 
:class:`.AttributeConflictWarning`, :class:`.DatabaseError, :class:`.PossiblePrecisionLoss, :class:`.ValueLabelTypeMismatch, :class:`.InvalidColumnName, and :class:`.CategoricalConversionWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) +- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, :class:`.AttributeConflictWarning`, :class:`.DatabaseError`, :class:`.PossiblePrecisionLoss`, :class:`.ValueLabelTypeMismatch`, :class:`.InvalidColumnName`, and :class:`.CategoricalConversionWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`) - Add support for :meth:`.GroupBy.ohlc` for extension array dtypes (:issue:`37493`) - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) @@ -491,16 +489,6 @@ Calling :meth:`.DataFrameGroupBy.value_counts` with ``observed=True`` would inco Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_150.api_breaking.api_breaking1: - -api_breaking_change1 -^^^^^^^^^^^^^^^^^^^^ - -.. _whatsnew_150.api_breaking.api_breaking2: - -api_breaking_change2 -^^^^^^^^^^^^^^^^^^^^ - .. 
_whatsnew_150.api_breaking.deps: Increased minimum versions for dependencies @@ -567,7 +555,73 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | Package | Minimum Version | Changed | +=================+=================+=========+ -| | | X | +| beautifulsoup4 |4.9.3 | X | ++-----------------+-----------------+---------+ +| blosc |1.21.0 | X | ++-----------------+-----------------+---------+ +| bottleneck |1.3.2 | X | ++-----------------+-----------------+---------+ +| brotlipy |0.7.0 | | ++-----------------+-----------------+---------+ +| fastparquet |0.4.0 | | ++-----------------+-----------------+---------+ +| fsspec |2021.05.0 | X | ++-----------------+-----------------+---------+ +| html5lib |1.1 | | ++-----------------+-----------------+---------+ +| hypothesis |6.13.0 | X | ++-----------------+-----------------+---------+ +| gcsfs |2021.05.0 | X | ++-----------------+-----------------+---------+ +| jinja2 |3.0.0 | X | ++-----------------+-----------------+---------+ +| lxml |4.6.3 | X | ++-----------------+-----------------+---------+ +| matplotlib |3.3.2 | | ++-----------------+-----------------+---------+ +| numba |0.53.1 | X | ++-----------------+-----------------+---------+ +| numexpr |2.7.3 | X | ++-----------------+-----------------+---------+ +| odfpy |1.4.1 | | ++-----------------+-----------------+---------+ +| openpyxl |3.0.7 | X | ++-----------------+-----------------+---------+ +| pandas-gbq |0.15.0 | X | ++-----------------+-----------------+---------+ +| psycopg2 |2.8.6 | X | ++-----------------+-----------------+---------+ +| pyarrow |1.0.1 | | ++-----------------+-----------------+---------+ +| pymysql |1.0.2 | X | ++-----------------+-----------------+---------+ +| pyreadstat |1.1.2 | X | ++-----------------+-----------------+---------+ +| pytables |3.6.1 | | ++-----------------+-----------------+---------+ +| python-snappy |0.6.0 | | 
++-----------------+-----------------+---------+ +| pyxlsb |1.0.8 | X | ++-----------------+-----------------+---------+ +| s3fs |2021.05.0 | X | ++-----------------+-----------------+---------+ +| scipy |1.7.1 | X | ++-----------------+-----------------+---------+ +| sqlalchemy |1.4.16 | X | ++-----------------+-----------------+---------+ +| tabulate |0.8.9 | X | ++-----------------+-----------------+---------+ +| tzdata |2022a | | ++-----------------+-----------------+---------+ +| xarray |0.19.0 | X | ++-----------------+-----------------+---------+ +| xlrd |2.0.1 | | ++-----------------+-----------------+---------+ +| xlsxwriter |1.4.3 | X | ++-----------------+-----------------+---------+ +| xlwt |1.3.0 | | ++-----------------+-----------------+---------+ +| zstandard |0.15.2 | | +-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. @@ -703,7 +757,7 @@ retained by specifying ``group_keys=False``. Inplace operation when setting values with ``loc`` and ``iloc`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Most of the time setting values with ``frame.iloc`` attempts to set values +Most of the time setting values with :meth:`DataFrame.iloc` attempts to set values inplace, only falling back to inserting a new array if necessary. 
There are some cases where this rule is not followed, for example when setting an entire column from an array with different dtype: @@ -924,7 +978,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`.Categorical.view` not accepting integer dtypes (:issue:`25464`) - Bug in :meth:`.CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`) -- Bug in :meth:`DataFrame.concat` when concatenating two (or more) unordered ``CategoricalIndex`` variables, whose categories are permutations, yields incorrect index values (:issue:`24845`) +- Bug in :meth:`concat` when concatenating two (or more) unordered :class:`CategoricalIndex` variables, whose categories are permutations, yields incorrect index values (:issue:`24845`) Datetimelike ^^^^^^^^^^^^ From ef3d6b0259329f15b216dd8ffc35cc54d1271755 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Aug 2022 13:12:36 +0200 Subject: [PATCH 033/136] Backport PR #48232 on branch 1.5.x (CI: Ensure jobs run on 1.5.x branch) (#48235) Backport PR #48232: CI: Ensure jobs run on 1.5.x branch Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/32-bit-linux.yml | 2 ++ .github/workflows/code-checks.yml | 2 ++ .github/workflows/docbuild-and-upload.yml | 2 ++ .github/workflows/macos-windows.yml | 2 ++ .github/workflows/python-dev.yml | 2 ++ .github/workflows/sdist.yml | 2 ++ .github/workflows/ubuntu.yml | 2 ++ 7 files changed, 14 insertions(+) diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml index 67e99b4486a12..8c9f0b594f321 100644 --- a/.github/workflows/32-bit-linux.yml +++ b/.github/workflows/32-bit-linux.yml @@ -4,10 +4,12 @@ on: push: branches: - main + - 1.5.x - 1.4.x pull_request: branches: - main + - 1.5.x - 1.4.x paths-ignore: - "doc/**" diff --git a/.github/workflows/code-checks.yml 
b/.github/workflows/code-checks.yml index 09c603f347d4c..c9c5058fb365c 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,10 +4,12 @@ on: push: branches: - main + - 1.5.x - 1.4.x pull_request: branches: - main + - 1.5.x - 1.4.x env: diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 76855b6b9f2b9..5ad146eccb253 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,12 +4,14 @@ on: push: branches: - main + - 1.5.x - 1.4.x tags: - '*' pull_request: branches: - main + - 1.5.x - 1.4.x env: diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index e9503a2486560..9cbd41917110e 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -4,10 +4,12 @@ on: push: branches: - main + - 1.5.x - 1.4.x pull_request: branches: - main + - 1.5.x - 1.4.x paths-ignore: - "doc/**" diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 580cafd6e4949..46cb564b494f6 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -24,10 +24,12 @@ on: push: branches: - main + - 1.5.x - 1.4.x pull_request: branches: - main + - 1.5.x - 1.4.x paths-ignore: - "doc/**" diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml index 1a06ea31ccbb8..14cede7bc1a39 100644 --- a/.github/workflows/sdist.yml +++ b/.github/workflows/sdist.yml @@ -4,10 +4,12 @@ on: push: branches: - main + - 1.5.x - 1.4.x pull_request: branches: - main + - 1.5.x - 1.4.x types: [labeled, opened, synchronize, reopened] paths-ignore: diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index b8268a82d9b70..301e7804ddbd8 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -4,10 +4,12 @@ on: push: branches: - main + - 1.5.x - 1.4.x pull_request: branches: - main + - 1.5.x - 1.4.x paths-ignore: - 
"doc/**" From 5ae1898f9384319befdd99c92c2a99321209d79b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Aug 2022 22:29:37 +0200 Subject: [PATCH 034/136] Backport PR #48180 on branch 1.5.x (CI: Switch to large for circleci) (#48251) Backport PR #48180: CI: Switch to large for circleci Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0d9e3ade08846..6133037bf3b7d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,7 +4,7 @@ jobs: test-arm: machine: image: ubuntu-2004:202101-01 - resource_class: arm.medium + resource_class: arm.large environment: ENV_FILE: ci/deps/circle-38-arm64.yaml PYTEST_WORKERS: auto From 992cfcd79b7909cb270874afd3c28c96e4181dae Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Aug 2022 00:15:21 +0200 Subject: [PATCH 035/136] Backport PR #48245 on branch 1.5.x (CI: Skip test_round_sanity tests due to failures) (#48257) Backport PR #48245: CI: Skip test_round_sanity tests due to failures Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/scalar/timedelta/test_timedelta.py | 3 +-- pandas/tests/scalar/timestamp/test_unary_ops.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 0dd3a88670ece..21f32cf2d2d1e 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -14,7 +14,6 @@ iNaT, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas.compat import IS64 from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -691,7 +690,7 @@ def test_round_implementation_bounds(self): with 
pytest.raises(OverflowError, match=msg): Timedelta.max.ceil("s") - @pytest.mark.xfail(not IS64, reason="Failing on 32 bit build", strict=False) + @pytest.mark.xfail(reason="Failing on builds", strict=False) @given(val=st.integers(min_value=iNaT + 1, max_value=lib.i8max)) @pytest.mark.parametrize( "method", [Timedelta.round, Timedelta.floor, Timedelta.ceil] diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index cc11037660ad2..9c376c7a13efc 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -21,7 +21,6 @@ ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG -from pandas.compat import IS64 import pandas.util._test_decorators as td import pandas._testing as tm @@ -298,7 +297,7 @@ def test_round_implementation_bounds(self): with pytest.raises(OverflowError, match=msg): Timestamp.max.ceil("s") - @pytest.mark.xfail(not IS64, reason="Failing on 32 bit build", strict=False) + @pytest.mark.xfail(reason="Failing on builds", strict=False) @given(val=st.integers(iNaT + 1, lib.i8max)) @pytest.mark.parametrize( "method", [Timestamp.round, Timestamp.floor, Timestamp.ceil] From 9b4eb540a8c4d591b5faa43641dd6e084edd2ba5 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Aug 2022 00:17:48 +0200 Subject: [PATCH 036/136] Backport PR #48240 on branch 1.5.x (Fix mypy erroring on backport branches) (#48259) Backport PR #48240: Fix mypy erroring on backport branches Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/development/contributing_codebase.rst | 6 +++++- scripts/run_stubtest.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index bc85a54e61f22..15931a1a3eb83 100644 --- 
a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -265,7 +265,11 @@ pandas uses `mypy `_ and `pyright =1.22.0) is required for type validation. diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py index cea9665e649d6..8cf5b81ba398c 100644 --- a/scripts/run_stubtest.py +++ b/scripts/run_stubtest.py @@ -9,7 +9,7 @@ import pandas as pd # fail early if pandas is not installed -if "dev" not in getattr(pd, "__version__", ""): +if not getattr(pd, "__version__", ""): # fail on the CI, soft fail during local development warnings.warn("You need to install the development version of pandas") if pd.compat.is_ci_environment(): From 7313da5449feb6635948e2bceca348c0f4119295 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Aug 2022 14:43:17 +0200 Subject: [PATCH 037/136] Backport PR #48215 on branch 1.5.x (REGR: properly update DataFrame cache in Series.__setitem__) (#48268) Backport PR #48215: REGR: properly update DataFrame cache in Series.__setitem__ Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.4.4.rst | 1 + pandas/core/series.py | 2 +- pandas/tests/frame/indexing/test_setitem.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index deff6e194c3bd..e03e6cd41ebd3 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` setting a length-1 array like value to a single value in the DataFrame (:issue:`46268`) - Fixed regression when slicing with :meth:`DataFrame.loc` with :class:`DateOffset`-index (:issue:`46671`) - Fixed regression in setting ``None`` or non-string value into a ``string``-dtype Series using a mask (:issue:`47628`) +- Fixed regression in updating a DataFrame column through Series ``__setitem__`` (using chained 
assignment) not updating column values inplace and using too much memory (:issue:`47172`) - Fixed regression in :meth:`DataFrame.select_dtypes` returning a view on the original DataFrame (:issue:`48090`) - Fixed regression using custom Index subclasses (for example, used in xarray) with :meth:`~DataFrame.reset_index` or :meth:`Index.insert` (:issue:`47071`) - Fixed regression in :meth:`DatetimeIndex.intersection` when the :class:`DatetimeIndex` has dates crossing daylight savings time (:issue:`46702`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 579177dae827d..d2f66e9bd36e2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1169,7 +1169,7 @@ def __setitem__(self, key, value) -> None: self._set_with(key, value) if cacher_needs_updating: - self._maybe_update_cacher() + self._maybe_update_cacher(inplace=True) def _set_with_engine(self, key, value) -> None: loc = self.index.get_loc(key) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 6d2becd7a32d2..cf0ff4e3603f3 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1235,3 +1235,21 @@ def test_setitem_not_operating_inplace(self, value, set_value, indexer): view = df[:] df[indexer] = set_value tm.assert_frame_equal(view, expected) + + @td.skip_array_manager_invalid_test + def test_setitem_column_update_inplace(self, using_copy_on_write): + # https://github.com/pandas-dev/pandas/issues/47172 + + labels = [f"c{i}" for i in range(10)] + df = DataFrame({col: np.zeros(len(labels)) for col in labels}, index=labels) + values = df._mgr.blocks[0].values + + for label in df.columns: + df[label][label] = 1 + + if not using_copy_on_write: + # diagonal values all updated + assert np.all(values[np.arange(10), np.arange(10)] == 1) + else: + # original dataframe not updated + assert np.all(values[np.arange(10), np.arange(10)] == 0) From f46704cffd7cc89ee46e728cbcf440c500643a04 Mon Sep 
17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Aug 2022 00:45:39 +0200 Subject: [PATCH 038/136] Backport PR #48272 on branch 1.5.x (CI: Require s3fs greater than minumum version in builds) (#48276) Backport PR #48272: CI: Require s3fs greater than minumum version in builds Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- ci/deps/actions-310.yaml | 3 ++- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38.yaml | 3 ++- ci/deps/actions-39.yaml | 3 ++- ci/deps/circle-38-arm64.yaml | 3 ++- environment.yml | 2 +- requirements-dev.txt | 2 +- 7 files changed, 11 insertions(+), 7 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index e12be72494cc4..d986a289ad75a 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -19,6 +19,7 @@ dependencies: - pytz # optional dependencies + - aiobotocore<2.0.0 - beautifulsoup4 - blosc - bottleneck @@ -43,7 +44,7 @@ dependencies: - pyreadstat - python-snappy - pyxlsb - - s3fs + - s3fs>=2021.05.0 - scipy - sqlalchemy - tabulate diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 9c38f81de3f96..7ebf247ef1bbf 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -44,7 +44,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs + - s3fs>=2021.05.0 - scipy - sqlalchemy - tabulate diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 5b55bf7454030..81c1492964b48 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -19,6 +19,7 @@ dependencies: - pytz # optional dependencies + - aiobotocore<2.0.0 - beautifulsoup4 - blosc - bottleneck @@ -43,7 +44,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs + - s3fs>=2021.05.0 - scipy - sqlalchemy - tabulate diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 10e1d8117df87..bee89a1e91547 
100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -19,6 +19,7 @@ dependencies: - pytz # optional dependencies + - aiobotocore<2.0.0 - beautifulsoup4 - blosc - bottleneck @@ -43,7 +44,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs + - s3fs>=2021.05.0 - scipy - sqlalchemy - tabulate diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 1c614729331e2..86db3b4002813 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -19,6 +19,7 @@ dependencies: - pytz # optional dependencies + - aiobotocore<2.0.0 - beautifulsoup4 - blosc - bottleneck @@ -44,7 +45,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs + - s3fs>=2021.05.0 - scipy - sqlalchemy - tabulate diff --git a/environment.yml b/environment.yml index f1472f453b935..8e917e5803674 100644 --- a/environment.yml +++ b/environment.yml @@ -44,7 +44,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs + - s3fs>=2021.05.0 - scipy - sqlalchemy - tabulate diff --git a/requirements-dev.txt b/requirements-dev.txt index 60dd738e43ba3..c9a0ae1ec0045 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -35,7 +35,7 @@ pyreadstat tables python-snappy pyxlsb -s3fs +s3fs>=2021.05.0 scipy sqlalchemy tabulate From 97cf8e2e0dbb8c8d0542e5a1340b146c8705a698 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Aug 2022 02:49:32 +0200 Subject: [PATCH 039/136] Backport PR #48299 on branch 1.5.x (Bump s3fs to 2021.08.00) (#48305) Backport PR #48299: Bump s3fs to 2021.08.00 Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- ci/deps/actions-310.yaml | 3 +-- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38-minimum_versions.yaml | 6 +++--- ci/deps/actions-38.yaml | 3 +-- ci/deps/actions-39.yaml | 3 +-- ci/deps/circle-38-arm64.yaml | 3 +-- doc/source/getting_started/install.rst | 6 +++--- doc/source/whatsnew/v1.5.0.rst | 12 
++++++------ environment.yml | 2 +- pandas/compat/_optional.py | 6 +++--- requirements-dev.txt | 2 +- 11 files changed, 22 insertions(+), 26 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index d986a289ad75a..da3578e7191eb 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -19,7 +19,6 @@ dependencies: - pytz # optional dependencies - - aiobotocore<2.0.0 - beautifulsoup4 - blosc - bottleneck @@ -44,7 +43,7 @@ dependencies: - pyreadstat - python-snappy - pyxlsb - - s3fs>=2021.05.0 + - s3fs>=2021.08.0 - scipy - sqlalchemy - tabulate diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 7ebf247ef1bbf..29ad2669afbd2 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -44,7 +44,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs>=2021.05.0 + - s3fs>=2021.08.0 - scipy - sqlalchemy - tabulate diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index b92d8e97d6071..fd23080c2ab04 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -26,10 +26,10 @@ dependencies: - bottleneck=1.3.2 - brotlipy=0.7.0 - fastparquet=0.4.0 - - fsspec=2021.05.0 + - fsspec=2021.07.0 - html5lib=1.1 - hypothesis=6.13.0 - - gcsfs=2021.05.0 + - gcsfs=2021.07.0 - jinja2=3.0.0 - lxml=4.6.3 - matplotlib=3.3.2 @@ -45,7 +45,7 @@ dependencies: - pytables=3.6.1 - python-snappy=0.6.0 - pyxlsb=1.0.8 - - s3fs=2021.05.0 + - s3fs=2021.08.0 - scipy=1.7.1 - sqlalchemy=1.4.16 - tabulate=0.8.9 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 81c1492964b48..b478b7c900425 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -19,7 +19,6 @@ dependencies: - pytz # optional dependencies - - aiobotocore<2.0.0 - beautifulsoup4 - blosc - bottleneck @@ -44,7 +43,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - 
s3fs>=2021.05.0 + - s3fs>=2021.08.0 - scipy - sqlalchemy - tabulate diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index bee89a1e91547..a12f36ba84cca 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -19,7 +19,6 @@ dependencies: - pytz # optional dependencies - - aiobotocore<2.0.0 - beautifulsoup4 - blosc - bottleneck @@ -44,7 +43,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs>=2021.05.0 + - s3fs>=2021.08.0 - scipy - sqlalchemy - tabulate diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 86db3b4002813..2b65ece881df7 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -19,7 +19,6 @@ dependencies: - pytz # optional dependencies - - aiobotocore<2.0.0 - beautifulsoup4 - blosc - bottleneck @@ -45,7 +44,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs>=2021.05.0 + - s3fs>=2021.08.0 - scipy - sqlalchemy - tabulate diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index e24528e611c12..00251854e3ffa 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -413,10 +413,10 @@ Access data in the cloud ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -fsspec 2021.5.0 Handling files aside from simple local and HTTP -gcsfs 2021.5.0 Google Cloud Storage access +fsspec 2021.7.0 Handling files aside from simple local and HTTP +gcsfs 2021.7.0 Google Cloud Storage access pandas-gbq 0.15.0 Google Big Query access -s3fs 2021.05.0 Amazon S3 access +s3fs 2021.08.0 Amazon S3 access ========================= ================== ============================================================= Clipboard diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 
9de855dea407d..6721523b9b429 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -509,11 +509,11 @@ If installed, we now require: +-----------------+-----------------+----------+---------+ | bottleneck | 1.3.2 | | X | +-----------------+-----------------+----------+---------+ -| fsspec | 2021.05.0 | | X | +| fsspec | 2021.07.0 | | X | +-----------------+-----------------+----------+---------+ | hypothesis | 6.13.0 | | X | +-----------------+-----------------+----------+---------+ -| gcsfs | 2021.05.0 | | X | +| gcsfs | 2021.07.0 | | X | +-----------------+-----------------+----------+---------+ | jinja2 | 3.0.0 | | X | +-----------------+-----------------+----------+---------+ @@ -535,7 +535,7 @@ If installed, we now require: +-----------------+-----------------+----------+---------+ | pyxlsb | 1.0.8 | | X | +-----------------+-----------------+----------+---------+ -| s3fs | 2021.05.0 | | X | +| s3fs | 2021.08.0 | | X | +-----------------+-----------------+----------+---------+ | scipy | 1.7.1 | | X | +-----------------+-----------------+----------+---------+ @@ -565,13 +565,13 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | fastparquet |0.4.0 | | +-----------------+-----------------+---------+ -| fsspec |2021.05.0 | X | +| fsspec |2021.08.0 | X | +-----------------+-----------------+---------+ | html5lib |1.1 | | +-----------------+-----------------+---------+ | hypothesis |6.13.0 | X | +-----------------+-----------------+---------+ -| gcsfs |2021.05.0 | X | +| gcsfs |2021.08.0 | X | +-----------------+-----------------+---------+ | jinja2 |3.0.0 | X | +-----------------+-----------------+---------+ @@ -603,7 +603,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | pyxlsb |1.0.8 | X | +-----------------+-----------------+---------+ -| s3fs |2021.05.0 | X | +| s3fs 
|2021.08.0 | X | +-----------------+-----------------+---------+ | scipy |1.7.1 | X | +-----------------+-----------------+---------+ diff --git a/environment.yml b/environment.yml index 8e917e5803674..90b6694a392bf 100644 --- a/environment.yml +++ b/environment.yml @@ -44,7 +44,7 @@ dependencies: - pytables - python-snappy - pyxlsb - - s3fs>=2021.05.0 + - s3fs>=2021.08.0 - scipy - sqlalchemy - tabulate diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index c2d1927bccfff..4f4291c338dd5 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -18,10 +18,10 @@ "bottleneck": "1.3.2", "brotli": "0.7.0", "fastparquet": "0.4.0", - "fsspec": "2021.05.0", + "fsspec": "2021.07.0", "html5lib": "1.1", "hypothesis": "6.13.0", - "gcsfs": "2021.05.0", + "gcsfs": "2021.07.0", "jinja2": "3.0.0", "lxml.etree": "4.6.3", "matplotlib": "3.3.2", @@ -36,7 +36,7 @@ "pyreadstat": "1.1.2", "pytest": "6.0", "pyxlsb": "1.0.8", - "s3fs": "2021.05.0", + "s3fs": "2021.08.0", "scipy": "1.7.1", "snappy": "0.6.0", "sqlalchemy": "1.4.16", diff --git a/requirements-dev.txt b/requirements-dev.txt index c9a0ae1ec0045..39118b750fa8c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -35,7 +35,7 @@ pyreadstat tables python-snappy pyxlsb -s3fs>=2021.05.0 +s3fs>=2021.08.0 scipy sqlalchemy tabulate From 3ca5773d33d8096fc0b45f353a9a4420ff65dcee Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Aug 2022 10:06:10 +0200 Subject: [PATCH 040/136] Backport PR #48027 on branch 1.5.x (ENH: Support masks in groupby prod) (#48302) Backport PR #48027: ENH: Support masks in groupby prod Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/_libs/groupby.pyi | 6 +++-- pandas/_libs/groupby.pyx | 34 ++++++++++++++++++++++------ pandas/core/groupby/ops.py | 21 ++++++++++++----- pandas/tests/groupby/test_groupby.py | 19 ++++++++++++---- 5 
files changed, 61 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6721523b9b429..711352775400e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1165,7 +1165,7 @@ Groupby/resample/rolling - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) - Bug in :meth:`.DataFrameGroupBy.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) - Bug in :meth:`DataFrameGroupBy.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`) -- Bug in :meth:`.GroupBy.sum` and :meth:`.GroupBy.cumsum` with integer dtypes losing precision (:issue:`37493`) +- Bug in :meth:`.GroupBy.sum`, :meth:`.GroupBy.prod` and :meth:`.GroupBy.cumsum` with integer dtypes losing precision (:issue:`37493`) - Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`) - Bug in :meth:`.GroupBy.cumsum` with integer dtypes causing overflows when sum was bigger than maximum of dtype (:issue:`37493`) - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index c8e9df6cd6b38..04db0c9b90bc5 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -63,10 +63,12 @@ def group_sum( is_datetimelike: bool = ..., ) -> None: ... def group_prod( - out: np.ndarray, # floating[:, ::1] + out: np.ndarray, # int64float_t[:, ::1] counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[floating, ndim=2] + values: np.ndarray, # ndarray[int64float_t, ndim=2] labels: np.ndarray, # const intp_t[:] + mask: np.ndarray | None, + result_mask: np.ndarray | None = ..., min_count: int = ..., ) -> None: ... 
def group_var( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 563abf949dbbc..299dfdf177d91 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -682,10 +682,12 @@ def group_sum( @cython.wraparound(False) @cython.boundscheck(False) def group_prod( - floating[:, ::1] out, + int64float_t[:, ::1] out, int64_t[::1] counts, - ndarray[floating, ndim=2] values, + ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, + const uint8_t[:, ::1] mask, + uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, ) -> None: """ @@ -693,10 +695,11 @@ def group_prod( """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count - floating[:, ::1] prodx + int64float_t val, count + int64float_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = mask is not None if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -716,15 +719,32 @@ def group_prod( for j in range(K): val = values[i, j] - # not nan - if val == val: + if uses_mask: + isna_entry = mask[i, j] + elif int64float_t is float32_t or int64float_t is float64_t: + isna_entry = not val == val + else: + isna_entry = False + + if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - out[i, j] = NAN + + # else case is not possible + if uses_mask: + result_mask[i, j] = True + # Be deterministic, out was initialized as empty + out[i, j] = 0 + elif int64float_t is float32_t or int64float_t is float64_t: + out[i, j] = NAN + else: + # we only get here when < mincount which gets handled later + pass + else: out[i, j] = prodx[i, j] diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 540825b33c073..418a222a0bfa6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -159,6 +159,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: 
"sum", "ohlc", "cumsum", + "prod", } _cython_arity = {"ohlc": 4} # OHLC @@ -221,13 +222,13 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: values = ensure_float64(values) elif values.dtype.kind in ["i", "u"]: - if how in ["var", "prod", "mean"] or ( + if how in ["var", "mean"] or ( self.kind == "transform" and self.has_dropped_na ): # result may still include NaN, so we have to cast values = ensure_float64(values) - elif how in ["sum", "ohlc", "cumsum"]: + elif how in ["sum", "ohlc", "prod", "cumsum"]: # Avoid overflow during group op if values.dtype.kind == "i": values = ensure_int64(values) @@ -597,8 +598,16 @@ def _call_cython_op( min_count=min_count, is_datetimelike=is_datetimelike, ) - elif self.how == "ohlc": - func(result, counts, values, comp_ids, min_count, mask, result_mask) + elif self.how in ["ohlc", "prod"]: + func( + result, + counts, + values, + comp_ids, + min_count=min_count, + mask=mask, + result_mask=result_mask, + ) else: func(result, counts, values, comp_ids, min_count, **kwargs) else: @@ -631,8 +640,8 @@ def _call_cython_op( # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: - # Neutral value for sum is 0, so don't fill empty groups with nan - cutoff = max(0 if self.how == "sum" else 1, min_count) + # if the op keeps the int dtypes, we have to use 0 + cutoff = max(0 if self.how in ["sum", "prod"] else 1, min_count) empty_groups = counts < cutoff if empty_groups.any(): if result_mask is not None and self.uses_mask(): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1af94434ca1fa..ba39f76203623 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2847,8 +2847,8 @@ def test_single_element_list_grouping(): values, _ = next(iter(df.groupby(["a"]))) -@pytest.mark.parametrize("func", ["sum", "cumsum"]) -def test_groupby_sum_avoid_casting_to_float(func): 
+@pytest.mark.parametrize("func", ["sum", "cumsum", "prod"]) +def test_groupby_avoid_casting_to_float(func): # GH#37493 val = 922337203685477580 df = DataFrame({"a": 1, "b": [val]}) @@ -2859,12 +2859,13 @@ def test_groupby_sum_avoid_casting_to_float(func): tm.assert_frame_equal(result, expected) -def test_groupby_sum_support_mask(any_numeric_ea_dtype): +@pytest.mark.parametrize("func, val", [("sum", 3), ("prod", 2)]) +def test_groupby_sum_support_mask(any_numeric_ea_dtype, func, val): # GH#37493 df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype) - result = df.groupby("a").sum() + result = getattr(df.groupby("a"), func)() expected = DataFrame( - {"b": [3]}, + {"b": [val]}, index=Index([1], name="a", dtype=any_numeric_ea_dtype), dtype=any_numeric_ea_dtype, ) @@ -2887,6 +2888,14 @@ def test_groupby_overflow(val, dtype): expected = DataFrame({"b": [val, val * 2]}, dtype=f"{dtype}64") tm.assert_frame_equal(result, expected) + result = df.groupby("a").prod() + expected = DataFrame( + {"b": [val * val]}, + index=Index([1], name="a", dtype=f"{dtype}64"), + dtype=f"{dtype}64", + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("skipna, val", [(True, 3), (False, pd.NA)]) def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val): From 46f7167da30972c50d6eabe47a633753ecf2a017 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Aug 2022 16:11:35 +0200 Subject: [PATCH 041/136] Backport PR #47762 on branch 1.5.x (REGR: preserve reindexed array object (instead of creating new array) for concat with all-NA array) (#48309) Backport PR #47762: REGR: preserve reindexed array object (instead of creating new array) for concat with all-NA array Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.4.4.rst | 1 + pandas/core/internals/concat.py | 25 +++--- .../extension/array_with_attr/__init__.py | 6 ++ .../tests/extension/array_with_attr/array.py | 84 
+++++++++++++++++++ .../array_with_attr/test_array_with_attr.py | 33 ++++++++ 5 files changed, 139 insertions(+), 10 deletions(-) create mode 100644 pandas/tests/extension/array_with_attr/__init__.py create mode 100644 pandas/tests/extension/array_with_attr/array.py create mode 100644 pandas/tests/extension/array_with_attr/test_array_with_attr.py diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index e03e6cd41ebd3..2ce4d4b37f922 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.fillna` not working :class:`DataFrame` with :class:`MultiIndex` (:issue:`47649`) - Fixed regression in taking NULL :class:`objects` from a :class:`DataFrame` causing a segmentation violation. These NULL values are created by :meth:`numpy.empty_like` (:issue:`46848`) - Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`) +- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes (:issue:`47762`) - Fixed regression in calling bitwise numpy ufuncs (for example, ``np.bitwise_and``) on Index objects (:issue:`46769`) - Fixed regression in :func:`cut` using a ``datetime64`` IntervalIndex as bins (:issue:`46218`) - Fixed regression in :meth:`DataFrame.select_dtypes` where ``include="number"`` included :class:`BooleanDtype` (:issue:`46870`) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 77197dac3363b..0df8aa5a055b0 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -476,16 +476,21 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: return DatetimeArray(i8values, dtype=empty_dtype) elif is_1d_only_ea_dtype(empty_dtype): - empty_dtype = cast(ExtensionDtype, empty_dtype) - cls = empty_dtype.construct_array_type() - - missing_arr = 
cls._from_sequence([], dtype=empty_dtype) - ncols, nrows = self.shape - assert ncols == 1, ncols - empty_arr = -1 * np.ones((nrows,), dtype=np.intp) - return missing_arr.take( - empty_arr, allow_fill=True, fill_value=fill_value - ) + if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers: + # avoid creating new empty array if we already have an array + # with correct dtype that can be reindexed + pass + else: + empty_dtype = cast(ExtensionDtype, empty_dtype) + cls = empty_dtype.construct_array_type() + + missing_arr = cls._from_sequence([], dtype=empty_dtype) + ncols, nrows = self.shape + assert ncols == 1, ncols + empty_arr = -1 * np.ones((nrows,), dtype=np.intp) + return missing_arr.take( + empty_arr, allow_fill=True, fill_value=fill_value + ) elif isinstance(empty_dtype, ExtensionDtype): # TODO: no tests get here, a handful would if we disabled # the dt64tz special-case above (which is faster) diff --git a/pandas/tests/extension/array_with_attr/__init__.py b/pandas/tests/extension/array_with_attr/__init__.py new file mode 100644 index 0000000000000..49da6af024a31 --- /dev/null +++ b/pandas/tests/extension/array_with_attr/__init__.py @@ -0,0 +1,6 @@ +from pandas.tests.extension.array_with_attr.array import ( + FloatAttrArray, + FloatAttrDtype, +) + +__all__ = ["FloatAttrArray", "FloatAttrDtype"] diff --git a/pandas/tests/extension/array_with_attr/array.py b/pandas/tests/extension/array_with_attr/array.py new file mode 100644 index 0000000000000..d9327ca9f2f3f --- /dev/null +++ b/pandas/tests/extension/array_with_attr/array.py @@ -0,0 +1,84 @@ +""" +Test extension array that has custom attribute information (not stored on the dtype). 
+ +""" +from __future__ import annotations + +import numbers + +import numpy as np + +from pandas._typing import type_t + +from pandas.core.dtypes.base import ExtensionDtype + +import pandas as pd +from pandas.core.arrays import ExtensionArray + + +class FloatAttrDtype(ExtensionDtype): + type = float + name = "float_attr" + na_value = np.nan + + @classmethod + def construct_array_type(cls) -> type_t[FloatAttrArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return FloatAttrArray + + +class FloatAttrArray(ExtensionArray): + dtype = FloatAttrDtype() + __array_priority__ = 1000 + + def __init__(self, values, attr=None) -> None: + if not isinstance(values, np.ndarray): + raise TypeError("Need to pass a numpy array of float64 dtype as values") + if not values.dtype == "float64": + raise TypeError("Need to pass a numpy array of float64 dtype as values") + self.data = values + self.attr = attr + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + data = np.array(scalars, dtype="float64", copy=copy) + return cls(data) + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + else: + # slice, list-like, mask + item = pd.api.indexers.check_array_indexer(self, item) + return type(self)(self.data[item], self.attr) + + def __len__(self) -> int: + return len(self.data) + + def isna(self): + return np.isnan(self.data) + + def take(self, indexer, allow_fill=False, fill_value=None): + from pandas.api.extensions import take + + data = self.data + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) + return type(self)(result, self.attr) + + def copy(self): + return type(self)(self.data.copy(), self.attr) + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x.data for x in to_concat]) + attr = to_concat[0].attr if len(to_concat) else None 
+ return cls(data, attr) diff --git a/pandas/tests/extension/array_with_attr/test_array_with_attr.py b/pandas/tests/extension/array_with_attr/test_array_with_attr.py new file mode 100644 index 0000000000000..3735fe40a0d67 --- /dev/null +++ b/pandas/tests/extension/array_with_attr/test_array_with_attr.py @@ -0,0 +1,33 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension.array_with_attr import FloatAttrArray + + +def test_concat_with_all_na(): + # https://github.com/pandas-dev/pandas/pull/47762 + # ensure that attribute of the column array is preserved (when it gets + # preserved in reindexing the array) during merge/concat + arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test") + + df1 = pd.DataFrame({"col": arr, "key": [0, 1]}) + df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]}) + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]}) + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" + + df1 = pd.DataFrame({"col": arr, "key": [0, 1]}) + df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]}) + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]}) + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" + + result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1) + expected = pd.DataFrame( + {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]} + ).set_index("key") + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" From c469b59036415a56c312dc48183f096c8b4f1660 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Aug 2022 16:24:55 +0200 Subject: [PATCH 042/136] Backport PR #48246 on branch 1.5.x (REGR: iloc not possible for sparse DataFrame) (#48311) Backport PR #48246: REGR: iloc not 
possible for sparse DataFrame Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.4.4.rst | 2 +- pandas/core/internals/managers.py | 17 +++++++++++++++-- pandas/tests/indexing/test_loc.py | 8 ++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 2ce4d4b37f922..d438c5705f2df 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -33,7 +33,7 @@ Fixed regressions - Fixed regression in :meth:`DatetimeIndex.intersection` when the :class:`DatetimeIndex` has dates crossing daylight savings time (:issue:`46702`) - Fixed regression in :func:`merge` throwing an error when passing a :class:`Series` with a multi-level name (:issue:`47946`) - Fixed regression in :meth:`DataFrame.eval` creating a copy when updating inplace (:issue:`47449`) -- +- Fixed regression where getting a row using :meth:`DataFrame.iloc` with :class:`SparseDtype` would raise (:issue:`46406`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3084bcea49f05..9f4c799941afd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1085,11 +1085,20 @@ def fast_xs(self, loc: int) -> SingleBlockManager: dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) n = len(self) - if isinstance(dtype, ExtensionDtype): + + # GH#46406 + immutable_ea = isinstance(dtype, SparseDtype) + + if isinstance(dtype, ExtensionDtype) and not immutable_ea: cls = dtype.construct_array_type() result = cls._empty((n,), dtype=dtype) else: - result = np.empty(n, dtype=dtype) + # error: Argument "dtype" to "empty" has incompatible type + # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected + # "None" + result = np.empty( + n, dtype=object if immutable_ea else dtype # type: ignore[arg-type] + ) result = ensure_wrapped_if_datetimelike(result) for blk in 
self.blocks: @@ -1098,6 +1107,10 @@ def fast_xs(self, loc: int) -> SingleBlockManager: for i, rl in enumerate(blk.mgr_locs): result[rl] = blk.iget((i, loc)) + if immutable_ea: + dtype = cast(ExtensionDtype, dtype) + result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + block = new_block(result, placement=slice(0, len(result)), ndim=1) return SingleBlockManager(block, self.axes[0]) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index cf7db65015fa7..4e5571c7087e7 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1341,6 +1341,14 @@ def test_loc_getitem_sparse_series(self): expected = Series([1.0, 0.0], dtype=SparseDtype("float64", 0.0)) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("indexer", ["loc", "iloc"]) + def test_getitem_single_row_sparse_df(self, indexer): + # GH#46406 + df = DataFrame([[1.0, 0.0, 1.5], [0.0, 2.0, 0.0]], dtype=SparseDtype(float)) + result = getattr(df, indexer)[0] + expected = Series([1.0, 0.0, 1.5], dtype=SparseDtype(float), name=0) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("key_type", [iter, np.array, Series, Index]) def test_loc_getitem_iterable(self, float_frame, key_type): idx = key_type(["A", "B", "C"]) From 1196b8de6d21a0868594129ddacccb75660a7237 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Aug 2022 23:55:14 +0200 Subject: [PATCH 043/136] Backport PR #48314 on branch 1.5.x (DOC: v1.4.4 release date and tidy up release notes) (#48320) Backport PR #48314: DOC: v1.4.4 release date and tidy up release notes Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.4.4.rst | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index d438c5705f2df..56b1254d8a359 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ 
b/doc/source/whatsnew/v1.4.4.rst @@ -1,7 +1,7 @@ .. _whatsnew_144: -What's new in 1.4.4 (July ??, 2022) ------------------------------------ +What's new in 1.4.4 (August 31, 2022) +------------------------------------- These are the changes in pandas 1.4.4. See :ref:`release` for a full changelog including other versions of pandas. @@ -14,23 +14,23 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed regression in :meth:`DataFrame.fillna` not working :class:`DataFrame` with :class:`MultiIndex` (:issue:`47649`) +- Fixed regression in :meth:`DataFrame.fillna` not working on a :class:`DataFrame` with a :class:`MultiIndex` (:issue:`47649`) - Fixed regression in taking NULL :class:`objects` from a :class:`DataFrame` causing a segmentation violation. These NULL values are created by :meth:`numpy.empty_like` (:issue:`46848`) -- Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`) +- Fixed regression in :func:`concat` materializing the :class:`Index` during sorting even if the :class:`Index` was already sorted (:issue:`47501`) - Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes (:issue:`47762`) - Fixed regression in calling bitwise numpy ufuncs (for example, ``np.bitwise_and``) on Index objects (:issue:`46769`) -- Fixed regression in :func:`cut` using a ``datetime64`` IntervalIndex as bins (:issue:`46218`) +- Fixed regression in :func:`cut` when using a ``datetime64`` IntervalIndex as bins (:issue:`46218`) - Fixed regression in :meth:`DataFrame.select_dtypes` where ``include="number"`` included :class:`BooleanDtype` (:issue:`46870`) - Fixed regression in :meth:`DataFrame.loc` raising error when indexing with a ``NamedTuple`` (:issue:`48124`) - Fixed regression in :meth:`DataFrame.loc` not updating the cache correctly after values were set (:issue:`47867`) - Fixed regression in :meth:`DataFrame.loc` 
not aligning index in some cases when setting a :class:`DataFrame` (:issue:`47578`) - Fixed regression in :meth:`DataFrame.loc` setting a length-1 array like value to a single value in the DataFrame (:issue:`46268`) -- Fixed regression when slicing with :meth:`DataFrame.loc` with :class:`DateOffset`-index (:issue:`46671`) +- Fixed regression when slicing with :meth:`DataFrame.loc` with :class:`DatetimeIndex` with a :class:`.DateOffset` object for its ``freq`` (:issue:`46671`) - Fixed regression in setting ``None`` or non-string value into a ``string``-dtype Series using a mask (:issue:`47628`) - Fixed regression in updating a DataFrame column through Series ``__setitem__`` (using chained assignment) not updating column values inplace and using too much memory (:issue:`47172`) - Fixed regression in :meth:`DataFrame.select_dtypes` returning a view on the original DataFrame (:issue:`48090`) - Fixed regression using custom Index subclasses (for example, used in xarray) with :meth:`~DataFrame.reset_index` or :meth:`Index.insert` (:issue:`47071`) -- Fixed regression in :meth:`DatetimeIndex.intersection` when the :class:`DatetimeIndex` has dates crossing daylight savings time (:issue:`46702`) +- Fixed regression in :meth:`~Index.intersection` when the :class:`DatetimeIndex` has dates crossing daylight savings time (:issue:`46702`) - Fixed regression in :func:`merge` throwing an error when passing a :class:`Series` with a multi-level name (:issue:`47946`) - Fixed regression in :meth:`DataFrame.eval` creating a copy when updating inplace (:issue:`47449`) - Fixed regression where getting a row using :meth:`DataFrame.iloc` with :class:`SparseDtype` would raise (:issue:`46406`) @@ -41,10 +41,10 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`) +- The ``FutureWarning`` raised when passing 
arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`) - Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`) -- Bug in :meth:`DataFrameGroupBy.value_counts` where ``subset`` had no effect (:issue:`44267`) -- Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` where ``subset`` had no effect (:issue:`46383`) +- Bug when getting values with :meth:`DataFrame.loc` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`) - Bug in the :meth:`Series.dt.strftime` accessor return a float instead of object dtype Series for all-NaT input, which also causes a spurious deprecation warning (:issue:`45858`) .. --------------------------------------------------------------------------- @@ -54,7 +54,6 @@ Bug fixes Other ~~~~~ - The minimum version of Cython needed to compile pandas is now ``0.29.32`` (:issue:`47978`) -- .. 
--------------------------------------------------------------------------- From 9115f68ef45ae343dd42a8ed2d84ad27a7c4b56d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Aug 2022 13:22:40 +0200 Subject: [PATCH 044/136] Backport PR #48301 on branch 1.5.x (DEPR: Deprecate positional arguments in pivot) (#48326) Backport PR #48301: DEPR: Deprecate positional arguments in pivot Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/frame.py | 1 + pandas/core/reshape/pivot.py | 2 ++ pandas/tests/reshape/test_pivot.py | 20 +++++++++++--------- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 711352775400e..5471beba44486 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -911,6 +911,7 @@ Other Deprecations - Deprecated :attr:`Timedelta.freq` and :attr:`Timedelta.is_populated` (:issue:`46430`) - Deprecated :attr:`Timedelta.delta` (:issue:`46476`) - Deprecated passing arguments as positional in :meth:`DataFrame.any` and :meth:`Series.any` (:issue:`44802`) +- Deprecated passing positional arguments to :meth:`DataFrame.pivot` and :func:`pivot` except ``data`` (:issue:`30228`) - Deprecated the methods :meth:`DataFrame.mad`, :meth:`Series.mad`, and the corresponding groupby methods (:issue:`11787`) - Deprecated positional arguments to :meth:`Index.join` except for ``other``, use keyword-only arguments instead of positional arguments (:issue:`46518`) - Deprecated positional arguments to :meth:`StringMethods.rsplit` and :meth:`StringMethods.split` except for ``pat``, use keyword-only arguments instead of positional arguments (:issue:`47423`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4302b14da6418..fc2df4c1179e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8579,6 +8579,7 @@ def groupby( 
@Substitution("") @Appender(_shared_docs["pivot"]) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def pivot(self, index=None, columns=None, values=None) -> DataFrame: from pandas.core.reshape.pivot import pivot diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 867835ef7f0a3..b4a2b8d0e52f4 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -19,6 +19,7 @@ from pandas.util._decorators import ( Appender, Substitution, + deprecate_nonkeyword_arguments, ) from pandas.core.dtypes.cast import maybe_downcast_to_dtype @@ -472,6 +473,7 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) +@deprecate_nonkeyword_arguments(version=None, allowed_args=["data"]) def pivot( data: DataFrame, index: IndexLabel | None = None, diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 0322ed161c83c..30859e9fdafc0 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -480,9 +480,11 @@ def test_pivot_index_with_nan(self, method): } ) if method: - result = df.pivot("a", "b", "c") + with tm.assert_produces_warning(FutureWarning): + result = df.pivot("a", columns="b", values="c") else: - result = pd.pivot(df, "a", "b", "c") + with tm.assert_produces_warning(FutureWarning): + result = pd.pivot(df, "a", columns="b", values="c") expected = DataFrame( [ [nan, nan, 17, nan], @@ -494,7 +496,7 @@ def test_pivot_index_with_nan(self, method): columns=Index(["C1", "C2", "C3", "C4"], name="b"), ) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(df.pivot("b", "a", "c"), expected.T) + tm.assert_frame_equal(df.pivot(index="b", columns="a", values="c"), expected.T) @pytest.mark.parametrize("method", [True, False]) def test_pivot_index_with_nan_dates(self, method): @@ -510,18 +512,18 @@ def test_pivot_index_with_nan_dates(self, method): df.loc[1, "b"] = df.loc[4, "b"] = np.nan if method: - pv 
= df.pivot("a", "b", "c") + pv = df.pivot(index="a", columns="b", values="c") else: - pv = pd.pivot(df, "a", "b", "c") + pv = pd.pivot(df, index="a", columns="b", values="c") assert pv.notna().values.sum() == len(df) for _, row in df.iterrows(): assert pv.loc[row["a"], row["b"]] == row["c"] if method: - result = df.pivot("b", "a", "c") + result = df.pivot(index="b", columns="a", values="c") else: - result = pd.pivot(df, "b", "a", "c") + result = pd.pivot(df, index="b", columns="a", values="c") tm.assert_frame_equal(result, pv.T) @pytest.mark.filterwarnings("ignore:Timestamp.freq is deprecated:FutureWarning") @@ -2275,11 +2277,11 @@ def test_pivot_duplicates(self): } ) with pytest.raises(ValueError, match="duplicate entries"): - data.pivot("a", "b", "c") + data.pivot(index="a", columns="b", values="c") def test_pivot_empty(self): df = DataFrame(columns=["a", "b", "c"]) - result = df.pivot("a", "b", "c") + result = df.pivot(index="a", columns="b", values="c") expected = DataFrame() tm.assert_frame_equal(result, expected, check_names=False) From c65dc3cc7efbd4a380c630c04874495d98e7e78f Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Aug 2022 17:34:27 +0200 Subject: [PATCH 045/136] Backport PR #48214 on branch 1.5.x (WEB: Removing links to pdf version of the docs from web and docs) (#48242) Backport PR #48214: WEB: Removing links to pdf version of the docs from web and docs --- doc/source/index.rst.template | 2 +- web/pandas/index.html | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 022ff9edc1518..59280536536db 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -10,7 +10,7 @@ pandas documentation **Date**: |today| **Version**: |version| -**Download documentation**: `PDF Version `__ | `Zipped HTML `__ +**Download documentation**: `Zipped HTML `__ **Previous versions**: Documentation of 
previous pandas versions is available at `pandas.pydata.org <https://pandas.pydata.org/>`__. diff --git a/web/pandas/index.html b/web/pandas/index.html index 98f91f8d0a359..4c4c593fb76c6 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -66,7 +66,6 @@

Latest version: {{ releases[0].name }}

  • What's new in {{ releases[0].name }}
  • Release date:
    {{ releases[0].published.strftime("%b %d, %Y") }}
  • Documentation (web)
  • -
  • Documentation (pdf)
  • Download source code
  • {% endif %} @@ -100,7 +99,6 @@

    Previous versions

    {{ release.name }} ({{ release.published.strftime("%b %d, %Y") }})
    changelog | docs | - pdf | code {% endfor %} @@ -116,7 +114,6 @@

    Previous versions

    {{ release.name }} ({{ release.published.strftime("%Y-%m-%d") }})
    changelog | docs | - pdf | code {% endfor %} From 401c32b12e7d8cb11898bd945e1e3d2c78258a2f Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Aug 2022 17:38:14 +0200 Subject: [PATCH 046/136] Backport PR #48159 on branch 1.5.x (TST: Fix interchange/plotting/groupby test warnings) (#48279) Backport PR #48159: TST: Fix interchange/plotting/groupby test warnings Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/groupby/generic.py | 8 ++++-- pandas/core/groupby/groupby.py | 5 +++- pandas/core/interchange/from_dataframe.py | 2 +- pandas/plotting/_matplotlib/core.py | 3 ++- pandas/plotting/_matplotlib/hist.py | 1 - pandas/plotting/_matplotlib/misc.py | 5 ++-- pandas/tests/groupby/test_counting.py | 25 +++++++++---------- pandas/tests/groupby/test_function.py | 4 +-- pandas/tests/io/sas/test_sas7bdat.py | 2 +- .../tests/plotting/frame/test_hist_box_by.py | 22 ++++++++++------ 10 files changed, 45 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cd91e89554b67..7fe1d55ba55be 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1634,7 +1634,9 @@ def func(df): return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmax" - result = self._python_apply_general(func, self._obj_with_exclusions) + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only) return result @@ -1673,7 +1675,9 @@ def func(df): return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmin" - result = self._python_apply_general(func, self._obj_with_exclusions) + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only) return result diff --git 
a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 16ee154156616..89c9f3701a424 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1040,7 +1040,10 @@ def curried(x): return self._obj_with_exclusions result = self._python_apply_general( - curried, self._obj_with_exclusions, is_transform=is_transform + curried, + self._obj_with_exclusions, + is_transform=is_transform, + not_indexed_same=not is_transform, ) if self._selected_obj.ndim != 1 and self.axis != 1 and result.ndim != 1: diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 6e1b2de10e8e6..4602819b4834a 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -497,7 +497,7 @@ def set_nulls( null_pos = None if null_kind == ColumnNullType.USE_SENTINEL: - null_pos = data == sentinel_val + null_pos = pd.Series(data) == sentinel_val elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): assert validity, "Expected to have a validity buffer for the mask" valid_buff, valid_dtype = validity diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 7d8c7da6dd9aa..0b6e5b346062a 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -56,6 +56,7 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by +from pandas.plotting._matplotlib.misc import unpack_single_str_list from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.timeseries import ( decorate_axes, @@ -177,7 +178,7 @@ def __init__( # For `hist` plot, need to get grouped original data before `self.data` is # updated later if self.by is not None and self._kind == "hist": - self._grouped = data.groupby(self.by) + 
self._grouped = data.groupby(unpack_single_str_list(self.by)) self.kind = kind diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 3ca00ae41d587..d69f68d9e0b66 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -63,7 +63,6 @@ def __init__( MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): - # calculate bin number separately in different subplots # where subplots are created based on by argument if is_integer(self.bins): diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 4b74b067053a6..633cb63664823 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -479,7 +479,6 @@ def r(h): def unpack_single_str_list(keys): # GH 42795 - if isinstance(keys, list): - if len(keys) == 1 and isinstance(keys[0], str): - keys = keys[0] + if isinstance(keys, list) and len(keys) == 1: + keys = keys[0] return keys diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index f0a3219d0b419..7e7f1a628da6e 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -188,21 +188,20 @@ def test_ngroup_cumcount_pair(self): tm.assert_series_equal(g.ngroup(), Series(ngroupd)) tm.assert_series_equal(g.cumcount(), Series(cumcounted)) - def test_ngroup_respects_groupby_order(self): + def test_ngroup_respects_groupby_order(self, sort): np.random.seed(0) df = DataFrame({"a": np.random.choice(list("abcdef"), 100)}) - for sort_flag in (False, True): - g = df.groupby(["a"], sort=sort_flag) - df["group_id"] = -1 - df["group_index"] = -1 - - for i, (_, group) in enumerate(g): - df.loc[group.index, "group_id"] = i - for j, ind in enumerate(group.index): - df.loc[ind, "group_index"] = j - - tm.assert_series_equal(Series(df["group_id"].values), g.ngroup()) - tm.assert_series_equal(Series(df["group_index"].values), g.cumcount()) + g = df.groupby("a", sort=sort) 
+ df["group_id"] = -1 + df["group_index"] = -1 + + for i, (_, group) in enumerate(g): + df.loc[group.index, "group_id"] = i + for j, ind in enumerate(group.index): + df.loc[ind, "group_index"] = j + + tm.assert_series_equal(Series(df["group_id"].values), g.ngroup()) + tm.assert_series_equal(Series(df["group_index"].values), g.cumcount()) @pytest.mark.parametrize( "datetimelike", diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 90b29a022f801..7ba22c09cd26d 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1590,11 +1590,11 @@ def test_corrwith_with_1_axis(): tm.assert_series_equal(result, expected) -@pytest.mark.filterwarnings("ignore:The 'mad' method.*:FutureWarning") +@pytest.mark.filterwarnings("ignore:.* is deprecated:FutureWarning") def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) - gb = df.groupby(["a", "b", "c"]) + gb = df.groupby(["a", "b", "c"], group_keys=False) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 41b2e78d093ea..2b7ecbcdf9f80 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -33,7 +33,7 @@ def data_test_ix(request, dirpath): for k in range(df.shape[1]): col = df.iloc[:, k] if col.dtype == np.int64: - df.iloc[:, k] = df.iloc[:, k].astype(np.float64) + df.isetitem(k, df.iloc[:, k].astype(np.float64)) return df, test_ix diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py index e568016c858fd..999118144b58d 100644 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ b/pandas/tests/plotting/frame/test_hist_box_by.py @@ -83,7 +83,9 @@ class TestHistWithBy(TestPlotBase): ) def test_hist_plot_by_argument(self, by, column, 
titles, legends, hist_df): # GH 15079 - axes = _check_plot_works(hist_df.plot.hist, column=column, by=by) + axes = _check_plot_works( + hist_df.plot.hist, column=column, by=by, default_axes=True + ) result_titles = [ax.get_title() for ax in axes] result_legends = [ [legend.get_text() for legend in ax.get_legend().texts] for ax in axes @@ -120,7 +122,7 @@ def test_hist_plot_by_0(self, by, column, titles, legends, hist_df): df = hist_df.copy() df = df.rename(columns={"C": 0}) - axes = _check_plot_works(df.plot.hist, column=column, by=by) + axes = _check_plot_works(df.plot.hist, default_axes=True, column=column, by=by) result_titles = [ax.get_title() for ax in axes] result_legends = [ [legend.get_text() for legend in ax.get_legend().texts] for ax in axes @@ -142,7 +144,9 @@ def test_hist_plot_empty_list_string_tuple_by(self, by, column, hist_df): # GH 15079 msg = "No group keys passed" with pytest.raises(ValueError, match=msg): - _check_plot_works(hist_df.plot.hist, column=column, by=by) + _check_plot_works( + hist_df.plot.hist, default_axes=True, column=column, by=by + ) @pytest.mark.slow @pytest.mark.parametrize( @@ -274,7 +278,9 @@ class TestBoxWithBy(TestPlotBase): ) def test_box_plot_by_argument(self, by, column, titles, xticklabels, hist_df): # GH 15079 - axes = _check_plot_works(hist_df.plot.box, column=column, by=by) + axes = _check_plot_works( + hist_df.plot.box, default_axes=True, column=column, by=by + ) result_titles = [ax.get_title() for ax in axes] result_xticklabels = [ [label.get_text() for label in ax.get_xticklabels()] for ax in axes @@ -313,7 +319,7 @@ def test_box_plot_by_0(self, by, column, titles, xticklabels, hist_df): df = hist_df.copy() df = df.rename(columns={"C": 0}) - axes = _check_plot_works(df.plot.box, column=column, by=by) + axes = _check_plot_works(df.plot.box, default_axes=True, column=column, by=by) result_titles = [ax.get_title() for ax in axes] result_xticklabels = [ [label.get_text() for label in ax.get_xticklabels()] for ax in 
axes @@ -335,7 +341,7 @@ def test_box_plot_with_none_empty_list_by(self, by, column, hist_df): # GH 15079 msg = "No group keys passed" with pytest.raises(ValueError, match=msg): - _check_plot_works(hist_df.plot.box, column=column, by=by) + _check_plot_works(hist_df.plot.box, default_axes=True, column=column, by=by) @pytest.mark.slow @pytest.mark.parametrize( @@ -351,7 +357,9 @@ def test_box_plot_with_none_empty_list_by(self, by, column, hist_df): ) def test_box_plot_layout_with_by(self, by, column, layout, axes_num, hist_df): # GH 15079 - axes = _check_plot_works(hist_df.plot.box, column=column, by=by, layout=layout) + axes = _check_plot_works( + hist_df.plot.box, default_axes=True, column=column, by=by, layout=layout + ) self._check_axes_shape(axes, axes_num=axes_num, layout=layout) @pytest.mark.parametrize( From 31de945c7f05841ba8716ce782a8e77b20437c92 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 1 Sep 2022 10:56:30 +0200 Subject: [PATCH 047/136] Backport PR #48324 on branch 1.5.x (BUG: Add note in whatsnew for DataFrame.at behavior change) (#48345) Backport PR #48324: BUG: Add note in whatsnew for DataFrame.at behavior change Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/tests/indexing/test_at.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 5471beba44486..75941a3d8edef 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1081,7 +1081,7 @@ Indexing - Bug in :meth:`DataFrame.sum` min_count changes dtype if input contains NaNs (:issue:`46947`) - Bug in :class:`IntervalTree` that lead to an infinite recursion. (:issue:`46658`) - Bug in :class:`PeriodIndex` raising ``AttributeError`` when indexing on ``NA``, rather than putting ``NaT`` in its place. 
(:issue:`46673`) -- +- Bug in :meth:`DataFrame.at` would allow the modification of multiple columns (:issue:`48296`) Missing ^^^^^^^ diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 96c73b007cef3..1e502ca70189a 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.errors import InvalidIndexError + from pandas import ( CategoricalDtype, CategoricalIndex, @@ -192,6 +194,12 @@ def test_at_frame_raises_key_error2(self, indexer_al): with pytest.raises(KeyError, match="^0$"): indexer_al(df)["a", 0] + def test_at_frame_multiple_columns(self): + # GH#48296 - at shouldn't modify multiple columns + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + with pytest.raises(InvalidIndexError, match=r"slice\(None, None, None\)"): + df.at[5] = [6, 7] + def test_at_getitem_mixed_index_no_fallback(self): # GH#19860 ser = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) From 5987b63552236640eb8f2843fe96d8323e796ddd Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 1 Sep 2022 10:57:41 +0200 Subject: [PATCH 048/136] Backport PR #48336 on branch 1.5.x (DOC: Add whatsnew note for #45404) (#48341) Backport PR #48336: DOC: Add whatsnew note for #45404 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 75941a3d8edef..8671b73526f80 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1060,6 +1060,7 @@ Indexing - Bug when setting a value too large for a :class:`Series` dtype failing to coerce to a common type (:issue:`26049`, :issue:`32878`) - Bug in :meth:`loc.__setitem__` treating ``range`` keys as positional instead of label-based (:issue:`45479`) - Bug in :meth:`DataFrame.__setitem__` casting 
extension array dtypes to object when setting with a scalar key and :class:`DataFrame` as value (:issue:`46896`) +- Bug in :meth:`Series.__setitem__` when setting a scalar to a nullable pandas dtype would not raise a ``TypeError`` if the scalar could not be cast (losslessly) to the nullable type (:issue:`45404`) - Bug in :meth:`Series.__setitem__` when setting ``boolean`` dtype values containing ``NA`` incorrectly raising instead of casting to ``boolean`` dtype (:issue:`45462`) - Bug in :meth:`Series.loc` raising with boolean indexer containing ``NA`` when :class:`Index` did not match (:issue:`46551`) - Bug in :meth:`Series.__setitem__` where setting :attr:`NA` into a numeric-dtype :class:`Series` would incorrectly upcast to object-dtype rather than treating the value as ``np.nan`` (:issue:`44199`) From f0b0630e0c843caa55ce84e9b6c0234893fffddb Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Sep 2022 18:21:43 +0200 Subject: [PATCH 049/136] Backport PR #48254 on branch 1.5.x (REF: avoid FutureWarning about using deprecates loc.__setitem__ non-inplace usage) (#48353) Backport PR #48254: REF: avoid FutureWarning about using deprecates loc.__setitem__ non-inplace usage Co-authored-by: jbrockmendel --- pandas/core/generic.py | 50 +++++++++++++++++++---- pandas/tests/frame/methods/test_fillna.py | 25 +++++++++++- 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index abab32ae145bd..7b345a58bda88 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6869,14 +6869,48 @@ def fillna( if not is_dict else downcast.get(k) # type: ignore[union-attr] ) - # GH47649 - result.loc[:, k] = ( - result[k].fillna(v, limit=limit, downcast=downcast_k).values - ) - # TODO: result.loc[:, k] = result.loc[:, k].fillna( - # v, limit=limit, downcast=downcast_k - # ) - # Revert when GH45751 is fixed + + res_k = result[k].fillna(v, limit=limit, 
downcast=downcast_k) + + if not inplace: + result[k] = res_k + else: + # We can write into our existing column(s) iff dtype + # was preserved. + if isinstance(res_k, ABCSeries): + # i.e. 'k' only shows up once in self.columns + if res_k.dtype == result[k].dtype: + result.loc[:, k] = res_k + else: + # Different dtype -> no way to do inplace. + result[k] = res_k + else: + # see test_fillna_dict_inplace_nonunique_columns + locs = result.columns.get_loc(k) + if isinstance(locs, slice): + locs = np.arange(self.shape[1])[locs] + elif ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "b" + ): + locs = locs.nonzero()[0] + elif not ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "i" + ): + # Should never be reached, but let's cover our bases + raise NotImplementedError( + "Unexpected get_loc result, please report a bug at " + "https://github.com/pandas-dev/pandas" + ) + + for i, loc in enumerate(locs): + res_loc = res_k.iloc[:, i] + target = self.iloc[:, loc] + + if res_loc.dtype == target.dtype: + result.iloc[:, loc] = res_loc + else: + result.isetitem(loc, res_loc) + return result if not inplace else None elif not is_list_like(value): diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 8355502c47c61..4cf6706707569 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -19,6 +19,30 @@ class TestFillNA: + @td.skip_array_manager_not_yet_implemented + def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write): + df = DataFrame( + {"A": [np.nan] * 3, "B": [NaT, Timestamp(1), NaT], "C": [np.nan, "foo", 2]} + ) + df.columns = ["A", "A", "A"] + orig = df[:] + + df.fillna({"A": 2}, inplace=True) + # The first and third columns can be set inplace, while the second cannot. 
+ + expected = DataFrame( + {"A": [2.0] * 3, "B": [2, Timestamp(1), 2], "C": [2, "foo", 2]} + ) + expected.columns = ["A", "A", "A"] + tm.assert_frame_equal(df, expected) + + # TODO: what's the expected/desired behavior with CoW? + if not using_copy_on_write: + assert tm.shares_memory(df.iloc[:, 0], orig.iloc[:, 0]) + assert not tm.shares_memory(df.iloc[:, 1], orig.iloc[:, 1]) + if not using_copy_on_write: + assert tm.shares_memory(df.iloc[:, 2], orig.iloc[:, 2]) + @td.skip_array_manager_not_yet_implemented def test_fillna_on_column_view(self, using_copy_on_write): # GH#46149 avoid unnecessary copies @@ -287,7 +311,6 @@ def test_fillna_downcast_noop(self, frame_or_series): res3 = obj2.fillna("foo", downcast=np.dtype(np.int32)) tm.assert_equal(res3, expected) - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("columns", [["A", "A", "B"], ["A", "A"]]) def test_fillna_dictlike_value_duplicate_colnames(self, columns): # GH#43476 From e77e7c1b664862850450713280895ec0372179d1 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Sep 2022 18:21:57 +0200 Subject: [PATCH 050/136] Backport PR #48334 on branch 1.5.x (BUG: read_html(extract_links=all) with no header) (#48350) Backport PR #48334: BUG: read_html(extract_links=all) with no header Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/html.py | 7 +++++-- pandas/tests/io/test_html.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index f890ad86519df..acf98a2f83921 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -34,6 +34,7 @@ from pandas import isna from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.indexes.base import Index +from pandas.core.indexes.multi import MultiIndex from pandas.io.common import ( file_exists, @@ -1009,9 +1010,11 @@ def _parse(flavor, io, match, attrs, 
encoding, displayed_only, extract_links, ** try: df = _data_to_frame(data=table, **kwargs) # Cast MultiIndex header to an Index of tuples when extracting header - # links and replace nan with None. + # links and replace nan with None (therefore can't use mi.to_flat_index()). # This maintains consistency of selection (e.g. df.columns.str[1]) - if extract_links in ("all", "header"): + if extract_links in ("all", "header") and isinstance( + df.columns, MultiIndex + ): df.columns = Index( ((col[0], None if isna(col[1]) else col[1]) for col in df.columns), tupleize_cols=False, diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 045c22f106105..de0d4c1b49ea5 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1416,3 +1416,18 @@ def test_extract_links_bad(self, spam_data): ) with pytest.raises(ValueError, match=msg): read_html(spam_data, extract_links="incorrect") + + def test_extract_links_all_no_header(self): + # GH 48316 + data = """
    + <table> + <tr> + <td> + <a href='https://google.com'>Google.com</a> + </td> + </tr> + </table>
    + """ + result = self.read_html(data, extract_links="all")[0] + expected = DataFrame([[("Google.com", "https://google.com")]]) + tm.assert_frame_equal(result, expected) From 304cca101e40326fb5a8affada9d57262e79a271 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Sep 2022 23:11:55 +0200 Subject: [PATCH 051/136] Backport PR #48265 on branch 1.5.x (CI: Setting up ssh key to upload prod docs) (#48370) Backport PR #48265: CI: Setting up ssh key to upload prod docs Co-authored-by: Marc Garcia --- .github/workflows/docbuild-and-upload.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 5ad146eccb253..031e1ca054ba8 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -67,7 +67,7 @@ jobs: echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts - if: github.event_name == 'push' && github.ref == 'refs/heads/main' + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) - name: Copy cheatsheets into site directory run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ From 74ee2d2eef44e21b49c6061b0bc5d3e1a57ce3e2 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 5 Sep 2022 21:42:09 +0200 Subject: [PATCH 052/136] Backport PR #48381 on branch 1.5.x (CI: Pin mambaforge image) (#48401) Backport PR #48381: CI: Pin mambaforge image Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 
0bfb0e15d63fe..02c360d2f3d49 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM quay.io/condaforge/mambaforge +FROM quay.io/condaforge/mambaforge:4.13.0-1 # if you forked pandas, you can pass in your own GitHub username to use your fork # i.e. gh_username=myname From bccc06084908886e0c4e18ac6f1dcfc95e000e8b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 6 Sep 2022 22:40:21 +0200 Subject: [PATCH 053/136] Backport PR #48229 on branch 1.5.x (TST: Test Nullable int floordiv by 0) (#48413) Backport PR #48229: TST: Test Nullable int floordiv by 0 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v1.5.0.rst | 2 ++ pandas/tests/arrays/integer/test_arithmetic.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8671b73526f80..c479c59082464 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1011,6 +1011,8 @@ Time Zones Numeric ^^^^^^^ - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`) +- Bug in arithmetic operations with nullable types without :attr:`NA` values not matching the same operation with non-nullable types (:issue:`48223`) +- Bug in ``floordiv`` when dividing by ``IntegerDtype`` ``0`` would return ``0`` instead of ``inf`` (:issue:`48223`) - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`) - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`) - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. 
``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index e6a085ceb4d29..5b9780e390775 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -75,6 +75,21 @@ def test_floordiv(dtype): tm.assert_extension_array_equal(result, expected) +def test_floordiv_by_int_zero_no_mask(any_int_ea_dtype): + # GH 48223: Aligns with non-masked floordiv + # but differs from numpy + # https://github.com/pandas-dev/pandas/issues/30188#issuecomment-564452740 + ser = pd.Series([0, 1], dtype=any_int_ea_dtype) + result = 1 // ser + expected = pd.Series([np.inf, 1.0], dtype="Float64") + tm.assert_series_equal(result, expected) + + ser_non_nullable = ser.astype(ser.dtype.numpy_dtype) + result = 1 // ser_non_nullable + expected = expected.astype(np.float64) + tm.assert_series_equal(result, expected) + + def test_mod(dtype): a = pd.array([1, 2, 3, None, 5], dtype=dtype) b = pd.array([0, 1, None, 3, 4], dtype=dtype) From e6f502257bb721fc410a306d10169cf877bd7f39 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 6 Sep 2022 22:50:35 +0200 Subject: [PATCH 054/136] Backport PR #48414 on branch 1.5.x (DOC: Add deprecation to is_categorical) (#48418) Backport PR #48414: DOC: Add deprecation to is_categorical Co-authored-by: Kevin Sheppard --- pandas/core/dtypes/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 1173703386491..f5262aa7ceeaa 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -280,6 +280,9 @@ def is_categorical(arr) -> bool: """ Check whether an array-like is a Categorical instance. + .. deprecated:: 1.1.0 + Use ``is_categorical_dtype`` instead. 
+ Parameters ---------- arr : array-like From b98bdb713b94481a9e5f5b958cfeb9f96f4e5cf1 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 7 Sep 2022 09:13:52 +0200 Subject: [PATCH 055/136] Backport PR #48264 on branch 1.5.x (BUG: ArrowExtensionArray._from_* accepts pyarrow arrays) (#48422) * Backport PR #48264: BUG: ArrowExtensionArray._from_* accepts pyarrow arrays * Add missing import Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/arrays/arrow/array.py | 20 ++++-- pandas/core/tools/times.py | 25 ++++---- pandas/tests/extension/test_arrow.py | 95 ++++++++++++++++++++++++++++ pandas/tests/tools/test_to_time.py | 7 +- 4 files changed, 127 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1f7939011a1f1..cfae5b4cae681 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -224,11 +224,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): Construct a new ExtensionArray from a sequence of scalars. """ pa_dtype = to_pyarrow_type(dtype) - if isinstance(scalars, cls): - data = scalars._data + is_cls = isinstance(scalars, cls) + if is_cls or isinstance(scalars, (pa.Array, pa.ChunkedArray)): + if is_cls: + scalars = scalars._data if pa_dtype: - data = data.cast(pa_dtype) - return cls(data) + scalars = scalars.cast(pa_dtype) + return cls(scalars) else: return cls( pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True)) @@ -242,7 +244,10 @@ def _from_sequence_of_strings( Construct a new ExtensionArray from a sequence of strings. 
""" pa_type = to_pyarrow_type(dtype) - if pa.types.is_timestamp(pa_type): + if pa_type is None: + # Let pyarrow try to infer or raise + scalars = strings + elif pa.types.is_timestamp(pa_type): from pandas.core.tools.datetimes import to_datetime scalars = to_datetime(strings, errors="raise") @@ -272,8 +277,9 @@ def _from_sequence_of_strings( scalars = to_numeric(strings, errors="raise") else: - # Let pyarrow try to infer or raise - scalars = strings + raise NotImplementedError( + f"Converting strings to {pa_type} is not implemented." + ) return cls._from_sequence(scalars, dtype=pa_type, copy=copy) def __getitem__(self, item: PositionalIndexer): diff --git a/pandas/core/tools/times.py b/pandas/core/tools/times.py index 030cee3f678f4..87667921bf75a 100644 --- a/pandas/core/tools/times.py +++ b/pandas/core/tools/times.py @@ -80,17 +80,20 @@ def _convert_listlike(arg, format): format_found = False for element in arg: time_object = None - for time_format in formats: - try: - time_object = datetime.strptime(element, time_format).time() - if not format_found: - # Put the found format in front - fmt = formats.pop(formats.index(time_format)) - formats.insert(0, fmt) - format_found = True - break - except (ValueError, TypeError): - continue + try: + time_object = time.fromisoformat(element) + except (ValueError, TypeError): + for time_format in formats: + try: + time_object = datetime.strptime(element, time_format).time() + if not format_found: + # Put the found format in front + fmt = formats.pop(formats.index(time_format)) + formats.insert(0, fmt) + format_found = True + break + except (ValueError, TypeError): + continue if time_object is not None: times.append(time_object) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 43c52ef8848e2..9100b67edbe69 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -21,10 +21,13 @@ import pytest from pandas.compat import ( + is_ci_environment, + 
is_platform_windows, pa_version_under2p0, pa_version_under3p0, pa_version_under4p0, pa_version_under6p0, + pa_version_under7p0, pa_version_under8p0, pa_version_under9p0, ) @@ -35,6 +38,8 @@ pa = pytest.importorskip("pyarrow", minversion="1.0.1") +from pandas.core.arrays.arrow.array import ArrowExtensionArray + from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip @@ -222,6 +227,96 @@ def test_from_dtype(self, data, request): ) super().test_from_dtype(data) + def test_from_sequence_pa_array(self, data, request): + # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784 + # data._data = pa.ChunkedArray + if pa_version_under3p0: + request.node.add_marker( + pytest.mark.xfail( + reason="ChunkedArray has no attribute combine_chunks", + ) + ) + result = type(data)._from_sequence(data._data) + tm.assert_extension_array_equal(result, data) + assert isinstance(result._data, pa.ChunkedArray) + + result = type(data)._from_sequence(data._data.combine_chunks()) + tm.assert_extension_array_equal(result, data) + assert isinstance(result._data, pa.ChunkedArray) + + def test_from_sequence_pa_array_notimplemented(self, request): + if pa_version_under6p0: + request.node.add_marker( + pytest.mark.xfail( + raises=AttributeError, + reason="month_day_nano_interval not implemented by pyarrow.", + ) + ) + with pytest.raises(NotImplementedError, match="Converting strings to"): + ArrowExtensionArray._from_sequence_of_strings( + ["12-1"], dtype=pa.month_day_nano_interval() + ) + + def test_from_sequence_of_strings_pa_array(self, data, request): + pa_dtype = data.dtype.pyarrow_dtype + if pa_version_under3p0: + request.node.add_marker( + pytest.mark.xfail( + reason="ChunkedArray has no attribute combine_chunks", + ) + ) + elif pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]"): + request.node.add_marker( + pytest.mark.xfail( + reason="Nanosecond time parsing not supported.", + ) + ) + elif pa.types.is_duration(pa_dtype): + request.node.add_marker( + 
pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support parsing {pa_dtype}", + ) + ) + elif pa.types.is_boolean(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason="Iterating over ChunkedArray[bool] returns PyArrow scalars.", + ) + ) + elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: + if pa_version_under7p0: + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support string cast from {pa_dtype}", + ) + ) + elif is_platform_windows() and is_ci_environment(): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowInvalid, + reason=( + "TODO: Set ARROW_TIMEZONE_DATABASE environment variable " + "on CI to path to the tzdata for pyarrow." + ), + ) + ) + elif pa_version_under6p0 and pa.types.is_temporal(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support string cast from {pa_dtype}", + ) + ) + pa_array = data._data.cast(pa.string()) + result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype) + tm.assert_extension_array_equal(result, data) + + pa_array = pa_array.combine_chunks() + result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype) + tm.assert_extension_array_equal(result, data) + @pytest.mark.xfail( raises=NotImplementedError, reason="pyarrow.ChunkedArray backing is 1D." 
diff --git a/pandas/tests/tools/test_to_time.py b/pandas/tests/tools/test_to_time.py index a8316e0f3970c..c80b1e080a1d1 100644 --- a/pandas/tests/tools/test_to_time.py +++ b/pandas/tests/tools/test_to_time.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import PY311 + from pandas import Series import pandas._testing as tm from pandas.core.tools.datetimes import to_time as to_time_alias @@ -40,8 +42,9 @@ def test_parsers_time(self, time_string): def test_odd_format(self): new_string = "14.15" msg = r"Cannot convert arg \['14\.15'\] to a time" - with pytest.raises(ValueError, match=msg): - to_time(new_string) + if not PY311: + with pytest.raises(ValueError, match=msg): + to_time(new_string) assert to_time(new_string, format="%H.%M") == time(14, 15) def test_arraylike(self): From 5616243c2f20936317fa32058d1fd7a2f9d20e08 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 7 Sep 2022 12:39:51 +0200 Subject: [PATCH 056/136] Backport PR #48411 on branch 1.5.x (REGR: get_loc for ExtensionEngine not returning bool indexer for na) (#48430) Backport PR #48411: REGR: get_loc for ExtensionEngine not returning bool indexer for na Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_libs/index.pyx | 2 +- pandas/tests/indexes/test_indexing.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0cf7c4d45c634..617760c2981c4 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1061,7 +1061,7 @@ cdef class ExtensionEngine(SharedEngine): cdef ndarray _get_bool_indexer(self, val): if checknull(val): - return self.values.isna().view("uint8") + return self.values.isna() try: return self.values == val diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 739039241a31d..ab934df992d61 100644 --- a/pandas/tests/indexes/test_indexing.py 
+++ b/pandas/tests/indexes/test_indexing.py @@ -20,6 +20,7 @@ from pandas.errors import InvalidIndexError from pandas import ( + NA, DatetimeIndex, Index, IntervalIndex, @@ -221,6 +222,13 @@ def test_get_loc_generator(self, index): # MultiIndex specifically checks for generator; others for scalar index.get_loc(x for x in range(5)) + def test_get_loc_masked_duplicated_na(self): + # GH#48411 + idx = Index([1, 2, NA, NA], dtype="Int64") + result = idx.get_loc(NA) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexer: def test_get_indexer_base(self, index): @@ -253,6 +261,13 @@ def test_get_indexer_consistency(self, index): assert isinstance(indexer, np.ndarray) assert indexer.dtype == np.intp + def test_get_indexer_masked_duplicated_na(self): + # GH#48411 + idx = Index([1, 2, NA, NA], dtype="Int64") + result = idx.get_indexer_for(Index([1, NA], dtype="Int64")) + expected = np.array([0, 2, 3], dtype=result.dtype) + tm.assert_numpy_array_equal(result, expected) + class TestConvertSliceIndexer: def test_convert_almost_null_slice(self, index): From c5a598ab30d9b2d9c7a20af4efe973d18a302945 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 7 Sep 2022 22:46:06 +0200 Subject: [PATCH 057/136] Backport PR #48419 on branch 1.5.x (BUG: ensure to return writable buffer in __dataframe__ interchange for categorical column) (#48441) Backport PR #48419: BUG: ensure to return writable buffer in __dataframe__ interchange for categorical column Co-authored-by: Joris Van den Bossche --- pandas/core/interchange/column.py | 2 +- pandas/tests/interchange/test_impl.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 83f57d5bb8d3e..c9bafbfaad2d2 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -270,7 +270,7 @@ def 
_get_data_buffer( buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: - codes = self._col.values.codes + codes = self._col.values._codes buffer = PandasBuffer(codes, allow_copy=self._allow_copy) dtype = self._dtype_from_pandasdtype(codes.dtype) elif self.dtype[0] == DtypeKind.STRING: diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index b4c27ba31317b..2abe975ebcc12 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -5,6 +5,7 @@ import pytest from pandas._libs.tslibs import iNaT +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -193,3 +194,13 @@ def test_datetime(): assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT) tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) + + +@td.skip_if_np_lt("1.23") +def test_categorical_to_numpy_dlpack(): + # https://github.com/pandas-dev/pandas/issues/48393 + df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])}) + col = df.__dataframe__().get_column_by_name("A") + result = np.from_dlpack(col.get_buffers()["data"][0]) + expected = np.array([0, 1, 0], dtype="int8") + tm.assert_numpy_array_equal(result, expected) From 3fd62c450585955dc6fbad9e29b65cda4e24b025 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 8 Sep 2022 09:12:36 +0200 Subject: [PATCH 058/136] Backport PR #48444 on branch 1.5.x (CI: Pin ipython version) (#48449) Backport PR #48444: CI: Pin ipython version Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/user_guide/timeseries.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index ed7688f229ca8..474068e43a4d4 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ 
-1981,7 +1981,6 @@ frequency. Arithmetic is not allowed between ``Period`` with different ``freq`` p = pd.Period("2012-01", freq="2M") p + 2 p - 1 - @okexcept p == pd.Period("2012-01", freq="3M") From 4226f42b841c147074fbedf1d66fdf22b7927bdc Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 8 Sep 2022 09:12:57 +0200 Subject: [PATCH 059/136] Backport PR #48380 on branch 1.5.x (DOC: Clarify that objects dtype takes precedence in where) (#48445) * Backport PR #48380: DOC: Clarify that objects dtype takes precedence in where * Update generic.py Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/generic.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index c479c59082464..af0ba1bd75124 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -308,6 +308,7 @@ Other enhancements - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`) - :class:`Series` and :class:`DataFrame` with :class:`IntegerDtype` now supports bitwise operations (:issue:`34463`) - Add ``milliseconds`` field support for :class:`.DateOffset` (:issue:`43371`) +- :meth:`DataFrame.where` tries to maintain dtype of :class:`DataFrame` if fill value can be cast without loss of precision (:issue:`45582`) - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`) - :func:`concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`) - :func:`concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7b345a58bda88..20c5592071e2c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9876,6 
+9876,9 @@ def where( For further details and examples see the ``{name}`` documentation in :ref:`indexing <indexing.where_mask>`. + The dtype of the object takes precedence. The fill value is cast to + the object's dtype, if this can be done losslessly. + Examples -------- >>> s = pd.Series(range(5)) From 045e9e4d938c22e869f357dd34ccee93ef6c1784 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Sep 2022 04:25:35 -0700 Subject: [PATCH 060/136] Manual Backport PR #48427 on branch 1.5.x (BLD: Refactor Dockerfile to not install dev enviornment on base) (#48450) Backport PR #48427: BLD: Refactor Dockerfile to not install dev enviornment on base --- .github/workflows/code-checks.yml | 3 +++ Dockerfile | 22 +++++-------------- .../development/contributing_environment.rst | 10 +++++++++ doc/source/whatsnew/v1.5.0.rst | 1 + 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index c9c5058fb365c..6aff77c708378 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -153,6 +153,9 @@ jobs: - name: Build image run: docker build --pull --no-cache --tag pandas-dev-env . + - name: Show environment + run: docker run -w /home/pandas pandas-dev-env mamba run -n pandas-dev python -c "import pandas as pd; print(pd.show_versions())" + requirements-dev-text-installable: name: Test install requirements-dev.txt runs-on: ubuntu-latest diff --git a/Dockerfile b/Dockerfile index 02c360d2f3d49..9de8695b24274 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM quay.io/condaforge/mambaforge:4.13.0-1 +FROM quay.io/condaforge/mambaforge # if you forked pandas, you can pass in your own GitHub username to use your fork # i.e.
gh_username=myname @@ -10,16 +10,12 @@ ENV DEBIAN_FRONTEND=noninteractive # Configure apt and install packages RUN apt-get update \ - && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \ + && apt-get -y install --no-install-recommends apt-utils git tzdata dialog 2>&1 \ # - # Install tzdata and configure timezone (fix for tests which try to read from "/etc/localtime") - && apt-get -y install tzdata \ + # Configure timezone (fix for tests which try to read from "/etc/localtime") && ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime \ && dpkg-reconfigure -f noninteractive tzdata \ # - # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed - && apt-get -y install git iproute2 procps iproute2 lsb-release \ - # # cleanup && apt-get autoremove -y \ && apt-get clean -y \ @@ -35,18 +31,12 @@ RUN mkdir "$pandas_home" \ && git remote add upstream "https://github.com/pandas-dev/pandas.git" \ && git pull upstream main -# Because it is surprisingly difficult to activate a conda environment inside a DockerFile -# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89), -# we just update the base/root one from the 'environment.yml' file instead of creating a new one. -# # Set up environment -RUN mamba env update -n base -f "$pandas_home/environment.yml" +RUN mamba env create -f "$pandas_home/environment.yml" # Build C extensions and pandas -SHELL ["/bin/bash", "-c"] -RUN . /opt/conda/etc/profile.d/conda.sh \ - && conda activate base \ - && cd "$pandas_home" \ +SHELL ["mamba", "run", "--no-capture-output", "-n", "pandas-dev", "/bin/bash", "-c"] +RUN cd "$pandas_home" \ && export \ && python setup.py build_ext -j 4 \ && python -m pip install --no-build-isolation -e . 
diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index c881770aa7584..9e6da887671bd 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -38,6 +38,16 @@ Run Container:: # Run a container and bind your local repo to the container docker run -it -w /home/pandas --rm -v path-to-local-pandas-repo:/home/pandas pandas-yourname-env +Then a ``pandas-dev`` virtual environment will be available with all the development dependencies. + +.. code-block:: shell + + root@... :/home/pandas# conda env list + # conda environments: + # + base * /opt/conda + pandas-dev /opt/conda/envs/pandas-dev + .. note:: If you bind your local repo for the first time, you have to build the C extensions afterwards. Run the following command inside the container:: diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index af0ba1bd75124..d8a319da2065e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -332,6 +332,7 @@ Other enhancements - Added ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow user to set axis on a new object without necessarily copying the underlying data (:issue:`47932`) - :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`) - The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`) +- The ``Dockerfile`` now installs a dedicated ``pandas-dev`` virtual environment for pandas development instead of using the ``base`` environment (:issue:`48427`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.notable_bug_fixes: From 0f8bece70ecefd7878f948ed8970dd0d4d100834 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 8 Sep 2022 20:28:55 +0200 Subject: [PATCH 061/136] Backport PR #48426 on branch 1.5.x (BUG: Column.size should be a method) (#48465) Backport PR #48426: BUG: Column.size should be a method Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/interchange/column.py | 1 - pandas/tests/interchange/test_impl.py | 6 +++--- pandas/tests/interchange/test_spec_conformance.py | 8 ++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index c9bafbfaad2d2..dc24c928d1f39 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -81,7 +81,6 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: self._col = column self._allow_copy = allow_copy - @property def size(self) -> int: """ Size of the column, in elements. 
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 2abe975ebcc12..078a17510d502 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -164,14 +164,14 @@ def test_string(): df = pd.DataFrame({"A": test_str_data}) col = df.__dataframe__().get_column_by_name("A") - assert col.size == 6 + assert col.size() == 6 assert col.null_count == 1 assert col.dtype[0] == DtypeKind.STRING assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) df_sliced = df[1:] col = df_sliced.__dataframe__().get_column_by_name("A") - assert col.size == 5 + assert col.size() == 5 assert col.null_count == 1 assert col.dtype[0] == DtypeKind.STRING assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) @@ -188,7 +188,7 @@ def test_datetime(): df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]}) col = df.__dataframe__().get_column_by_name("A") - assert col.size == 2 + assert col.size() == 2 assert col.null_count == 1 assert col.dtype[0] == DtypeKind.DATETIME assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT) diff --git a/pandas/tests/interchange/test_spec_conformance.py b/pandas/tests/interchange/test_spec_conformance.py index 392402871a5fd..965938b111e86 100644 --- a/pandas/tests/interchange/test_spec_conformance.py +++ b/pandas/tests/interchange/test_spec_conformance.py @@ -27,7 +27,7 @@ def test_only_one_dtype(test_data, df_from_dict): null_count = dfX.get_column_by_name(column).null_count assert null_count == 0 assert isinstance(null_count, int) - assert dfX.get_column_by_name(column).size == column_size + assert dfX.get_column_by_name(column).size() == column_size assert dfX.get_column_by_name(column).offset == 0 @@ -52,7 +52,7 @@ def test_mixed_dtypes(df_from_dict): colX = dfX.get_column_by_name(column) assert colX.null_count == 0 assert isinstance(colX.null_count, int) - assert colX.size == 3 + assert colX.size() == 3 assert colX.offset == 0 assert colX.dtype[0] == kind 
@@ -118,14 +118,14 @@ def test_column_get_chunks(size, n_chunks, df_from_dict): dfX = df.__dataframe__() chunks = list(dfX.get_column(0).get_chunks(n_chunks)) assert len(chunks) == n_chunks - assert sum(chunk.size for chunk in chunks) == size + assert sum(chunk.size() for chunk in chunks) == size def test_get_columns(df_from_dict): df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]}) dfX = df.__dataframe__() for colX in dfX.get_columns(): - assert colX.size == 2 + assert colX.size() == 2 assert colX.num_chunks() == 1 # for meanings of dtype[0] see the spec; we cannot import the spec here as this # file is expected to be vendored *anywhere* From ba3494ce11c3296f3bae0eabbcdff0b1d401b35e Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 8 Sep 2022 23:13:58 +0200 Subject: [PATCH 062/136] Backport PR #48398 on branch 1.5.x (WARN: Avoid FutureWarnings in tests) (#48420) * Backport PR #48398: WARN: Avoid FutureWarnings in tests * Update Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Co-authored-by: Marc Garcia --- pandas/tests/frame/methods/test_fillna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 4cf6706707569..3d7e5c6823e9d 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -777,6 +777,6 @@ def test_fillna_nonconsolidated_frame(): ], columns=["i1", "i2", "i3", "f1"], ) - df_nonconsol = df.pivot("i1", "i2") + df_nonconsol = df.pivot(index="i1", columns="i2") result = df_nonconsol.fillna(0) assert result.isna().sum().sum() == 0 From 5eedd7512eb7cc524e1c6dc6c9e093835c30d2d2 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 8 Sep 2022 23:14:12 +0200 Subject: [PATCH 063/136] Backport PR #48416 on branch 1.5.x (REF: ensure to apply suffixes before concat step in merge 
code) (#48470) Backport PR #48416: REF: ensure to apply suffixes before concat step in merge code Co-authored-by: Joris Van den Bossche --- pandas/core/reshape/merge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 159ab33a8a04f..fe51230961327 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -764,8 +764,9 @@ def _reindex_and_concat( from pandas import concat + left.columns = llabels + right.columns = rlabels result = concat([left, right], axis=1, copy=copy) - result.columns = llabels.append(rlabels) return result def get_result(self, copy: bool = True) -> DataFrame: From 54c1aa26bc0618bf03d4d9b4a90c3dc426c44954 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 9 Sep 2022 11:40:09 +0200 Subject: [PATCH 064/136] Backport PR #48354 on branch 1.5.x (CI: Bump timeout to 180 minutes) (#48474) Backport PR #48354: CI: Bump timeout to 180 minutes Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/macos-windows.yml | 2 +- .github/workflows/ubuntu.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index 9cbd41917110e..8b3d69943bd9d 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -28,7 +28,7 @@ jobs: defaults: run: shell: bash -el {0} - timeout-minutes: 120 + timeout-minutes: 180 strategy: matrix: os: [macos-latest, windows-latest] diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 301e7804ddbd8..b7cddc6bb3d05 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -26,7 +26,7 @@ jobs: defaults: run: shell: bash -el {0} - timeout-minutes: 120 + timeout-minutes: 180 strategy: matrix: env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] From 
fca63e9b40e8bf949b5883f163f62ef50cb204e6 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 9 Sep 2022 22:42:22 +0200 Subject: [PATCH 065/136] Backport PR #48472 on branch 1.5.x (PERF: keep using ObjectEngine for ExtensionArrays for 1.5) (#48486) Backport PR #48472: PERF: keep using ObjectEngine for ExtensionArrays for 1.5 Co-authored-by: Joris Van den Bossche --- pandas/core/indexes/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 91c658a4cef5d..ace3df3ae97a4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5154,6 +5154,9 @@ def _get_engine_target(self) -> ArrayLike: if isinstance(vals, StringArray): # GH#45652 much more performant than ExtensionEngine return vals._ndarray + if type(self) is Index and isinstance(self._values, ExtensionArray): + # TODO(ExtensionIndex): remove special-case, just use self._values + return self._values.astype(object) return vals def _from_join_target(self, result: np.ndarray) -> ArrayLike: From ad087f506a07e8504f8b86928753b169a05edc69 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 11 Sep 2022 13:37:30 +0200 Subject: [PATCH 066/136] Backport PR #48473 on branch 1.5.x (REGR: .describe on unsigned dtypes results in object) (#48501) Backport PR #48473: REGR: .describe on unsigned dtypes results in object Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/conftest.py | 39 ++++++++++++++++++++ pandas/core/describe.py | 5 ++- pandas/tests/series/methods/test_describe.py | 22 +++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b6e1559b9f8cf..cf735bf535ddd 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1603,6 +1603,45 @@ def any_numpy_dtype(request): return request.param +@pytest.fixture( + 
params=tm.ALL_REAL_NUMPY_DTYPES + + tm.COMPLEX_DTYPES + + tm.ALL_INT_EA_DTYPES + + tm.FLOAT_EA_DTYPES +) +def any_numeric_dtype(request): + """ + Parameterized fixture for all numeric dtypes. + + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' + * complex + * 'complex64' + * 'complex128' + * 'UInt8' + * 'Int8' + * 'UInt16' + * 'Int16' + * 'UInt32' + * 'Int32' + * 'UInt64' + * 'Int64' + * 'Float32' + * 'Float64' + """ + return request.param + + # categoricals are handled separately _any_skipna_inferred_dtype = [ ("string", ["a", np.nan, "c"]), diff --git a/pandas/core/describe.py b/pandas/core/describe.py index d265a307078b9..d6546b06ec711 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -32,6 +32,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, + is_complex_dtype, is_datetime64_any_dtype, is_numeric_dtype, is_timedelta64_dtype, @@ -240,7 +241,9 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: + series.quantile(percentiles).tolist() + [series.max()] ) - return Series(d, index=stat_index, name=series.name) + # GH#48340 - always return float on non-complex numeric data + dtype = float if is_numeric_dtype(series) and not is_complex_dtype(series) else None + return Series(d, index=stat_index, name=series.name, dtype=dtype) def describe_categorical_1d( diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index e6c6016d2b3a1..d7650e2768781 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -1,5 +1,7 @@ import numpy as np +from pandas.core.dtypes.common import is_complex_dtype + from pandas import ( Period, Series, @@ -149,3 +151,23 @@ def test_datetime_is_numeric_includes_datetime(self): index=["count", "mean", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) + + def 
test_numeric_result_dtype(self, any_numeric_dtype): + # GH#48340 - describe should always return float on non-complex numeric input + ser = Series([0, 1], dtype=any_numeric_dtype) + result = ser.describe() + expected = Series( + [ + 2.0, + 0.5, + ser.std(), + 0, + 0.25, + 0.5, + 0.75, + 1.0, + ], + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + dtype="complex128" if is_complex_dtype(ser) else None, + ) + tm.assert_series_equal(result, expected) From e6a014ffa5d73bdc2de2824d3d8be6b99748f432 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 12 Sep 2022 19:13:51 +0200 Subject: [PATCH 067/136] Backport PR #48443 on branch 1.5.x (BUG: Fix pyarrow groupby tests) (#48494) * BUG: Fix pyarrow groupby tests (#48443) # Conflicts: # pandas/tests/extension/test_arrow.py * CI: Fix failing tests (#48493) Co-authored-by: jbrockmendel --- pandas/core/series.py | 5 ++- pandas/tests/extension/test_arrow.py | 60 +++++----------------------- 2 files changed, 15 insertions(+), 50 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index d2f66e9bd36e2..0e6f40564c003 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -572,7 +572,10 @@ def _set_axis(self, axis: int, labels: AnyArrayLike | list) -> None: """ labels = ensure_index(labels) - if labels._is_all_dates: + if labels._is_all_dates and not ( + type(labels) is Index and not isinstance(labels.dtype, np.dtype) + ): + # exclude e.g. 
timestamp[ns][pyarrow] dtype from this casting deep_labels = labels if isinstance(labels, CategoricalIndex): deep_labels = labels.categories diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9100b67edbe69..53d59c78b40cc 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -31,6 +31,7 @@ pa_version_under8p0, pa_version_under9p0, ) +from pandas.errors import PerformanceWarning import pandas as pd import pandas._testing as tm @@ -515,15 +516,6 @@ def test_groupby_extension_no_sort(self, data_for_grouping, request): reason=f"pyarrow doesn't support factorizing {pa_dtype}", ) ) - elif pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) super().test_groupby_extension_no_sort(data_for_grouping) def test_groupby_extension_transform(self, data_for_grouping, request): @@ -547,8 +539,7 @@ def test_groupby_extension_apply( self, data_for_grouping, groupby_apply_op, request ): pa_dtype = data_for_grouping.dtype.pyarrow_dtype - # Is there a better way to get the "series" ID for groupby_apply_op? - is_series = "series" in request.node.nodeid + # TODO: Is there a better way to get the "object" ID for groupby_apply_op? 
is_object = "object" in request.node.nodeid if pa.types.is_duration(pa_dtype): request.node.add_marker( @@ -567,14 +558,10 @@ def test_groupby_extension_apply( reason="GH 47514: _concat_datetime expects axis arg.", ) ) - elif not is_series: - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) + with tm.maybe_produces_warning( + PerformanceWarning, pa_version_under7p0, check_stacklevel=False + ): + super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) def test_in_numeric_groupby(self, data_for_grouping, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype @@ -603,17 +590,10 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request): reason=f"pyarrow doesn't support factorizing {pa_dtype}", ) ) - elif as_index is True and ( - pa.types.is_date(pa_dtype) - or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + with tm.maybe_produces_warning( + PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - super().test_groupby_extension_agg(as_index, data_for_grouping) + super().test_groupby_extension_agg(as_index, data_for_grouping) class TestBaseDtype(base.BaseDtypeTests): @@ -1443,16 +1423,7 @@ def test_diff(self, data, periods, request): @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna, request): pa_dtype = all_data.dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - elif pa.types.is_duration(pa_dtype): + if pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, @@ -1463,16 +1434,7 @@ def test_value_counts(self, 
all_data, dropna, request): def test_value_counts_with_normalize(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - elif pa.types.is_duration(pa_dtype): + if pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, From 2a510620ee613901b40bc25f47a68e37426e95db Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Sep 2022 21:14:39 +0200 Subject: [PATCH 068/136] Backport PR #48490 on branch 1.5.x (CI: Use -j1 for python-dev build to avoid flaky build error) (#48517) Backport PR #48490: CI: Use -j1 for python-dev build to avoid flaky build error Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/python-dev.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 46cb564b494f6..8b97c3821efe5 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -80,9 +80,10 @@ jobs: python -m pip install python-dateutil pytz cython hypothesis==6.52.1 pytest>=6.2.5 pytest-xdist pytest-cov pytest-asyncio>=0.17 python -m pip list + # GH 47305: Parallel build can cause flaky ImportError from pandas/_libs/tslibs - name: Build Pandas run: | - python setup.py build_ext -q -j2 + python setup.py build_ext -q -j1 python -m pip install -e . 
--no-build-isolation --no-use-pep517 - name: Build Version From b6e5132efba31747cde3ca193d8f039ff9811553 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Sep 2022 10:23:49 -0700 Subject: [PATCH 069/136] Manual Backport PR #48428 on branch 1.5.x (BUG/TST: fix a bunch of arraymanager+pyarrow tests) (#48518) Backport PR #48428: BUG/TST: fix a bunch of arraymanager+pyarrow tests Co-authored-by: jbrockmendel --- pandas/core/internals/array_manager.py | 17 +- pandas/tests/extension/test_arrow.py | 347 ++----------------------- 2 files changed, 30 insertions(+), 334 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index dcf69dfda1ae8..fd156ccfc8b31 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -297,19 +297,10 @@ def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T if obj.ndim == 2: kwargs[k] = obj[[i]] - # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no - # attribute "tz" - if hasattr(arr, "tz") and arr.tz is None: # type: ignore[union-attr] - # DatetimeArray needs to be converted to ndarray for DatetimeLikeBlock - - # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no - # attribute "_data" - arr = arr._data # type: ignore[union-attr] - elif arr.dtype.kind == "m" and not isinstance(arr, np.ndarray): - # TimedeltaArray needs to be converted to ndarray for TimedeltaBlock - - # error: "ExtensionArray" has no attribute "_data" - arr = arr._data # type: ignore[attr-defined] + if isinstance(arr.dtype, np.dtype) and not isinstance(arr, np.ndarray): + # i.e. TimedeltaArray, DatetimeArray with tz=None. Need to + # convert for the Block constructors. 
+ arr = np.asarray(arr) if self.ndim == 2: arr = ensure_block_shape(arr, 2) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 53d59c78b40cc..a5da960427fe7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -682,62 +682,9 @@ def test_view(self, data): class TestBaseMissing(base.BaseMissingTests): - def test_fillna_limit_pad(self, data_missing, using_array_manager, request): - if using_array_manager and pa.types.is_duration( - data_missing.dtype.pyarrow_dtype - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) - super().test_fillna_limit_pad(data_missing) - - def test_fillna_limit_backfill(self, data_missing, using_array_manager, request): - if using_array_manager and pa.types.is_duration( - data_missing.dtype.pyarrow_dtype - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) - super().test_fillna_limit_backfill(data_missing) - - def test_fillna_series(self, data_missing, using_array_manager, request): - if using_array_manager and pa.types.is_duration( - data_missing.dtype.pyarrow_dtype - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) - super().test_fillna_series(data_missing) - - def test_fillna_series_method( - self, data_missing, fillna_method, using_array_manager, request - ): - if using_array_manager and pa.types.is_duration( - data_missing.dtype.pyarrow_dtype - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) - super().test_fillna_series_method(data_missing, fillna_method) - - def test_fillna_frame(self, data_missing, using_array_manager, request): - if using_array_manager and pa.types.is_duration( - data_missing.dtype.pyarrow_dtype - ): - request.node.add_marker( - 
pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) - super().test_fillna_frame(data_missing) + @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning") + def test_dropna_array(self, data_missing): + super().test_dropna_array(data_missing) class TestBasePrinting(base.BasePrintingTests): @@ -947,7 +894,7 @@ def test_setitem_scalar_series(self, data, box_in_series, request): ) super().test_setitem_scalar_series(data, box_in_series) - def test_setitem_sequence(self, data, box_in_series, using_array_manager, request): + def test_setitem_sequence(self, data, box_in_series, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -955,47 +902,9 @@ def test_setitem_sequence(self, data, box_in_series, using_array_manager, reques reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif ( - using_array_manager - and pa.types.is_duration(data.dtype.pyarrow_dtype) - and box_in_series - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_sequence(data, box_in_series) - def test_setitem_sequence_mismatched_length_raises( - self, data, as_array, using_array_manager, request - ): - if using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) - super().test_setitem_sequence_mismatched_length_raises(data, as_array) - - def test_setitem_empty_indexer( - self, data, box_in_series, using_array_manager, request - ): - if ( - using_array_manager - and pa.types.is_duration(data.dtype.pyarrow_dtype) - and box_in_series - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) - super().test_setitem_empty_indexer(data, 
box_in_series) - - def test_setitem_sequence_broadcasts( - self, data, box_in_series, using_array_manager, request - ): + def test_setitem_sequence_broadcasts(self, data, box_in_series, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1003,20 +912,10 @@ def test_setitem_sequence_broadcasts( reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif ( - using_array_manager - and pa.types.is_duration(data.dtype.pyarrow_dtype) - and box_in_series - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_sequence_broadcasts(data, box_in_series) @pytest.mark.parametrize("setter", ["loc", "iloc"]) - def test_setitem_scalar(self, data, setter, using_array_manager, request): + def test_setitem_scalar(self, data, setter, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1024,15 +923,9 @@ def test_setitem_scalar(self, data, setter, using_array_manager, request): reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_scalar(data, setter) - def test_setitem_loc_scalar_mixed(self, data, using_array_manager, request): + def test_setitem_loc_scalar_mixed(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1040,15 +933,9 @@ def test_setitem_loc_scalar_mixed(self, data, using_array_manager, request): reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - 
request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_loc_scalar_mixed(data) - def test_setitem_loc_scalar_single(self, data, using_array_manager, request): + def test_setitem_loc_scalar_single(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1056,17 +943,9 @@ def test_setitem_loc_scalar_single(self, data, using_array_manager, request): reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_loc_scalar_single(data) - def test_setitem_loc_scalar_multiple_homogoneous( - self, data, using_array_manager, request - ): + def test_setitem_loc_scalar_multiple_homogoneous(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1074,15 +953,9 @@ def test_setitem_loc_scalar_multiple_homogoneous( reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_loc_scalar_multiple_homogoneous(data) - def test_setitem_iloc_scalar_mixed(self, data, using_array_manager, request): + def test_setitem_iloc_scalar_mixed(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1090,15 +963,9 @@ def test_setitem_iloc_scalar_mixed(self, data, using_array_manager, request): reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - 
elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_iloc_scalar_mixed(data) - def test_setitem_iloc_scalar_single(self, data, using_array_manager, request): + def test_setitem_iloc_scalar_single(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1106,17 +973,9 @@ def test_setitem_iloc_scalar_single(self, data, using_array_manager, request): reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_iloc_scalar_single(data) - def test_setitem_iloc_scalar_multiple_homogoneous( - self, data, using_array_manager, request - ): + def test_setitem_iloc_scalar_multiple_homogoneous(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1124,12 +983,6 @@ def test_setitem_iloc_scalar_multiple_homogoneous( reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_iloc_scalar_multiple_homogoneous(data) @pytest.mark.parametrize( @@ -1141,9 +994,7 @@ def test_setitem_iloc_scalar_multiple_homogoneous( ], ids=["numpy-array", "boolean-array", "boolean-array-na"], ) - def test_setitem_mask( - self, data, mask, box_in_series, using_array_manager, request - ): + def test_setitem_mask(self, data, mask, box_in_series, request): tz = 
getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1151,21 +1002,9 @@ def test_setitem_mask( reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif ( - using_array_manager - and pa.types.is_duration(data.dtype.pyarrow_dtype) - and box_in_series - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_mask(data, mask, box_in_series) - def test_setitem_mask_boolean_array_with_na( - self, data, box_in_series, using_array_manager, request - ): + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) unit = getattr(data.dtype.pyarrow_dtype, "unit", None) if pa_version_under2p0 and tz not in (None, "UTC") and unit == "us": @@ -1174,16 +1013,6 @@ def test_setitem_mask_boolean_array_with_na( reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif ( - using_array_manager - and pa.types.is_duration(data.dtype.pyarrow_dtype) - and box_in_series - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_mask_boolean_array_with_na(data, box_in_series) @pytest.mark.parametrize( @@ -1191,9 +1020,7 @@ def test_setitem_mask_boolean_array_with_na( [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], ids=["list", "integer-array", "numpy-array"], ) - def test_setitem_integer_array( - self, data, idx, box_in_series, using_array_manager, request - ): + def test_setitem_integer_array(self, data, idx, box_in_series, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1201,23 +1028,11 @@ def test_setitem_integer_array( reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif ( - 
using_array_manager - and pa.types.is_duration(data.dtype.pyarrow_dtype) - and box_in_series - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_integer_array(data, idx, box_in_series) @pytest.mark.parametrize("as_callable", [True, False]) @pytest.mark.parametrize("setter", ["loc", None]) - def test_setitem_mask_aligned( - self, data, as_callable, setter, using_array_manager, request - ): + def test_setitem_mask_aligned(self, data, as_callable, setter, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1225,16 +1040,10 @@ def test_setitem_mask_aligned( reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_mask_aligned(data, as_callable, setter) @pytest.mark.parametrize("setter", ["loc", None]) - def test_setitem_mask_broadcast(self, data, setter, using_array_manager, request): + def test_setitem_mask_broadcast(self, data, setter, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1242,12 +1051,6 @@ def test_setitem_mask_broadcast(self, data, setter, using_array_manager, request reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_mask_broadcast(data, setter) def test_setitem_tuple_index(self, data, request): @@ -1260,7 +1063,7 @@ def test_setitem_tuple_index(self, data, request): ) 
super().test_setitem_tuple_index(data) - def test_setitem_slice(self, data, box_in_series, using_array_manager, request): + def test_setitem_slice(self, data, box_in_series, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1268,19 +1071,9 @@ def test_setitem_slice(self, data, box_in_series, using_array_manager, request): reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif ( - using_array_manager - and pa.types.is_duration(data.dtype.pyarrow_dtype) - and box_in_series - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_slice(data, box_in_series) - def test_setitem_loc_iloc_slice(self, data, using_array_manager, request): + def test_setitem_loc_iloc_slice(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1288,12 +1081,6 @@ def test_setitem_loc_iloc_slice(self, data, using_array_manager, request): reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_loc_iloc_slice(data) def test_setitem_slice_array(self, data, request): @@ -1306,9 +1093,7 @@ def test_setitem_slice_array(self, data, request): ) super().test_setitem_slice_array(data) - def test_setitem_with_expansion_dataframe_column( - self, data, full_indexer, using_array_manager, request - ): + def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): # Is there a better way to get the full_indexer id "null_slice"? 
is_null_slice = "null_slice" in request.node.nodeid tz = getattr(data.dtype.pyarrow_dtype, "tz", None) @@ -1318,21 +1103,9 @@ def test_setitem_with_expansion_dataframe_column( reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" ) ) - elif ( - using_array_manager - and pa.types.is_duration(data.dtype.pyarrow_dtype) - and not is_null_slice - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_with_expansion_dataframe_column(data, full_indexer) - def test_setitem_with_expansion_row( - self, data, na_value, using_array_manager, request - ): + def test_setitem_with_expansion_row(self, data, na_value, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1340,15 +1113,9 @@ def test_setitem_with_expansion_row( reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_with_expansion_row(data, na_value) - def test_setitem_frame_2d_values(self, data, using_array_manager, request): + def test_setitem_frame_2d_values(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( @@ -1356,12 +1123,6 @@ def test_setitem_frame_2d_values(self, data, using_array_manager, request): reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" ) ) - elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="Checking ndim when using arraymanager with duration type" - ) - ) super().test_setitem_frame_2d_values(data) @pytest.mark.xfail(reason="GH 45419: pyarrow.ChunkedArray does not support 
views") @@ -1586,26 +1347,6 @@ def test_factorize_empty(self, data, request): ) super().test_factorize_empty(data) - def test_fillna_copy_frame(self, data_missing, request, using_array_manager): - pa_dtype = data_missing.dtype.pyarrow_dtype - if using_array_manager and pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"Checking ndim when using arraymanager with {pa_dtype}" - ) - ) - super().test_fillna_copy_frame(data_missing) - - def test_fillna_copy_series(self, data_missing, request, using_array_manager): - pa_dtype = data_missing.dtype.pyarrow_dtype - if using_array_manager and pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"Checking ndim when using arraymanager with {pa_dtype}" - ) - ) - super().test_fillna_copy_series(data_missing) - def test_shift_fill_value(self, data, request): pa_dtype = data.dtype.pyarrow_dtype tz = getattr(pa_dtype, "tz", None) @@ -1643,16 +1384,10 @@ def test_insert(self, data, request): ) super().test_insert(data) - def test_combine_first(self, data, request, using_array_manager): + def test_combine_first(self, data, request): pa_dtype = data.dtype.pyarrow_dtype tz = getattr(pa_dtype, "tz", None) - if using_array_manager and pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"Checking ndim when using arraymanager with {pa_dtype}" - ) - ) - elif pa_version_under2p0 and tz not in (None, "UTC"): + if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( pytest.mark.xfail( reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" @@ -1660,30 +1395,6 @@ def test_combine_first(self, data, request, using_array_manager): ) super().test_combine_first(data) - @pytest.mark.parametrize("frame", [True, False]) - @pytest.mark.parametrize( - "periods, indices", - [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])], - ) - def test_container_shift( - self, data, frame, periods, 
indices, request, using_array_manager - ): - pa_dtype = data.dtype.pyarrow_dtype - if ( - using_array_manager - and pa.types.is_duration(pa_dtype) - and periods in (-2, 2) - ): - request.node.add_marker( - pytest.mark.xfail( - reason=( - f"Checking ndim when using arraymanager with " - f"{pa_dtype} and periods={periods}" - ) - ) - ) - super().test_container_shift(data, frame, periods, indices) - @pytest.mark.xfail( reason="result dtype pyarrow[bool] better than expected dtype object" ) @@ -1711,15 +1422,9 @@ def test_searchsorted(self, data_for_sorting, as_series, request): ) super().test_searchsorted(data_for_sorting, as_series) - def test_where_series(self, data, na_value, as_frame, request, using_array_manager): + def test_where_series(self, data, na_value, as_frame, request): pa_dtype = data.dtype.pyarrow_dtype - if using_array_manager and pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"Checking ndim when using arraymanager with {pa_dtype}" - ) - ) - elif pa.types.is_temporal(pa_dtype): + if pa.types.is_temporal(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, From 5817209969ef9e3211e4c7acfaadbcc7c1a67a22 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 13 Sep 2022 20:07:01 +0200 Subject: [PATCH 070/136] Backport PR #48525 on branch 1.5.x (CI: Fix py311 builds different exception message) (#48529) Backport PR #48525: CI: Fix py311 builds different exception message Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/dtypes/test_concat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index f624c56b54001..772dfdfe8fb03 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -38,7 +38,9 @@ def test_concat_periodarray_2d(): result = 
_concat.concat_compat([arr[:, :2], arr[:, 2:]], axis=1) tm.assert_period_array_equal(result, arr) - msg = "all the input array dimensions for the concatenation axis must match exactly" + msg = ( + "all the input array dimensions.* for the concatenation axis must match exactly" + ) with pytest.raises(ValueError, match=msg): _concat.concat_compat([arr[:, :2], arr[:, 2:]], axis=0) From ecc8ab4aa12ce96103dcdbe6b30c7d1791b8438b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 14 Sep 2022 00:41:38 +0200 Subject: [PATCH 071/136] Backport PR #48489 on branch 1.5.x (BUG: fix test_arrow.py tests) (#48532) Backport PR #48489: BUG: fix test_arrow.py tests Co-authored-by: jbrockmendel --- pandas/core/dtypes/common.py | 8 +- pandas/core/dtypes/concat.py | 13 ++- pandas/core/internals/concat.py | 19 ++--- pandas/core/reshape/merge.py | 13 +-- pandas/io/formats/format.py | 4 +- pandas/tests/extension/test_arrow.py | 120 +-------------------------- 6 files changed, 29 insertions(+), 148 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f5262aa7ceeaa..aeb9bc7a31674 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1188,10 +1188,10 @@ def needs_i8_conversion(arr_or_dtype) -> bool: """ if arr_or_dtype is None: return False - if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): - # fastpath - dtype = arr_or_dtype - return dtype.kind in ["m", "M"] or dtype.type is Period + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype.kind in ["m", "M"] + elif isinstance(arr_or_dtype, ExtensionDtype): + return isinstance(arr_or_dtype, (PeriodDtype, DatetimeTZDtype)) try: dtype = get_dtype(arr_or_dtype) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 80efe96ae7146..28e1498c5906c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -27,7 +27,10 @@ is_dtype_equal, is_sparse, ) -from 
pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, +) from pandas.core.dtypes.generic import ( ABCCategoricalIndex, ABCExtensionArray, @@ -103,10 +106,12 @@ def is_nonempty(x) -> bool: # ea_compat_axis see GH#39574 to_concat = non_empties + dtypes = {obj.dtype for obj in to_concat} kinds = {obj.dtype.kind for obj in to_concat} - contains_datetime = any(kind in ["m", "M"] for kind in kinds) or any( - isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat - ) + contains_datetime = any( + isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in ["m", "M"] + for dtype in dtypes + ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 0df8aa5a055b0..dafc437d5880e 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -29,7 +29,6 @@ ) from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_scalar, needs_i8_conversion, @@ -38,7 +37,10 @@ cast_to_common_type, concat_compat, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, +) from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -147,16 +149,6 @@ def concat_arrays(to_concat: list) -> ArrayLike: else: target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) - if target_dtype.kind in ["m", "M"]: - # for datetimelike use DatetimeArray/TimedeltaArray concatenation - # don't use arr.astype(target_dtype, copy=False), because that doesn't - # work for DatetimeArray/TimedeltaArray (returns ndarray) - to_concat = [ - arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr - for arr in to_concat - ] - return 
type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=0) - to_concat = [ arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) @@ -471,7 +463,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if len(values) and values[0] is None: fill_value = None - if is_datetime64tz_dtype(empty_dtype): + if isinstance(empty_dtype, DatetimeTZDtype): + # NB: exclude e.g. pyarrow[dt64tz] dtypes i8values = np.full(self.shape, fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index fe51230961327..995a31720c4ad 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -50,7 +50,6 @@ is_bool, is_bool_dtype, is_categorical_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_float_dtype, @@ -62,6 +61,7 @@ is_object_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -1349,12 +1349,12 @@ def _maybe_coerce_merge_keys(self) -> None: raise ValueError(msg) elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype): raise ValueError(msg) - elif is_datetime64tz_dtype(lk.dtype) and not is_datetime64tz_dtype( - rk.dtype + elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance( + rk.dtype, DatetimeTZDtype ): raise ValueError(msg) - elif not is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype( - rk.dtype + elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance( + rk.dtype, DatetimeTZDtype ): raise ValueError(msg) @@ -2280,9 +2280,10 @@ def _factorize_keys( rk = extract_array(rk, extract_numpy=True, extract_range=True) # TODO: if either is a RangeIndex, we can likely factorize more efficiently? 
- if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): + if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype): # Extract the ndarray (UTC-localized) values # Note: we dont need the dtypes to match, as these can still be compared + # TODO(non-nano): need to make sure resolutions match lk = cast("DatetimeArray", lk)._ndarray rk = cast("DatetimeArray", rk)._ndarray diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 27094fff5f812..ff631a95e6846 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -68,7 +68,6 @@ is_categorical_dtype, is_complex_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, is_float, is_float_dtype, @@ -79,6 +78,7 @@ is_scalar, is_timedelta64_dtype, ) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import ( isna, notna, @@ -1290,7 +1290,7 @@ def format_array( fmt_klass: type[GenericArrayFormatter] if is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter - elif is_datetime64tz_dtype(values.dtype): + elif isinstance(values.dtype, DatetimeTZDtype): fmt_klass = Datetime64TZFormatter elif is_timedelta64_dtype(values.dtype): fmt_klass = Timedelta64Formatter diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a5da960427fe7..ce30e3e92a4c9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -539,8 +539,6 @@ def test_groupby_extension_apply( self, data_for_grouping, groupby_apply_op, request ): pa_dtype = data_for_grouping.dtype.pyarrow_dtype - # TODO: Is there a better way to get the "object" ID for groupby_apply_op? 
- is_object = "object" in request.node.nodeid if pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( @@ -548,16 +546,6 @@ def test_groupby_extension_apply( reason=f"pyarrow doesn't support factorizing {pa_dtype}", ) ) - elif pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - if is_object: - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason="GH 47514: _concat_datetime expects axis arg.", - ) - ) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): @@ -688,70 +676,10 @@ def test_dropna_array(self, data_missing): class TestBasePrinting(base.BasePrintingTests): - def test_series_repr(self, data, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_date(pa_dtype) - or pa.types.is_duration(pa_dtype) - or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) - ): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason="GH 47514: _concat_datetime expects axis arg.", - ) - ) - super().test_series_repr(data) - - def test_dataframe_repr(self, data, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_date(pa_dtype) - or pa.types.is_duration(pa_dtype) - or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) - ): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason="GH 47514: _concat_datetime expects axis arg.", - ) - ) - super().test_dataframe_repr(data) + pass class TestBaseReshaping(base.BaseReshapingTests): - @pytest.mark.parametrize("in_frame", [True, False]) - def test_concat(self, data, in_frame, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_date(pa_dtype) - or pa.types.is_duration(pa_dtype) - or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) - ): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason="GH 47514: _concat_datetime expects axis arg.", - ) - ) - super().test_concat(data, 
in_frame) - - @pytest.mark.parametrize("in_frame", [True, False]) - def test_concat_all_na_block(self, data_missing, in_frame, request): - pa_dtype = data_missing.dtype.pyarrow_dtype - if ( - pa.types.is_date(pa_dtype) - or pa.types.is_duration(pa_dtype) - or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) - ): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason="GH 47514: _concat_datetime expects axis arg.", - ) - ) - super().test_concat_all_na_block(data_missing, in_frame) - def test_concat_columns(self, data, na_value, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): @@ -772,26 +700,6 @@ def test_concat_extension_arrays_copy_false(self, data, na_value, request): ) super().test_concat_extension_arrays_copy_false(data, na_value) - def test_concat_with_reindex(self, data, request, using_array_manager): - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason="GH 47514: _concat_datetime expects axis arg.", - ) - ) - elif pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError if not using_array_manager else TypeError, - reason="GH 34986", - ) - ) - super().test_concat_with_reindex(data) - def test_align(self, data, na_value, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): @@ -832,32 +740,6 @@ def test_merge(self, data, na_value, request): ) super().test_merge(data, na_value) - def test_merge_on_extension_array(self, data, request): - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - 
super().test_merge_on_extension_array(data) - - def test_merge_on_extension_array_duplicates(self, data, request): - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - super().test_merge_on_extension_array_duplicates(data) - def test_ravel(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): From e020bb616498cffae26fdf682dee376a24323a24 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 14 Sep 2022 19:30:31 +0200 Subject: [PATCH 072/136] Backport PR #48543 on branch 1.5.x (DOC: Update footer and include OVH) (#48548) Backport PR #48543: DOC: Update footer and include OVH Co-authored-by: Marc Garcia --- doc/source/conf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 84958595f6618..3225582269b64 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -163,7 +163,11 @@ # General information about the project. project = "pandas" -copyright = f"2008-{datetime.now().year}, the pandas development team" +copyright = ( + f"{datetime.now().year} " + 'pandas via NumFOCUS, Inc. 
' + 'Hosted by OVH Cloud' +) # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the From e5ab0abd0c3fc537c9a22537490c906d000ca851 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Sep 2022 17:03:57 -0700 Subject: [PATCH 073/136] Manual Backport PR #48417 on branch 1.5.x (Revert set_index inplace and copy keyword changes) (#48552) Backport PR #48417: Revert set_index inplace and copy keyword changes Co-authored-by: Joris Van den Bossche --- doc/source/user_guide/indexing.rst | 5 ++- doc/source/whatsnew/v1.5.0.rst | 2 - pandas/core/frame.py | 37 ++----------------- pandas/core/reshape/merge.py | 19 ++++------ pandas/io/parsers/arrow_parser_wrapper.py | 2 +- pandas/io/pytables.py | 2 +- pandas/io/sql.py | 6 +-- .../tests/frame/methods/test_combine_first.py | 4 +- pandas/tests/frame/methods/test_set_index.py | 26 +------------ pandas/tests/frame/test_api.py | 3 +- pandas/tests/frame/test_query_eval.py | 18 +++------ pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_function.py | 4 +- pandas/tests/indexes/multi/test_reshape.py | 4 +- .../indexing/multiindex/test_indexing_slow.py | 7 +--- .../indexing/multiindex/test_multiindex.py | 2 +- pandas/tests/io/pytables/test_append.py | 4 +- pandas/tests/io/test_sql.py | 4 +- pandas/tests/plotting/frame/test_frame.py | 4 +- pandas/tests/reshape/merge/test_join.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/reshape/merge/test_multi.py | 4 +- pandas/tests/series/indexing/test_indexing.py | 7 +--- pandas/tests/window/test_rolling.py | 2 +- 24 files changed, 51 insertions(+), 121 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 92729a16c6a30..f939945fc6cda 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1723,12 +1723,13 @@ the given 
columns to a MultiIndex: frame Other options in ``set_index`` allow you not drop the index columns or to add -the index without creating a copy of the underlying data: +the index in-place (without creating a new object): .. ipython:: python data.set_index('c', drop=False) - data.set_index(['a', 'b'], copy=False) + data.set_index(['a', 'b'], inplace=True) + data Reset the index ~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d8a319da2065e..4d717aa45ccea 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -330,7 +330,6 @@ Other enhancements - :meth:`DataFrame.quantile` gained a ``method`` argument that can accept ``table`` to evaluate multi-column quantiles (:issue:`43881`) - :class:`Interval` now supports checking whether one interval is contained by another interval (:issue:`46613`) - Added ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow user to set axis on a new object without necessarily copying the underlying data (:issue:`47932`) -- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. 
If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`) - The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`) - The ``Dockerfile`` now installs a dedicated ``pandas-dev`` virtual environment for pandas development instead of using the ``base`` environment (:issue:`48427`) @@ -934,7 +933,6 @@ Other Deprecations - Deprecated the ``inplace`` keyword in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis`, use ``obj = obj.set_axis(..., copy=False)`` instead (:issue:`48130`) - Deprecated producing a single element when iterating over a :class:`DataFrameGroupBy` or a :class:`SeriesGroupBy` that has been grouped by a list of length 1; A tuple of length one will be returned instead (:issue:`42795`) - Fixed up warning message of deprecation of :meth:`MultiIndex.lesort_depth` as public method, as the message previously referred to :meth:`MultiIndex.is_lexsorted` instead (:issue:`38701`) -- Deprecated the ``inplace`` keyword in :meth:`DataFrame.set_index`, use ``df = df.set_index(..., copy=False)`` instead (:issue:`48115`) - Deprecated the ``sort_columns`` argument in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`). - Deprecated positional arguments for all but the first argument of :meth:`DataFrame.to_stata` and :func:`read_stata`, use keyword arguments instead (:issue:`48128`). - Deprecated the ``mangle_dupe_cols`` argument in :func:`read_csv`, :func:`read_fwf`, :func:`read_table` and :func:`read_excel`. 
The argument was never implemented, and a new argument where the renaming pattern can be specified will be added instead (:issue:`47718`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fc2df4c1179e8..1f3f68c6d409b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5855,9 +5855,8 @@ def set_index( *, drop: bool = ..., append: bool = ..., - inplace: Literal[False] | lib.NoDefault = ..., + inplace: Literal[False] = ..., verify_integrity: bool = ..., - copy: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -5870,7 +5869,6 @@ def set_index( append: bool = ..., inplace: Literal[True], verify_integrity: bool = ..., - copy: bool | lib.NoDefault = ..., ) -> None: ... @@ -5880,9 +5878,8 @@ def set_index( keys, drop: bool = True, append: bool = False, - inplace: bool | lib.NoDefault = lib.no_default, + inplace: bool = False, verify_integrity: bool = False, - copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | None: """ Set the DataFrame index using existing columns. @@ -5905,18 +5902,10 @@ def set_index( Whether to append columns to existing index. inplace : bool, default False Whether to modify the DataFrame rather than creating a new one. - - .. deprecated:: 1.5.0 - verify_integrity : bool, default False Check the new index for duplicates. Otherwise defer the check until necessary. Setting to False will improve the performance of this method. - copy : bool, default True - Whether to make a copy of the underlying data when returning a new - DataFrame. - - .. versionadded:: 1.5.0 Returns ------- @@ -5981,25 +5970,7 @@ def set_index( 3 9 7 2013 84 4 16 10 2014 31 """ - if inplace is not lib.no_default: - inplace = validate_bool_kwarg(inplace, "inplace") - warnings.warn( - "The 'inplace' keyword in DataFrame.set_index is deprecated " - "and will be removed in a future version. 
Use " - "`df = df.set_index(..., copy=False)` instead.", - FutureWarning, - stacklevel=find_stack_level(inspect.currentframe()), - ) - else: - inplace = False - - if inplace: - if copy is not lib.no_default: - raise ValueError("Cannot specify copy when inplace=True") - copy = False - elif copy is lib.no_default: - copy = True - + inplace = validate_bool_kwarg(inplace, "inplace") self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] @@ -6035,7 +6006,7 @@ def set_index( if inplace: frame = self else: - frame = self.copy(deep=copy) + frame = self.copy() arrays = [] names: list[Hashable] = [] diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 995a31720c4ad..95a607c68110e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -783,9 +783,9 @@ def get_result(self, copy: bool = True) -> DataFrame: if self.indicator: result = self._indicator_post_merge(result) - result = self._maybe_add_join_keys(result, left_indexer, right_indexer) + self._maybe_add_join_keys(result, left_indexer, right_indexer) - result = self._maybe_restore_index_levels(result) + self._maybe_restore_index_levels(result) self._maybe_drop_cross_column(result, self._cross) @@ -852,7 +852,7 @@ def _indicator_post_merge(self, result: DataFrame) -> DataFrame: result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1) return result - def _maybe_restore_index_levels(self, result: DataFrame) -> DataFrame: + def _maybe_restore_index_levels(self, result: DataFrame) -> None: """ Restore index levels specified as `on` parameters @@ -870,7 +870,7 @@ def _maybe_restore_index_levels(self, result: DataFrame) -> DataFrame: Returns ------- - DataFrame + None """ names_to_restore = [] for name, left_key, right_key in zip( @@ -894,15 +894,14 @@ def _maybe_restore_index_levels(self, result: DataFrame) -> DataFrame: names_to_restore.append(name) if names_to_restore: - result = 
result.set_index(names_to_restore, copy=False) - return result + result.set_index(names_to_restore, inplace=True) def _maybe_add_join_keys( self, result: DataFrame, left_indexer: np.ndarray | None, right_indexer: np.ndarray | None, - ) -> DataFrame: + ) -> None: left_has_missing = None right_has_missing = None @@ -993,12 +992,11 @@ def _maybe_add_join_keys( for level_name in result.index.names ] - result = result.set_index(idx_list, copy=False) + result.set_index(idx_list, inplace=True) else: result.index = Index(key_col, name=name) else: result.insert(i, name or f"key_{i}", key_col) - return result def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """return the join indexers""" @@ -1768,8 +1766,7 @@ def get_result(self, copy: bool = True) -> DataFrame: result = self._reindex_and_concat( join_index, left_join_indexer, right_join_indexer, copy=copy ) - - result = self._maybe_add_join_keys(result, left_indexer, right_indexer) + self._maybe_add_join_keys(result, left_indexer, right_indexer) return result diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index bb62b1405da3a..2305c209936b6 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -117,7 +117,7 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: # String case if item not in frame.columns: raise ValueError(f"Index {item} invalid") - frame = frame.set_index(self.index_col, drop=True, copy=False) + frame.set_index(self.index_col, drop=True, inplace=True) # Clear names if headerless and no name given if self.header is None and not multi_index_named: frame.index.names = [None] * len(frame.index.names) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 77f35bc09abd3..96ba6b2e84cf3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4667,7 +4667,7 @@ def read( columns.insert(0, n) s = super().read(where=where, columns=columns, start=start, 
stop=stop) if is_multi_index: - s = s.set_index(self.levels, copy=False) + s.set_index(self.levels, inplace=True) s = s.iloc[:, 0] diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 219b8105561ee..ee6564d103147 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -152,7 +152,7 @@ def _wrap_result( frame = _parse_date_columns(frame, parse_dates) if index_col is not None: - frame = frame.set_index(index_col, copy=False) + frame.set_index(index_col, inplace=True) return frame @@ -980,7 +980,7 @@ def _query_iterator( self._harmonize_columns(parse_dates=parse_dates) if self.index is not None: - self.frame = self.frame.set_index(self.index, copy=False) + self.frame.set_index(self.index, inplace=True) yield self.frame @@ -1021,7 +1021,7 @@ def read( self._harmonize_columns(parse_dates=parse_dates) if self.index is not None: - self.frame = self.frame.set_index(self.index, copy=False) + self.frame.set_index(self.index, inplace=True) return self.frame diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index c71b688d390d4..47ebca0b9bf5c 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -387,8 +387,8 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype ) df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype) - df = df.set_index(["a", "b"], copy=False) - df2 = df2.set_index(["a", "b"], copy=False) + df.set_index(["a", "b"], inplace=True) + df2.set_index(["a", "b"], inplace=True) result = df.combine_first(df2) expected = DataFrame( {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index b404c34a4ddb8..4c39cf99f18ff 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ 
b/pandas/tests/frame/methods/test_set_index.py @@ -25,27 +25,6 @@ class TestSetIndex: - def test_set_index_copy(self): - # GH#48043 - df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) - expected = DataFrame({"B": [3, 4], "C": [5, 6]}, index=Index([1, 2], name="A")) - - res = df.set_index("A", copy=True) - tm.assert_frame_equal(res, expected) - assert not any(tm.shares_memory(df[col], res[col]) for col in res.columns) - - res = df.set_index("A", copy=False) - tm.assert_frame_equal(res, expected) - assert all(tm.shares_memory(df[col], res[col]) for col in res.columns) - - msg = "Cannot specify copy when inplace=True" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match="The 'inplace'"): - df.set_index("A", inplace=True, copy=True) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match="The 'inplace'"): - df.set_index("A", inplace=True, copy=False) - def test_set_index_multiindex(self): # segfault in GH#3308 d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} @@ -199,10 +178,7 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys): if inplace: result = df.copy() - with tm.assert_produces_warning( - FutureWarning, match="The 'inplace' keyword" - ): - return_value = result.set_index(keys, drop=drop, inplace=True) + return_value = result.set_index(keys, drop=drop, inplace=True) assert return_value is None else: result = df.set_index(keys, drop=drop) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index cb97e2bfb6202..bc6c676568f73 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -244,8 +244,7 @@ def _check_f(base, f): # set_index f = lambda x: x.set_index("a", inplace=True) - with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"): - _check_f(data.copy(), f) + _check_f(data.copy(), f) # reset_index f = lambda x: x.reset_index(inplace=True) diff --git 
a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index aedc9270fd37c..35335c54cd41e 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -436,8 +436,7 @@ def test_date_index_query(self): df = DataFrame(np.random.randn(n, 3)) df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) - with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"): - return_value = df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) assert return_value is None res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] @@ -450,8 +449,7 @@ def test_date_index_query_with_NaT(self): df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT - with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"): - return_value = df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) assert return_value is None res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] @@ -465,8 +463,7 @@ def test_date_index_query_with_NaT_duplicates(self): d["dates3"] = date_range("1/1/2014", periods=n) df = DataFrame(d) df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT - with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"): - return_value = df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) assert return_value is None res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)] @@ -797,8 +794,7 @@ def test_date_index_query(self): df = DataFrame(np.random.randn(n, 
3)) df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) - with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"): - return_value = df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) assert return_value is None res = df.query( "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser @@ -813,8 +809,7 @@ def test_date_index_query_with_NaT(self): df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT - with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"): - return_value = df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) assert return_value is None res = df.query( "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser @@ -829,8 +824,7 @@ def test_date_index_query_with_NaT_duplicates(self): df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT - with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"): - return_value = df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) assert return_value is None msg = r"'BoolOp' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a66d13efce65..b064c12f89c21 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -678,7 +678,7 @@ def test_apply_groupby_datetimeindex(): result = df.groupby("Name").sum() expected = DataFrame({"Name": ["A", "B", "C"], "Value": [10, 50, 90]}) - expected = expected.set_index("Name", copy=False) + expected.set_index("Name", inplace=True) tm.assert_frame_equal(result, 
expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 7ba22c09cd26d..d813a2848a5dc 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -98,7 +98,7 @@ def test_builtins_apply(keys, f): if f != sum: expected = gb.agg(fname).reset_index() - expected = expected.set_index(keys, copy=False, drop=False) + expected.set_index(keys, inplace=True, drop=False) tm.assert_frame_equal(result, expected, check_dtype=False) tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) @@ -454,7 +454,7 @@ def test_groupby_non_arithmetic_agg_types(dtype, method, data): df_out = DataFrame(exp) df_out["b"] = df_out.b.astype(out_type) - df_out = df_out.set_index("a", copy=False) + df_out.set_index("a", inplace=True) grpd = df.groupby("a") t = getattr(grpd, method)(*data["args"]) diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index b1deec12b1adb..eed27cd450e9c 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -35,7 +35,7 @@ def test_insert(idx): idx.insert(0, ("foo2",)) left = pd.DataFrame([["a", "b", 0], ["b", "d", 1]], columns=["1st", "2nd", "3rd"]) - left = left.set_index(["1st", "2nd"], copy=False) + left.set_index(["1st", "2nd"], inplace=True) ts = left["3rd"].copy(deep=True) left.loc[("b", "x"), "3rd"] = 2 @@ -65,7 +65,7 @@ def test_insert(idx): ], columns=["1st", "2nd", "3rd"], ) - right = right.set_index(["1st", "2nd"], copy=False) + right.set_index(["1st", "2nd"], inplace=True) # FIXME data types changes to float because # of intermediate nan insertion; tm.assert_frame_equal(left, right, check_dtype=False) diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index 16b0ae2c63eb1..e8c766d489813 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ 
b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -60,18 +60,15 @@ def validate(mi, df, key): assert key[: i + 1] in mi.index right = df[mask].copy() - msg = "The 'inplace' keyword in DataFrame.set_index is deprecated" if i + 1 != len(key): # partial key return_value = right.drop(cols[: i + 1], axis=1, inplace=True) assert return_value is None - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = right.set_index(cols[i + 1 : -1], inplace=True) + return_value = right.set_index(cols[i + 1 : -1], inplace=True) assert return_value is None tm.assert_frame_equal(mi.loc[key[: i + 1]], right) else: # full key - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = right.set_index(cols[:-1], inplace=True) + return_value = right.set_index(cols[:-1], inplace=True) assert return_value is None if len(right) == 1: # single hit right = Series( diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 100b3e55b03c5..08e15545cb998 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -131,7 +131,7 @@ def test_multiindex_complex(self): "z": non_complex_data, } ) - result = result.set_index(["x", "y"], copy=False) + result.set_index(["x", "y"], inplace=True) expected = DataFrame( {"z": non_complex_data}, index=MultiIndex.from_arrays( diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 6c6ea4c8b0e0a..40a50c55de2a4 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -137,7 +137,7 @@ def test_append_series(setup_path): mi["B"] = np.arange(len(mi)) mi["C"] = "foo" mi.loc[3:5, "C"] = "bar" - mi = mi.set_index(["C", "B"], copy=False) + mi.set_index(["C", "B"], inplace=True) s = mi.stack() s.index = s.index.droplevel(2) store.append("mi", s) @@ -326,7 +326,7 @@ def 
test_append_with_different_block_ordering(setup_path): a = df.pop("A") df["A"] = a - df = df.set_index("index", copy=False) + df.set_index("index", inplace=True) store.append("df", df) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index c2c47672b190d..ee55837324f20 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -771,7 +771,7 @@ def _roundtrip(self, test_frame1): assert self.pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") - result = result.set_index("level_0", copy=False) + result.set_index("level_0", inplace=True) # result.index.astype(int) result.index.name = None @@ -928,7 +928,7 @@ def test_roundtrip(self, test_frame1): # HACK! result.index = test_frame1.index - result = result.set_index("level_0", copy=False) + result.set_index("level_0", inplace=True) result.index.astype(int) result.index.name = None tm.assert_frame_equal(result, test_frame1) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index f804f7df06bb8..b38c9adb0a893 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1550,8 +1550,8 @@ def test_errorbar_with_partial_columns(self): self._check_has_errorbars(ax, xerr=0, yerr=2) ix = date_range("1/1/2000", periods=10, freq="M") - df = df.set_index(ix, copy=False) - df_err = df_err.set_index(ix, copy=False) + df.set_index(ix, inplace=True) + df_err.set_index(ix, inplace=True) ax = _check_plot_works(df.plot, yerr=df_err, kind="line") self._check_has_errorbars(ax, xerr=0, yerr=2) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index f3d24fa53f7ed..6e87c221426c1 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -409,7 +409,7 @@ def test_join_hierarchical_mixed(self): df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", 
"b", "c"]) new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]}) other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"]) - other_df = other_df.set_index("a", copy=False) + other_df.set_index("a", inplace=True) # GH 9455, 12219 msg = "merging between different levels is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c7d7d1b0daa50..f172528041fb3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1347,7 +1347,7 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): ], columns=["a", "key", "b"], ) - expected = expected.set_index(expected_index, copy=False) + expected.set_index(expected_index, inplace=True) tm.assert_frame_equal(result, expected) def test_merge_right_index_right(self): diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index cc8019c50bc1e..0dbe45eeb1e82 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -130,7 +130,7 @@ def run_asserts(left, right, sort): left["4th"] = bind_cols(left) right["5th"] = -bind_cols(right) - right = right.set_index(icols, copy=False) + right.set_index(icols, inplace=True) run_asserts(left, right, sort) @@ -143,7 +143,7 @@ def run_asserts(left, right, sort): i = np.random.permutation(len(left)) right = left.iloc[i, :-1] right["5th"] = -bind_cols(right) - right = right.set_index(icols, copy=False) + right.set_index(icols, inplace=True) run_asserts(left, right, sort) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 6ddffd0d006dc..5e87e36f21cca 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -252,9 +252,7 @@ def test_timedelta_assignment(): def 
test_underlying_data_conversion(using_copy_on_write): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) - msg = "The 'inplace' keyword" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = df.set_index(["a", "b", "c"], inplace=True) + return_value = df.set_index(["a", "b", "c"], inplace=True) assert return_value is None s = Series([1], index=[(2, 2, 2)]) df["val"] = 0 @@ -268,8 +266,7 @@ def test_underlying_data_conversion(using_copy_on_write): expected = DataFrame( {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} ) - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = expected.set_index(["a", "b", "c"], inplace=True) + return_value = expected.set_index(["a", "b", "c"], inplace=True) assert return_value is None tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 943ffc10f52c8..c9ec2985488be 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -706,7 +706,7 @@ def test_rolling_window_as_string(center, expected_data): data = npr.randint(1, high=100, size=len(days)) df = DataFrame({"DateCol": days, "metric": data}) - df = df.set_index("DateCol", copy=False) + df.set_index("DateCol", inplace=True) result = df.rolling(window="21D", min_periods=2, closed="left", center=center)[ "metric" ].agg("max") From f51c2a03ba0dab0431182db366e95af70aadb2bd Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 15 Sep 2022 02:04:43 +0200 Subject: [PATCH 074/136] Backport PR #48550 on branch 1.5.x (TST: remove 2D tests irrelevant for pyarrow) (#48554) Backport PR #48550: TST: remove 2D tests irrelevant for pyarrow Co-authored-by: jbrockmendel --- pandas/tests/extension/base/dim2.py | 3 +++ pandas/tests/extension/test_arrow.py | 14 -------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git 
a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index f71f3cf164bfc..1d5a5c4532a5d 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -12,6 +12,9 @@ class Dim2CompatTests(BaseExtensionTests): + # Note: these are ONLY for ExtensionArray subclasses that support 2D arrays. + # i.e. not for pyarrow-backed EAs. + def test_transpose(self, data): arr2d = data.repeat(2).reshape(-1, 2) shape = arr2d.shape diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ce30e3e92a4c9..62e9503286311 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -319,20 +319,6 @@ def test_from_sequence_of_strings_pa_array(self, data, request): tm.assert_extension_array_equal(result, data) -@pytest.mark.xfail( - raises=NotImplementedError, reason="pyarrow.ChunkedArray backing is 1D." -) -class TestDim2Compat(base.Dim2CompatTests): - pass - - -@pytest.mark.xfail( - raises=NotImplementedError, reason="pyarrow.ChunkedArray backing is 1D." 
-) -class TestNDArrayBacked2D(base.NDArrayBacked2DTests): - pass - - class TestGetitemTests(base.BaseGetitemTests): @pytest.mark.xfail( reason=( From b160966ecfe764c45c3b7b92f03b9c550ed5f2b6 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 15 Sep 2022 09:33:54 +0200 Subject: [PATCH 075/136] Backport PR #48556 on branch 1.5.x (DOC: Fix docs footer) (#48558) Backport PR #48556: DOC: Fix docs footer Co-authored-by: Marc Garcia --- doc/_templates/pandas_footer.html | 3 +++ doc/source/conf.py | 8 +++----- 2 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 doc/_templates/pandas_footer.html diff --git a/doc/_templates/pandas_footer.html b/doc/_templates/pandas_footer.html new file mode 100644 index 0000000000000..c24bce52e67e0 --- /dev/null +++ b/doc/_templates/pandas_footer.html @@ -0,0 +1,3 @@ + diff --git a/doc/source/conf.py b/doc/source/conf.py index 3225582269b64..e7e64315a502f 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -163,11 +163,8 @@ # General information about the project. project = "pandas" -copyright = ( - f"{datetime.now().year} " - 'pandas via NumFOCUS, Inc. 
' - 'Hosted by OVH Cloud' -) +# We have our custom "pandas_footer.html" template, using copyright for the current year +copyright = f"{datetime.now().year}" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -243,6 +240,7 @@ html_theme_options = { "external_links": [], + "footer_items": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", "twitter_url": "https://twitter.com/pandas_dev", "google_analytics_id": "UA-27880019-2", From 307ce80e5e9ddb863f13571b7e5dc62b0108930c Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 15 Sep 2022 17:19:16 +0200 Subject: [PATCH 076/136] Backport PR #48562 on branch 1.5.x (TST: Testing that no warnings are emitted and that inplace fillna produces the correct result (GH48480)) (#48564) Backport PR #48562: TST: Testing that no warnings are emitted and that inplace fillna produces the correct result (GH48480) Co-authored-by: RaphSku <45042665+RaphSku@users.noreply.github.com> --- pandas/tests/frame/methods/test_fillna.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 3d7e5c6823e9d..697b28a65ac2e 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -780,3 +780,16 @@ def test_fillna_nonconsolidated_frame(): df_nonconsol = df.pivot(index="i1", columns="i2") result = df_nonconsol.fillna(0) assert result.isna().sum().sum() == 0 + + +def test_fillna_nones_inplace(): + # GH 48480 + df = DataFrame( + [[None, None], [None, None]], + columns=["A", "B"], + ) + with tm.assert_produces_warning(False): + df.fillna(value={"A": 1, "B": 2}, inplace=True) + + expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"]) + tm.assert_frame_equal(df, expected) From 
6ee47f96290516117635deaccb11f4691a34c89a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 15 Sep 2022 17:19:27 +0200 Subject: [PATCH 077/136] Backport PR #48563 on branch 1.5.x (DOC: Fix read_sas 1.5 release notes) (#48565) Backport PR #48563: DOC: Fix read_sas 1.5 release notes Co-authored-by: Jonas Haag --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4d717aa45ccea..581cc9b5f61a9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -965,7 +965,7 @@ Performance improvements - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. 
(:issue:`44764`) -- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`) +- Performance improvement to :func:`read_sas` (:issue:`47404`) - Performance improvement in ``argmax`` and ``argmin`` for :class:`arrays.SparseArray` (:issue:`34197`) - From 19a3f5ae50cbf4c1d78268755ee67e623a7de580 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 15 Sep 2022 20:53:20 +0200 Subject: [PATCH 078/136] Backport PR #48539 on branch 1.5.x (REGR: groupby doesn't identify null values when sort=False) (#48568) Backport PR #48539: REGR: groupby doesn't identify null values when sort=False Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v1.5.0.rst | 1 - pandas/core/algorithms.py | 11 ++++++++++ pandas/tests/groupby/test_groupby_dropna.py | 23 ++++++++++++++++++++- pandas/tests/test_algos.py | 12 +++++------ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 581cc9b5f61a9..bed545fccba1c 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1016,7 +1016,6 @@ Numeric - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`) - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`) - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. 
(:issue:`42878`) -- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2e6737b2e61aa..9b16032a1d418 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -566,6 +566,17 @@ def factorize_array( hash_klass, values = _get_hashtable_algo(values) + # factorize can now handle differentiating various types of null values. + # However, for backwards compatibility we only use the null for the + # provided dtype. This may be revisited in the future, see GH#48476. + null_mask = isna(values) + if null_mask.any(): + na_value = na_value_for_dtype(values.dtype, compat=False) + # Don't modify (potentially user-provided) array + # error: No overload variant of "where" matches argument types "Any", "object", + # "ndarray[Any, Any]" + values = np.where(null_mask, na_value, values) # type: ignore[call-overload] + table = hash_klass(size_hint or len(values)) uniques, codes = table.factorize( values, diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 394b5adcf0370..b2426ffa9dad3 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -3,6 +3,8 @@ from pandas.compat.pyarrow import pa_version_under1p01 +from pandas.core.dtypes.missing import na_value_for_dtype + import pandas as pd import pandas._testing as tm @@ -422,7 +424,7 @@ def test_groupby_drop_nan_with_multi_index(): ( [ pd.Period("2012-02-01", freq="D"), - pd.NA, + pd.NaT, pd.Period("2012-01-01", freq="D"), pd.Period("2012-02-01", freq="D"), ], @@ -454,3 +456,22 @@ def test_no_sort_keep_na(values, dtype, test_series): # TODO: Slicing reorders categories? 
expected.index = expected.index.reorder_categories(["y", "x"]) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize("dtype", [object, None]) +def test_null_is_null_for_dtype( + sort, dtype, nulls_fixture, nulls_fixture2, test_series +): + # GH#48506 - groups should always result in using the null for the dtype + df = pd.DataFrame({"a": [1, 2]}) + groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype) + obj = df["a"] if test_series else df + gb = obj.groupby(groups, dropna=False, sort=sort) + result = gb.sum() + index = pd.Index([na_value_for_dtype(groups.dtype)]) + expected = pd.DataFrame({"a": [3]}, index=index) + if test_series: + tm.assert_series_equal(result, expected["a"]) + else: + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 2bdd9dc8fb7b4..80271c13cd35d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -468,7 +468,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): ( ["a", None, "b", "a"], np.array([0, 1, 2, 0], dtype=np.dtype("intp")), - np.array(["a", None, "b"], dtype=object), + np.array(["a", np.nan, "b"], dtype=object), ), ( ["a", np.nan, "b", "a"], @@ -482,8 +482,8 @@ def test_object_factorize_use_na_sentinel_false( ): codes, uniques = algos.factorize(data, use_na_sentinel=False) - tm.assert_numpy_array_equal(uniques, expected_uniques) - tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True) + tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True) @pytest.mark.parametrize( "data, expected_codes, expected_uniques", @@ -491,7 +491,7 @@ def test_object_factorize_use_na_sentinel_false( ( [1, None, 1, 2], np.array([0, 1, 0, 2], dtype=np.dtype("intp")), - np.array([1, None, 2], dtype="O"), + np.array([1, np.nan, 2], dtype="O"), ), ( [1, np.nan, 1, 2], @@ -505,8 +505,8 @@ def 
test_int_factorize_use_na_sentinel_false( ): codes, uniques = algos.factorize(data, use_na_sentinel=False) - tm.assert_numpy_array_equal(uniques, expected_uniques) - tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True) + tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True) class TestUnique: From 1fb8811ece22d71727fdf597af19e069519cdae7 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 15 Sep 2022 20:53:36 +0200 Subject: [PATCH 079/136] Backport PR #48555 on branch 1.5.x (DEPR: Series.astype(np.datetime64)) (#48569) Backport PR #48555: DEPR: Series.astype(np.datetime64) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/arrays/datetimes.py | 16 ++++++++++++++++ pandas/core/indexes/base.py | 6 ++++++ pandas/tests/series/methods/test_astype.py | 13 +++++++++++++ 4 files changed, 36 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index bed545fccba1c..bb9b052cd6e00 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -936,6 +936,7 @@ Other Deprecations - Deprecated the ``sort_columns`` argument in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`). - Deprecated positional arguments for all but the first argument of :meth:`DataFrame.to_stata` and :func:`read_stata`, use keyword arguments instead (:issue:`48128`). - Deprecated the ``mangle_dupe_cols`` argument in :func:`read_csv`, :func:`read_fwf`, :func:`read_table` and :func:`read_excel`. The argument was never implemented, and a new argument where the renaming pattern can be specified will be added instead (:issue:`47718`) +- Deprecated allowing ``dtype='datetime64'`` or ``dtype=np.datetime64`` in :meth:`Series.astype`, use "datetime64[ns]" instead (:issue:`47844`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.performance: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 64c15df64de3b..6b1f0bfac8f7a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -630,6 +630,22 @@ def astype(self, dtype, copy: bool = True): return type(self)._simple_new(res_values, dtype=dtype) # TODO: preserve freq? + elif ( + self.tz is None + and is_datetime64_dtype(dtype) + and dtype != self.dtype + and is_unitless(dtype) + ): + # TODO(2.0): just fall through to dtl.DatetimeLikeArrayMixin.astype + warnings.warn( + "Passing unit-less datetime64 dtype to .astype is deprecated " + "and will raise in a future version. Pass 'datetime64[ns]' instead", + FutureWarning, + stacklevel=find_stack_level(inspect.currentframe()), + ) + # unit conversion e.g. datetime64[s] + return self._ndarray.astype(dtype) + elif is_period_dtype(dtype): return self.to_period(freq=dtype.freq) return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ace3df3ae97a4..642a01821cb17 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -42,6 +42,7 @@ IncompatibleFrequency, OutOfBoundsDatetime, Timestamp, + is_unitless, tz_compare, ) from pandas._typing import ( @@ -1085,6 +1086,11 @@ def astype(self, dtype, copy: bool = True): values = self._data if isinstance(values, ExtensionArray): + if isinstance(dtype, np.dtype) and dtype.kind == "M" and is_unitless(dtype): + # TODO(2.0): remove this special-casing once this is enforced + # in DTA.astype + raise TypeError(f"Cannot cast {type(self).__name__} to dtype") + with rewrite_exception(type(values).__name__, type(self).__name__): new_values = values.astype(dtype, copy=copy) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 47d6cad0e1743..498225307b52e 100644 --- a/pandas/tests/series/methods/test_astype.py +++ 
b/pandas/tests/series/methods/test_astype.py @@ -30,6 +30,19 @@ class TestAstypeAPI: + def test_astype_unitless_dt64_deprecated(self): + # GH#47844 + ser = Series(["1970-01-01", "1970-01-01", "1970-01-01"], dtype="datetime64[ns]") + + msg = "Passing unit-less datetime64 dtype to .astype is deprecated and " + with tm.assert_produces_warning(FutureWarning, match=msg): + res = ser.astype(np.datetime64) + tm.assert_series_equal(ser, res) + + with tm.assert_produces_warning(FutureWarning, match=msg): + res = ser.astype("datetime64") + tm.assert_series_equal(ser, res) + def test_arg_for_errors_in_astype(self): # see GH#14878 ser = Series([1, 2, 3]) From 9e8d859f46abe33896ed9df8d82587f915cc2529 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 15 Sep 2022 22:04:13 +0200 Subject: [PATCH 080/136] Backport PR #48557 on branch 1.5.x (WEB: Add new footer to web) (#48571) Backport PR #48557: WEB: Add new footer to web Co-authored-by: Marc Garcia --- web/pandas/_templates/layout.html | 2 +- web/pandas/config.yml | 1 + web/pandas_web.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index 67876d88a2d1a..0829bec043b2b 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -91,7 +91,7 @@

    - pandas is a fiscally sponsored project of NumFOCUS. + © {{ current_year }} pandas via NumFOCUS, Inc. Hosted by OVH Cloud.

    diff --git a/web/pandas/config.yml b/web/pandas/config.yml index b33ea5b0dd972..df31b1a4fde19 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -7,6 +7,7 @@ main: - try.md # the binder page will be added later github_repo_url: pandas-dev/pandas context_preprocessors: + - pandas_web.Preprocessors.current_year - pandas_web.Preprocessors.navbar_add_info - pandas_web.Preprocessors.blog_add_posts - pandas_web.Preprocessors.maintainers_add_info diff --git a/web/pandas_web.py b/web/pandas_web.py index 290443d1d2970..62539574543a9 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -54,6 +54,15 @@ class Preprocessors: anything else needed just be added with context preprocessors. """ + @staticmethod + def current_year(context): + """ + Add the current year to the context, so it can be used for the copyright + note, or other places where it is needed. + """ + context["current_year"] = datetime.datetime.now().year + return context + @staticmethod def navbar_add_info(context): """ From 27fd6c6c5141c5946470f4810858eb903491d651 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Sep 2022 21:56:36 +0200 Subject: [PATCH 081/136] Backport PR #48285 on branch 1.5.x (WEB: Unpin pydata sphinx theme) (#48585) Backport PR #48285: WEB: Unpin pydata sphinx theme Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/_templates/sidebar-nav-bs.html | 4 ++-- doc/source/conf.py | 4 ++-- environment.yml | 2 +- requirements-dev.txt | 2 +- web/pandas/versions.json | 21 ++++++++++++++------- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/doc/_templates/sidebar-nav-bs.html b/doc/_templates/sidebar-nav-bs.html index 7e0043e771e72..f592a910dfbda 100644 --- a/doc/_templates/sidebar-nav-bs.html +++ b/doc/_templates/sidebar-nav-bs.html @@ -1,9 +1,9 @@ diff --git a/doc/source/conf.py b/doc/source/conf.py index e7e64315a502f..8740f6aa7eef6 100644 --- a/doc/source/conf.py +++ 
b/doc/source/conf.py @@ -244,10 +244,10 @@ "github_url": "https://github.com/pandas-dev/pandas", "twitter_url": "https://twitter.com/pandas_dev", "google_analytics_id": "UA-27880019-2", + "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"}, "navbar_end": ["version-switcher", "navbar-icon-links"], "switcher": { - "json_url": "https://pandas.pydata.org/versions.json", - "url_template": "https://pandas.pydata.org/{version}/", + "json_url": "/versions.json", "version_match": switcher_version, }, } diff --git a/environment.yml b/environment.yml index 90b6694a392bf..5adcf0dd216d8 100644 --- a/environment.yml +++ b/environment.yml @@ -99,7 +99,7 @@ dependencies: - natsort # DataFrame.sort_values doctest - numpydoc - pandas-dev-flaker=0.5.0 - - pydata-sphinx-theme=0.8.0 + - pydata-sphinx-theme - pytest-cython # doctest - sphinx - sphinx-panels diff --git a/requirements-dev.txt b/requirements-dev.txt index 39118b750fa8c..8be19d85f9515 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -76,7 +76,7 @@ gitdb natsort numpydoc pandas-dev-flaker==0.5.0 -pydata-sphinx-theme==0.8.0 +pydata-sphinx-theme pytest-cython sphinx sphinx-panels diff --git a/web/pandas/versions.json b/web/pandas/versions.json index 3085efe02738b..2b30006db0c4d 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -1,30 +1,37 @@ [ { "name": "dev", - "version": "docs/dev" + "version": "dev", + "url": "/docs/dev/" }, { "name": "1.4 (stable)", - "version": "docs" + "version": "1.4 (stable)", + "url": "/docs/" }, { "name": "1.4", - "version": "pandas-docs/version/1.4" + "version": "1.4", + "url": "/pandas-docs/version/1.4/" }, { "name": "1.3", - "version": "pandas-docs/version/1.3" + "version": "1.3", + "url": "/pandas-docs/version/1.3/" }, { "name": "1.2", - "version": "pandas-docs/version/1.2" + "version": "1.2", + "url": "/pandas-docs/version/1.2/" }, { "name": "1.1", - "version": "pandas-docs/version/1.1" + "version": "1.1", + "url": 
"/pandas-docs/version/1.1/" }, { "name": "1.0", - "version": "pandas-docs/version/1.0" + "version": "1.0", + "url": "/pandas-docs/version/1.0/" } ] From 3f91207dca8971c723605243f6bc113c739ba637 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Sep 2022 21:56:48 +0200 Subject: [PATCH 082/136] Backport PR #48572 on branch 1.5.x (DOC: Fixing styles for the dark theme) (#48584) Backport PR #48572: DOC: Fixing styles for the dark theme Co-authored-by: Marc Garcia --- doc/source/_static/css/getting_started.css | 7 +++---- doc/source/_static/css/pandas.css | 8 ++++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css index 84eafa308175c..2a348e5b84e6e 100644 --- a/doc/source/_static/css/getting_started.css +++ b/doc/source/_static/css/getting_started.css @@ -154,7 +154,7 @@ ul.task-bullet > li > p:first-child { .comparison-card .card-footer { border: none; - background-color:white; + background-color: transparent; } .install-block { @@ -163,19 +163,18 @@ ul.task-bullet > li > p:first-child { .install-card .card-header { border: none; - background-color:white; + background-color: transparent; padding: 1rem 1rem 0rem 1rem; } .install-card .card-header p.card-text { - color: #150458; font-size: 1.1rem; font-weight: bold; } .install-card .card-footer { border: none; - background-color:white; + background-color: transparent; } .install-card pre { diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index 452c7d20ff5df..25153b6a8ad5d 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -25,7 +25,7 @@ .intro-card .card-header { border: none; - background-color:white; + background-color: transparent; color: #150458 !important; font-size: var(--pst-font-size-h5); font-weight: bold; @@ -34,7 +34,7 @@ .intro-card .card-footer { border: none; - 
background-color:white; + background-color: transparent; } .intro-card .card-footer p.card-text{ @@ -42,3 +42,7 @@ margin-left: auto; margin-right: auto; } + +.card, .card img { + background-color: transparent !important; +} From dfc00bfc5d98f8e2c63356e6a415da8ab7a7b436 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Sep 2022 21:57:03 +0200 Subject: [PATCH 083/136] Backport PR #48397 on branch 1.5.x (WARN: Remove false positive warning for iloc inplaceness) (#48583) Backport PR #48397: WARN: Remove false positive warning for iloc inplaceness Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_testing/__init__.py | 3 ++- pandas/core/dtypes/cast.py | 5 ++++- pandas/core/indexing.py | 16 ++++++---------- pandas/tests/frame/indexing/test_indexing.py | 3 ++- pandas/tests/frame/methods/test_diff.py | 5 ++--- pandas/tests/frame/test_nonunique_indexes.py | 6 +++++- 6 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 1035fd08a1a36..c15e597558221 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -459,7 +459,8 @@ def all_timeseries_index_generator(k: int = 10) -> Iterable[Index]: def make_rand_series(name=None, dtype=np.float64) -> Series: index = makeStringIndex(_N) data = np.random.randn(_N) - data = data.astype(dtype, copy=False) + with np.errstate(invalid="ignore"): + data = data.astype(dtype, copy=False) return Series(data, index=index, name=name) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4244217da7865..ceb3d16949f91 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1970,7 +1970,10 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: if tipo.kind not in ["i", "u"]: if isinstance(element, np.ndarray) and element.dtype.kind == "f": # If all can be losslessly cast to integers, then we can hold them - 
casted = element.astype(dtype) + with np.errstate(invalid="ignore"): + # We check afterwards if cast was losslessly, so no need to show + # the warning + casted = element.astype(dtype) comp = casted == element if comp.all(): # Return the casted values bc they can be passed to diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0fb499088868a..87ab0d94a7ce0 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1994,21 +1994,17 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): self.obj._clear_item_cache() return + self.obj._iset_item(loc, value) + # We will not operate in-place, but will attempt to in the future. # To determine whether we need to issue a FutureWarning, see if the # setting in-place would work, i.e. behavior will change. - if isinstance(value, ABCSeries): - warn = can_hold_element(orig_values, value._values) - else: - warn = can_hold_element(orig_values, value) - # Don't issue the warning yet, as we can still trim a few cases where - # behavior will not change. - - self.obj._iset_item(loc, value) + new_values = self.obj._get_column_array(loc) - if warn: - new_values = self.obj._get_column_array(loc) + if can_hold_element(orig_values, new_values): + # Don't issue the warning yet, as we can still trim a few cases where + # behavior will not change. 
if ( isinstance(new_values, np.ndarray) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 65ac5e2c2bd1e..7f84d8a367eac 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1326,7 +1326,8 @@ def test_loc_expand_empty_frame_keep_midx_names(self): def test_loc_setitem_rhs_frame(self, idxr, val): # GH#47578 df = DataFrame({"a": [1, 2]}) - df.loc[:, "a"] = DataFrame({"a": [val, 11]}, index=[1, 2]) + with tm.assert_produces_warning(None): + df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2]) expected = DataFrame({"a": [np.nan, val]}) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index fc804836f9a9b..9a9fea3462752 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -82,7 +82,7 @@ def test_diff_datetime_axis0_with_nat(self, tz): expected = Series(ex_index).to_frame() tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("tz", [None, "UTC"]) + @pytest.mark.parametrize("tz", [pytest.param(None, marks=pytest.mark.xfail), "UTC"]) def test_diff_datetime_with_nat_zero_periods(self, tz): # diff on NaT values should give NaT, not timedelta64(0) dti = date_range("2016-01-01", periods=4, tz=tz) @@ -91,8 +91,7 @@ def test_diff_datetime_with_nat_zero_periods(self, tz): df[1] = ser.copy() - msg = "will attempt to set the values inplace instead" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(None): df.iloc[:, 0] = pd.NaT expected = df - df diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index cd6397b053803..2c28800fb181f 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat import 
is_platform_windows + import pandas as pd from pandas import ( DataFrame, @@ -320,7 +322,9 @@ def test_dup_columns_across_dtype(self): def test_set_value_by_index(self, using_array_manager): # See gh-12344 - warn = FutureWarning if using_array_manager else None + warn = ( + FutureWarning if using_array_manager and not is_platform_windows() else None + ) msg = "will attempt to set the values inplace" df = DataFrame(np.arange(9).reshape(3, 3).T) From aabf6597f45436e9ada915ac15d3708f9d4948ca Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 17 Sep 2022 16:24:16 +0200 Subject: [PATCH 084/136] Backport PR #48587 on branch 1.5.x (Fix `series.str.startswith(tuple)`) (#48593) Backport PR #48587: Fix `series.str.startswith(tuple)` Co-authored-by: Janosh Riebesell --- pandas/core/strings/accessor.py | 41 +++++++++++++++++------ pandas/tests/strings/test_find_replace.py | 14 ++++---- pandas/tests/strings/test_strings.py | 2 +- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 44ebfbd7f3e9c..0ee9f15c3630c 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -18,6 +18,7 @@ from pandas._typing import ( DtypeObj, F, + Scalar, ) from pandas.util._decorators import ( Appender, @@ -2287,7 +2288,9 @@ def count(self, pat, flags=0): return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) - def startswith(self, pat, na=None): + def startswith( + self, pat: str | tuple[str, ...], na: Scalar | None = None + ) -> Series | Index: """ Test if the start of each string element matches a pattern. @@ -2295,8 +2298,9 @@ def startswith(self, pat, na=None): Parameters ---------- - pat : str - Character sequence. Regular expressions are not accepted. + pat : str or tuple[str, ...] + Character sequence or tuple of strings. Regular expressions are not + accepted. 
na : object, default NaN Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. @@ -2331,6 +2335,13 @@ def startswith(self, pat, na=None): 3 NaN dtype: object + >>> s.str.startswith(('b', 'B')) + 0 True + 1 True + 2 False + 3 NaN + dtype: object + Specifying `na` to be `False` instead of `NaN`. >>> s.str.startswith('b', na=False) @@ -2340,14 +2351,16 @@ def startswith(self, pat, na=None): 3 False dtype: bool """ - if not isinstance(pat, str): - msg = f"expected a string object, not {type(pat).__name__}" + if not isinstance(pat, (str, tuple)): + msg = f"expected a string or tuple, not {type(pat).__name__}" raise TypeError(msg) result = self._data.array._str_startswith(pat, na=na) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) - def endswith(self, pat, na=None): + def endswith( + self, pat: str | tuple[str, ...], na: Scalar | None = None + ) -> Series | Index: """ Test if the end of each string element matches a pattern. @@ -2355,8 +2368,9 @@ def endswith(self, pat, na=None): Parameters ---------- - pat : str - Character sequence. Regular expressions are not accepted. + pat : str or tuple[str, ...] + Character sequence or tuple of strings. Regular expressions are not + accepted. na : object, default NaN Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. @@ -2391,6 +2405,13 @@ def endswith(self, pat, na=None): 3 NaN dtype: object + >>> s.str.endswith(('t', 'T')) + 0 True + 1 False + 2 True + 3 NaN + dtype: object + Specifying `na` to be `False` instead of `NaN`. 
>>> s.str.endswith('t', na=False) @@ -2400,8 +2421,8 @@ def endswith(self, pat, na=None): 3 False dtype: bool """ - if not isinstance(pat, str): - msg = f"expected a string object, not {type(pat).__name__}" + if not isinstance(pat, (str, tuple)): + msg = f"expected a string or tuple, not {type(pat).__name__}" raise TypeError(msg) result = self._data.array._str_endswith(pat, na=na) return self._wrap_result(result, returns_string=False) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 1c74950e30c40..62f9478bf25ff 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -291,21 +291,22 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- +@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", [None, "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_startswith(dtype, null_value, na): +def test_startswith(pat, dtype, null_value, na): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], dtype=dtype, ) - result = values.str.startswith("foo") + result = values.str.startswith(pat) exp = Series([False, np.nan, True, False, False, np.nan, True]) tm.assert_series_equal(result, exp) - result = values.str.startswith("foo", na=na) + result = values.str.startswith(pat, na=na) exp = Series([False, na, True, False, False, na, True]) tm.assert_series_equal(result, exp) @@ -351,21 +352,22 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): # -------------------------------------------------------------------------------------- +@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", [None, "category"]) @pytest.mark.parametrize("null_value", [None, 
np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_endswith(dtype, null_value, na): +def test_endswith(pat, dtype, null_value, na): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], dtype=dtype, ) - result = values.str.endswith("foo") + result = values.str.endswith(pat) exp = Series([False, np.nan, False, False, True, np.nan, True]) tm.assert_series_equal(result, exp) - result = values.str.endswith("foo", na=na) + result = values.str.endswith(pat, na=na) exp = Series([False, na, False, False, True, na, True]) tm.assert_series_equal(result, exp) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index ffa8b557d2379..4b25752940418 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -26,7 +26,7 @@ def test_startswith_endswith_non_str_patterns(pattern): # GH3485 ser = Series(["foo", "bar"]) - msg = f"expected a string object, not {type(pattern).__name__}" + msg = f"expected a string or tuple, not {type(pattern).__name__}" with pytest.raises(TypeError, match=msg): ser.str.startswith(pattern) with pytest.raises(TypeError, match=msg): From 4fbb05591979055708162994e96fb4c61cf2a8ab Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 18 Sep 2022 16:30:53 +0200 Subject: [PATCH 085/136] Backport PR #48601 on branch 1.5.x (CI: Fix matplolib release issues) (#48617) Backport PR #48601: CI: Fix matplolib release issues Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/user_guide/visualization.rst | 1 + pandas/plotting/_core.py | 2 +- pandas/plotting/_matplotlib/compat.py | 1 + pandas/plotting/_matplotlib/core.py | 21 +++++++-- pandas/plotting/_matplotlib/style.py | 8 +++- pandas/plotting/_misc.py | 44 +++++++++---------- pandas/tests/plotting/frame/test_frame.py | 26 +++++++---- 
.../tests/plotting/frame/test_frame_color.py | 4 +- pandas/tests/plotting/test_datetimelike.py | 7 +++ pandas/tests/plotting/test_hist_method.py | 6 +++ pandas/tests/plotting/test_series.py | 16 +++++-- 11 files changed, 94 insertions(+), 42 deletions(-) diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 5ce2f7ca599a7..147981f29476f 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -625,6 +625,7 @@ To plot multiple column groups in a single axes, repeat ``plot`` method specifyi It is recommended to specify ``color`` and ``label`` keywords to distinguish each groups. .. ipython:: python + :okwarning: ax = df.plot.scatter(x="a", y="b", color="DarkBlue", label="Group 1") @savefig scatter_plot_repeated.png diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 96f9abb301471..1fcefe884dec4 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1016,7 +1016,7 @@ def __call__(self, *args, **kwargs): >>> s = pd.Series([1, 3, 2]) >>> s.plot.line() - + .. 
plot:: :context: close-figs diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py index 6015662999a7d..86b218db4ebe6 100644 --- a/pandas/plotting/_matplotlib/compat.py +++ b/pandas/plotting/_matplotlib/compat.py @@ -19,3 +19,4 @@ def inner(): mpl_ge_3_4_0 = _mpl_version("3.4.0", operator.ge) mpl_ge_3_5_0 = _mpl_version("3.5.0", operator.ge) +mpl_ge_3_6_0 = _mpl_version("3.6.0", operator.ge) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 0b6e5b346062a..bacf19df06205 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -14,6 +14,7 @@ ) import warnings +import matplotlib as mpl from matplotlib.artist import Artist import numpy as np @@ -54,6 +55,7 @@ from pandas.core.frame import DataFrame from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.compat import mpl_ge_3_6_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by from pandas.plotting._matplotlib.misc import unpack_single_str_list @@ -1205,9 +1207,6 @@ def _make_plot(self): color_by_categorical = c_is_column and is_categorical_dtype(self.data[c]) - # pandas uses colormap, matplotlib uses cmap. - cmap = self.colormap or "Greys" - cmap = self.plt.cm.get_cmap(cmap) color = self.kwds.pop("color", None) if c is not None and color is not None: raise TypeError("Specify exactly one of `c` and `color`") @@ -1222,6 +1221,17 @@ def _make_plot(self): else: c_values = c + # cmap is only used if c_values are integers, otherwise UserWarning + if is_integer_dtype(c_values): + # pandas uses colormap, matplotlib uses cmap. 
+ cmap = self.colormap or "Greys" + if mpl_ge_3_6_0(): + cmap = mpl.colormaps[cmap] + else: + cmap = self.plt.cm.get_cmap(cmap) + else: + cmap = None + if color_by_categorical: from matplotlib import colors @@ -1286,7 +1296,10 @@ def _make_plot(self): ax = self.axes[0] # pandas uses colormap, matplotlib uses cmap. cmap = self.colormap or "BuGn" - cmap = self.plt.cm.get_cmap(cmap) + if mpl_ge_3_6_0(): + cmap = mpl.colormaps[cmap] + else: + cmap = self.plt.cm.get_cmap(cmap) cb = self.kwds.pop("colorbar", True) if C is None: diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 2f29aafbdf5cf..d8823c7ec8d3d 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -12,6 +12,7 @@ ) import warnings +import matplotlib as mpl import matplotlib.cm as cm import matplotlib.colors import numpy as np @@ -22,6 +23,8 @@ import pandas.core.common as com +from pandas.plotting._matplotlib.compat import mpl_ge_3_6_0 + if TYPE_CHECKING: from matplotlib.colors import Colormap @@ -155,7 +158,10 @@ def _get_cmap_instance(colormap: str | Colormap) -> Colormap: """Get instance of matplotlib colormap.""" if isinstance(colormap, str): cmap = colormap - colormap = cm.get_cmap(colormap) + if mpl_ge_3_6_0(): + colormap = mpl.colormaps[colormap] + else: + colormap = cm.get_cmap(colormap) if colormap is None: raise ValueError(f"Colormap {cmap} is not recognized") return colormap diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 5bd2e8a53e8e8..71209e1598d9a 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -139,22 +139,22 @@ def scatter_matrix( >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) >>> pd.plotting.scatter_matrix(df, alpha=0.2) - array([[, - , - , - ], - [, - , - , - ], - [, - , - , - ], - [, - , - , - ]], dtype=object) + array([[, + , + , + ], + [, + , + , + ], + [, + , + , + ], + [, + , + , + ]], dtype=object) """ plot_backend = 
_get_plot_backend("matplotlib") return plot_backend.scatter_matrix( @@ -247,7 +247,7 @@ def radviz( ... } ... ) >>> pd.plotting.radviz(df, 'Category') - + """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.radviz( @@ -311,7 +311,7 @@ def andrews_curves( ... 'pandas/main/pandas/tests/io/data/csv/iris.csv' ... ) >>> pd.plotting.andrews_curves(df, 'Name') - + """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.andrews_curves( @@ -445,7 +445,7 @@ def parallel_coordinates( >>> pd.plotting.parallel_coordinates( ... df, 'Name', color=('#556270', '#4ECDC4', '#C7F464') ... ) - + """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.parallel_coordinates( @@ -494,7 +494,7 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax >>> x = np.cumsum(np.random.normal(loc=1, scale=5, size=50)) >>> s = pd.Series(x) >>> s.plot() - + A lag plot with ``lag=1`` returns @@ -502,7 +502,7 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax :context: close-figs >>> pd.plotting.lag_plot(s, lag=1) - + """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.lag_plot(series=series, lag=lag, ax=ax, **kwds) @@ -536,7 +536,7 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax >>> spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000) >>> s = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing)) >>> pd.plotting.autocorrelation_plot(s) - + """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index b38c9adb0a893..09d310e5ce060 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -32,9 +32,15 @@ from pandas.io.formats.printing import pprint_thing import pandas.plotting as plotting +try: + from 
pandas.plotting._matplotlib.compat import mpl_ge_3_6_0 +except ImportError: + mpl_ge_3_6_0 = lambda: True + @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): + @pytest.mark.xfail(mpl_ge_3_6_0(), reason="Api changed") @pytest.mark.slow def test_plot(self): df = tm.makeTimeDataFrame() @@ -733,7 +739,7 @@ def test_plot_scatter_with_c(self): from pandas.plotting._matplotlib.compat import mpl_ge_3_4_0 df = DataFrame( - np.random.randn(6, 4), + np.random.randint(low=0, high=100, size=(6, 4)), index=list(string.ascii_letters[:6]), columns=["x", "y", "z", "four"], ) @@ -1533,8 +1539,8 @@ def test_errorbar_plot_iterator(self): def test_errorbar_with_integer_column_names(self): # test with integer column names - df = DataFrame(np.random.randn(10, 2)) - df_err = DataFrame(np.random.randn(10, 2)) + df = DataFrame(np.abs(np.random.randn(10, 2))) + df_err = DataFrame(np.abs(np.random.randn(10, 2))) ax = _check_plot_works(df.plot, yerr=df_err) self._check_has_errorbars(ax, xerr=0, yerr=2) ax = _check_plot_works(df.plot, y=0, yerr=1) @@ -1542,8 +1548,8 @@ def test_errorbar_with_integer_column_names(self): @pytest.mark.slow def test_errorbar_with_partial_columns(self): - df = DataFrame(np.random.randn(10, 3)) - df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2]) + df = DataFrame(np.abs(np.random.randn(10, 3))) + df_err = DataFrame(np.abs(np.random.randn(10, 2)), columns=[0, 2]) kinds = ["line", "bar"] for kind in kinds: ax = _check_plot_works(df.plot, yerr=df_err, kind=kind) @@ -1631,9 +1637,11 @@ def test_table(self): assert len(ax.tables) == 1 def test_errorbar_scatter(self): - df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"]) + df = DataFrame( + np.abs(np.random.randn(5, 2)), index=range(5), columns=["x", "y"] + ) df_err = DataFrame( - np.random.randn(5, 2) / 5, index=range(5), columns=["x", "y"] + np.abs(np.random.randn(5, 2)) / 5, index=range(5), columns=["x", "y"] ) ax = _check_plot_works(df.plot.scatter, x="x", y="y") @@ -1660,7 
+1668,9 @@ def _check_errorbar_color(containers, expected, has_err="has_xerr"): ) # GH 8081 - df = DataFrame(np.random.randn(10, 5), columns=["a", "b", "c", "d", "e"]) + df = DataFrame( + np.abs(np.random.randn(10, 5)), columns=["a", "b", "c", "d", "e"] + ) ax = df.plot.scatter(x="a", y="b", xerr="d", yerr="e", c="red") self._check_has_errorbars(ax, xerr=1, yerr=1) _check_errorbar_color(ax.containers, "red", has_err="has_xerr") diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py index 1cc8381642bbe..e384861d8a57c 100644 --- a/pandas/tests/plotting/frame/test_frame_color.py +++ b/pandas/tests/plotting/frame/test_frame_color.py @@ -655,6 +655,6 @@ def test_colors_of_columns_with_same_name(self): def test_invalid_colormap(self): df = DataFrame(np.random.randn(3, 2), columns=["A", "B"]) - msg = "'invalid_colormap' is not a valid value for name; supported values are " - with pytest.raises(ValueError, match=msg): + msg = "(is not a valid value)|(is not a known colormap)" + with pytest.raises((ValueError, KeyError), match=msg): df.plot(colormap="invalid_colormap") diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index cb428daac84ba..f75e5cd3491a4 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -39,6 +39,11 @@ from pandas.core.indexes.timedeltas import timedelta_range from pandas.tests.plotting.common import TestPlotBase +try: + from pandas.plotting._matplotlib.compat import mpl_ge_3_6_0 +except ImportError: + mpl_ge_3_6_0 = lambda: True + from pandas.tseries.offsets import WeekOfMonth @@ -260,6 +265,7 @@ def test_plot_multiple_inferred_freq(self): ser = Series(np.random.randn(len(dr)), index=dr) _check_plot_works(ser.plot) + @pytest.mark.xfail(mpl_ge_3_6_0(), reason="Api changed") def test_uhf(self): import pandas.plotting._matplotlib.converter as conv @@ -1209,6 +1215,7 @@ def 
test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 + @pytest.mark.xfail(mpl_ge_3_6_0(), reason="Api changed") def test_format_date_axis(self): rng = date_range("1/1/2012", periods=12, freq="M") df = DataFrame(np.random.randn(len(rng), 3), rng) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 9c11d589716fe..dc586d15ba115 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -18,6 +18,11 @@ _check_plot_works, ) +try: + from pandas.plotting._matplotlib.compat import mpl_ge_3_6_0 +except ImportError: + mpl_ge_3_6_0 = lambda: True + @pytest.fixture def ts(): @@ -191,6 +196,7 @@ def test_hist_kwargs(self, ts): ax = ts.plot.hist(align="left", stacked=True, ax=ax) tm.close() + @pytest.mark.xfail(mpl_ge_3_6_0(), reason="Api changed") @td.skip_if_no_scipy def test_hist_kde(self, ts): diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index c49354816b8b0..46b2b827d56b3 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -21,6 +21,11 @@ import pandas.plotting as plotting +try: + from pandas.plotting._matplotlib.compat import mpl_ge_3_6_0 +except ImportError: + mpl_ge_3_6_0 = lambda: True + @pytest.fixture def ts(): @@ -493,6 +498,7 @@ def test_kde_missing_vals(self): # gh-14821: check if the values have any missing values assert any(~np.isnan(axes.lines[0].get_xdata())) + @pytest.mark.xfail(mpl_ge_3_6_0(), reason="Api changed") def test_boxplot_series(self, ts): _, ax = self.plt.subplots() ax = ts.plot.box(logy=True, ax=ax) @@ -575,8 +581,10 @@ def test_errorbar_asymmetrical(self): def test_errorbar_plot(self): s = Series(np.arange(10), name="x") - s_err = np.random.randn(10) - d_err = DataFrame(np.random.randn(10, 2), index=s.index, columns=["x", "y"]) + s_err = np.abs(np.random.randn(10)) + d_err = DataFrame( + np.abs(np.random.randn(10, 2)), index=s.index, 
columns=["x", "y"] + ) # test line and bar plots kinds = ["line", "bar"] for kind in kinds: @@ -597,8 +605,8 @@ def test_errorbar_plot(self): # test time series plotting ix = date_range("1/1/2000", "1/1/2001", freq="M") ts = Series(np.arange(12), index=ix, name="x") - ts_err = Series(np.random.randn(12), index=ix) - td_err = DataFrame(np.random.randn(12, 2), index=ix, columns=["x", "y"]) + ts_err = Series(np.abs(np.random.randn(12)), index=ix) + td_err = DataFrame(np.abs(np.random.randn(12, 2)), index=ix, columns=["x", "y"]) ax = _check_plot_works(ts.plot, yerr=ts_err) self._check_has_errorbars(ax, xerr=0, yerr=1) From f83e2fe3327ad85ae2e8c4ba469fe98383243dbf Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 18 Sep 2022 20:51:33 +0200 Subject: [PATCH 086/136] Backport PR #48623 on branch 1.5.x (REGR/DOC: Docs left navbar broke) (#48625) Backport PR #48623: REGR/DOC: Docs left navbar broke Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/_templates/sidebar-nav-bs.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/_templates/sidebar-nav-bs.html b/doc/_templates/sidebar-nav-bs.html index f592a910dfbda..8298b66568e20 100644 --- a/doc/_templates/sidebar-nav-bs.html +++ b/doc/_templates/sidebar-nav-bs.html @@ -1,5 +1,5 @@