diff --git a/.devcontainer.json b/.devcontainer.json
new file mode 100644
index 0000000000000..315a1ff647012
--- /dev/null
+++ b/.devcontainer.json
@@ -0,0 +1,28 @@
+// For format details, see https://aka.ms/vscode-remote/devcontainer.json or the definition README at
+// https://github.com/microsoft/vscode-dev-containers/tree/master/containers/python-3-miniconda
+{
+ "name": "pandas",
+ "context": ".",
+ "dockerFile": "Dockerfile",
+
+ // Use 'settings' to set *default* container specific settings.json values on container create.
+ // You can edit these settings after create using File > Preferences > Settings > Remote.
+ "settings": {
+ "terminal.integrated.shell.linux": "/bin/bash",
+ "python.condaPath": "/opt/conda/bin/conda",
+ "python.pythonPath": "/opt/conda/bin/python",
+ "python.formatting.provider": "black",
+ "python.linting.enabled": true,
+ "python.linting.flake8Enabled": true,
+ "python.linting.pylintEnabled": false,
+ "python.linting.mypyEnabled": true,
+ "python.testing.pytestEnabled": true,
+ "python.testing.cwd": "pandas/tests"
+ },
+
+ // Add the IDs of extensions you want installed when the container is created in the array below.
+ "extensions": [
+ "ms-python.python",
+ "ms-vscode.cpptools"
+ ]
+}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 809764a20a713..139b9e31df46c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,11 +20,11 @@ repos:
rev: v0.730
hooks:
- id: mypy
- # We run mypy over all files because of:
- # * changes in type definitions may affect non-touched files.
- # * Running it with `mypy pandas` and the filenames will lead to
- # spurious duplicate module errors,
- # see also https://github.com/pre-commit/mirrors-mypy/issues/5
- pass_filenames: false
args:
- - pandas
+ # As long as some files are excluded from check-untyped-defs,
+ # we have to disable that check in the pre-commit hook as well, because
+ # the configuration is based on modules but the hook runs on files.
+ - --no-check-untyped-defs
+ - --follow-imports
+ - skip
+ files: pandas/
diff --git a/.travis.yml b/.travis.yml
index a11cd469e9b9c..2c8533d02ddc1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,10 +7,10 @@ python: 3.7
# travis cache --delete inside the project directory from the travis command line client
# The cache directories will be deleted if anything in ci/ changes in a commit
cache:
- ccache: true
- directories:
- - $HOME/.cache # cython cache
- - $HOME/.ccache # compiler cache
+ ccache: true
+ directories:
+ - $HOME/.cache # cython cache
+ - $HOME/.ccache # compiler cache
env:
global:
@@ -20,30 +20,30 @@ env:
- secure: "EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA="
git:
- # for cloning
- depth: false
+ # for cloning
+ depth: false
matrix:
- fast_finish: true
- exclude:
- # Exclude the default Python 3.5 build
- - python: 3.5
+ fast_finish: true
- include:
+ include:
- env:
- - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network)"
+ - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)"
- env:
- - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network)"
+ - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network and not clipboard)"
- env:
- - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1"
+ - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1"
services:
- mysql
- postgresql
- env:
- - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1"
+ # Enabling Deprecations when running tests
+ # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs
+ # See pandas/_testing.py for more details.
+ - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1"
services:
- mysql
- postgresql
@@ -73,7 +73,6 @@ before_install:
# This overrides travis and tells it to look nowhere.
- export BOTO_CONFIG=/dev/null
-
install:
- echo "install start"
- ci/prep_cython_cache.sh
@@ -90,5 +89,5 @@ script:
after_script:
- echo "after_script start"
- source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
- - ci/print_skipped.py
+ - ci/print_skipped.py
- echo "after_script done"
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000..b8aff5d671dcf
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,47 @@
+FROM continuumio/miniconda3
+
+# if you forked pandas, you can pass in your own GitHub username to use your fork
+# i.e. gh_username=myname
+ARG gh_username=pandas-dev
+ARG pandas_home="/home/pandas"
+
+# Avoid warnings by switching to noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Configure apt and install packages
+RUN apt-get update \
+ && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
+ #
+ # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
+ && apt-get -y install git iproute2 procps lsb-release \
+ #
+ # Install C compilers (gcc alone is not enough, so we use build-essential, which is admittedly more than strictly needed),
+ # needed to build pandas C extensions
+ && apt-get -y install build-essential \
+ #
+ # cleanup
+ && apt-get autoremove -y \
+ && apt-get clean -y \
+ && rm -rf /var/lib/apt/lists/*
+
+# Switch back to dialog for any ad-hoc use of apt-get
+ENV DEBIAN_FRONTEND=dialog
+
+# Clone pandas repo
+RUN mkdir "$pandas_home" \
+ && git clone "https://github.com/$gh_username/pandas.git" "$pandas_home" \
+ && cd "$pandas_home" \
+ && git remote add upstream "https://github.com/pandas-dev/pandas.git" \
+ && git pull upstream master
+
+# Because it is surprisingly difficult to activate a conda environment inside a Dockerfile
+# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89),
+# we just update the base/root one from the 'environment.yml' file instead of creating a new one.
+#
+# Set up environment
+RUN conda env update -n base -f "$pandas_home/environment.yml"
+
+# Build C extensions and pandas
+RUN cd "$pandas_home" \
+ && python setup.py build_ext --inplace -j 4 \
+ && python -m pip install -e .
diff --git a/LICENSE b/LICENSE
index 924de26253bf4..76954a5a339ab 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,8 +1,10 @@
BSD 3-Clause License
-Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
+Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.
+Copyright (c) 2011-2020, Open source contributors.
+
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index cd1a31d4eaf34..7886b63e9983e 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -43,6 +43,7 @@
"matplotlib": [],
"sqlalchemy": [],
"scipy": [],
+ "numba": [],
"numexpr": [],
"pytables": [null, ""], // platform dependent, see excludes below
"tables": [null, ""],
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 441f4b380656e..21081ee23a773 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -161,6 +161,9 @@ def time_pivot_table_categorical_observed(self):
observed=True,
)
+ def time_pivot_table_margins_only_column(self):
+ self.df.pivot_table(columns=["key2", "key3"], margins=True)
+
class Crosstab:
def setup(self):
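The new ``time_pivot_table_margins_only_column`` benchmark exercises the margins-with-only-columns path fixed in 1.1.0 (see the Reshaping notes further below); a minimal sketch of that call, on an assumed small random frame::

    import numpy as np
    import pandas as pd

    # assumed layout mirroring the benchmark's key2/key3 columns
    df = pd.DataFrame(
        {
            "key2": np.random.randint(0, 5, size=100),
            "key3": np.random.randint(0, 5, size=100),
            "value": np.random.randn(100),
        }
    )

    # margins=True with only `columns` given previously raised (GH 31016)
    df.pivot_table(columns=["key2", "key3"], margins=True)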
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 7a72622fd5fe3..f7e1e395a76bc 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -44,6 +44,27 @@ def time_rolling(self, constructor, window, dtype, function, raw):
self.roll.apply(function, raw=raw)
+class Engine:
+ params = (
+ ["DataFrame", "Series"],
+ ["int", "float"],
+ [np.sum, lambda x: np.sum(x) + 5],
+ ["cython", "numba"],
+ )
+ param_names = ["constructor", "dtype", "function", "engine"]
+
+ def setup(self, constructor, dtype, function, engine):
+ N = 10 ** 3
+ arr = (100 * np.random.random(N)).astype(dtype)
+ self.data = getattr(pd, constructor)(arr)
+
+ def time_rolling_apply(self, constructor, dtype, function, engine):
+ self.data.rolling(10).apply(function, raw=True, engine=engine)
+
+ def time_expanding_apply(self, constructor, dtype, function, engine):
+ self.data.expanding().apply(function, raw=True, engine=engine)
+
+
class ExpandingMethods:
params = (
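For readers unfamiliar with the ``engine`` keyword these benchmarks time, a minimal sketch of the Cython/Numba switch (``raw=True`` is required for the numba engine, and numba must be installed)::

    import numpy as np
    import pandas as pd

    s = pd.Series(100 * np.random.random(1_000))

    # "cython" is the default engine; "numba" JIT-compiles the passed function
    s.rolling(10).apply(np.sum, raw=True, engine="cython")
    s.rolling(10).apply(lambda x: np.sum(x) + 5, raw=True, engine="numba")
    s.expanding().apply(lambda x: np.sum(x) + 5, raw=True, engine="numba")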
diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
index 55e8e839f4fae..c9a2e4eefd19d 100644
--- a/ci/azure/posix.yml
+++ b/ci/azure/posix.yml
@@ -18,7 +18,7 @@ jobs:
py36_minimum_versions:
ENV_FILE: ci/deps/azure-36-minimum_versions.yaml
CONDA_PY: "36"
- PATTERN: "not slow and not network"
+ PATTERN: "not slow and not network and not clipboard"
py36_locale_slow_old_np:
ENV_FILE: ci/deps/azure-36-locale_slow.yaml
@@ -36,12 +36,12 @@ jobs:
PATTERN: "not slow and not network"
LANG: "it_IT.utf8"
LC_ALL: "it_IT.utf8"
- EXTRA_APT: "language-pack-it"
+ EXTRA_APT: "language-pack-it xsel"
py36_32bit:
ENV_FILE: ci/deps/azure-36-32bit.yaml
CONDA_PY: "36"
- PATTERN: "not slow and not network"
+ PATTERN: "not slow and not network and not clipboard"
BITS32: "yes"
py37_locale:
@@ -50,7 +50,7 @@ jobs:
PATTERN: "not slow and not network"
LANG: "zh_CN.utf8"
LC_ALL: "zh_CN.utf8"
- EXTRA_APT: "language-pack-zh-hans"
+ EXTRA_APT: "language-pack-zh-hans xsel"
py37_np_dev:
ENV_FILE: ci/deps/azure-37-numpydev.yaml
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 83ceb11dfcbf4..0cc42be42d61e 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -314,8 +314,8 @@ fi
### DOCSTRINGS ###
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
- MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA01, SA02, SA03, SA05)' ; echo $MSG
- $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA01,SA02,SA03,SA05
+ MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03, SA05)' ; echo $MSG
+ $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03,SA05
RET=$(($RET + $?)) ; echo $MSG "DONE"
fi
diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml
index 111ba6b020bc7..dc51597a33209 100644
--- a/ci/deps/azure-37-locale.yaml
+++ b/ci/deps/azure-37-locale.yaml
@@ -34,3 +34,6 @@ dependencies:
- xlsxwriter
- xlwt
- pyarrow>=0.15
+ - pip
+ - pip:
+ - pyxlsb
diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml
index 3bbbdb4cf32ad..90980133b31c1 100644
--- a/ci/deps/azure-macos-36.yaml
+++ b/ci/deps/azure-macos-36.yaml
@@ -33,3 +33,4 @@ dependencies:
- pip
- pip:
- pyreadstat
+ - pyxlsb
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index 62be1075b3337..6b3ad6f560292 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -35,3 +35,6 @@ dependencies:
- xlsxwriter
- xlwt
- pyreadstat
+ - pip
+ - pip:
+ - pyxlsb
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
index a46001c58d165..869d2ab683f0c 100644
--- a/ci/deps/travis-36-cov.yaml
+++ b/ci/deps/travis-36-cov.yaml
@@ -51,3 +51,4 @@ dependencies:
- coverage
- pandas-datareader
- python-dateutil
+ - pyxlsb
diff --git a/ci/print_skipped.py b/ci/print_skipped.py
index 72822fa2d3c7f..60e2f047235e6 100755
--- a/ci/print_skipped.py
+++ b/ci/print_skipped.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
import os
import xml.etree.ElementTree as et
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index 8020680d617d7..0cb1f4aabf352 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -14,14 +14,14 @@ if [ "$COVERAGE" ]; then
COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME"
fi
-PYTEST_CMD="pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
-
-# Travis does not have have an X server
-if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
- DISPLAY=DISPLAY=:99.0
- PYTEST_CMD="xvfb-run -e /dev/stdout $PYTEST_CMD"
+# If no X server is found, we use xvfb to emulate it
+if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then
+ export DISPLAY=":0"
+ XVFB="xvfb-run "
fi
+PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
+
echo $PYTEST_CMD
sh -c "$PYTEST_CMD"
diff --git a/ci/setup_env.sh b/ci/setup_env.sh
index db28eaea8956e..e5bee09fe2f79 100755
--- a/ci/setup_env.sh
+++ b/ci/setup_env.sh
@@ -114,6 +114,11 @@ echo "remove postgres if has been installed with conda"
echo "we use the one from the CI"
conda remove postgresql -y --force || true
+echo
+echo "remove qt"
+echo "causes problems with the clipboard, we use xsel for that"
+conda remove qt -y --force || true
+
echo
echo "conda list pandas"
conda list pandas
diff --git a/doc/make.py b/doc/make.py
index cf73f44b5dd02..024a748cd28ca 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
"""
Python script for building documentation.
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 481c03ab8f388..7f24d02a496e1 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -10,6 +10,7 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
+from datetime import datetime
import importlib
import inspect
import logging
@@ -137,7 +138,7 @@
# General information about the project.
project = "pandas"
-copyright = "2008-2014, the pandas development team"
+copyright = f"2008-{datetime.now().year}, the pandas development team"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst
index 2fc2f1fb6ee8d..a295038b5a0bd 100644
--- a/doc/source/development/code_style.rst
+++ b/doc/source/development/code_style.rst
@@ -127,3 +127,29 @@ For example:
value = str
f"Unknown recived type, got: '{type(value).__name__}'"
+
+
+Imports (aim for absolute)
+==========================
+
+In Python 3, absolute imports are recommended. With an absolute import, a
+statement like ``import string`` imports the standard library ``string`` module
+rather than a local ``string.py`` in the same directory. As much as possible,
+you should write absolute imports that show the whole import chain from the
+top-level pandas package.
+
+Explicit relative imports are also supported in Python 3, but using them is not
+recommended. Implicit relative imports should never be used; they were removed
+in Python 3.
+
+For example:
+
+::
+
+ # preferred
+ import pandas.core.common as com
+
+ # not preferred
+ from .common import test_base
+
+ # wrong
+ from common import test_base
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 93c65ba7358c9..b650b2a2cf1fe 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -146,6 +146,17 @@ requires a C compiler and Python environment. If you're making documentation
changes, you can skip to :ref:`contributing.documentation` but you won't be able
to build the documentation locally before pushing your changes.
+Using a Docker Container
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Instead of manually setting up a development environment, you can use Docker to
+create the environment automatically with just a few commands. Pandas provides a `Dockerfile`
+in the root directory to build a Docker image with a full pandas development environment.
+
+Even easier, you can use the Dockerfile to launch a remote session with Visual Studio Code,
+a popular free IDE, using the `.devcontainer.json` file.
+See https://code.visualstudio.com/docs/remote/containers for details.
+
.. _contributing.dev_c:
Installing a C compiler
@@ -354,9 +365,9 @@ About the *pandas* documentation
--------------------------------
The documentation is written in **reStructuredText**, which is almost like writing
-in plain English, and built using `Sphinx `__. The
+in plain English, and built using `Sphinx `__. The
Sphinx Documentation has an excellent `introduction to reST
-`__. Review the Sphinx docs to perform more
+`__. Review the Sphinx docs to perform more
complex changes to the documentation as well.
Some other important things to know about the docs:
@@ -635,6 +646,8 @@ many errors as possible, but it may not correct *all* of them. Thus, it is
recommended that you run ``cpplint`` to double check and make any other style
fixes manually.
+.. _contributing.code-formatting:
+
Python (PEP8 / black)
~~~~~~~~~~~~~~~~~~~~~
@@ -656,19 +669,8 @@ apply ``black`` as you edit files.
You should use a ``black`` version >= 19.10b0 as previous versions are not compatible
with the pandas codebase.
-Optionally, you may wish to setup `pre-commit hooks `_
-to automatically run ``black`` and ``flake8`` when you make a git commit. This
-can be done by installing ``pre-commit``::
-
- pip install pre-commit
-
-and then running::
-
- pre-commit install
-
-from the root of the pandas repository. Now ``black`` and ``flake8`` will be run
-each time you commit changes. You can skip these checks with
-``git commit --no-verify``.
+If you wish to run these checks automatically, we encourage you to use
+:ref:`pre-commits ` instead.
One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``: this
command will catch any stylistic errors in your changes specifically, but
@@ -676,7 +678,7 @@ be beware it may not catch all of them. For example, if you delete the only
usage of an imported function, it is stylistically incorrect to import an
unused function. However, style-checking the diff will not catch this because
the actual import is not part of the diff. Thus, for completeness, you should
-run this command, though it will take longer::
+run this command, though it may take longer::
git diff upstream/master --name-only -- "*.py" | xargs -r flake8
@@ -694,6 +696,8 @@ behaviour as follows::
This will get all the files being changed by the PR (and ending with ``.py``),
and run ``flake8`` on them, one after the other.
+Note that these commands can be run analogously with ``black``.
+
.. _contributing.import-formatting:
Import formatting
@@ -716,7 +720,6 @@ A summary of our current import sections ( in order ):
Imports are alphabetically sorted within these sections.
-
As part of :ref:`Continuous Integration ` checks we run::
isort --recursive --check-only pandas
@@ -740,8 +743,37 @@ to automatically format imports correctly. This will modify your local copy of t
The `--recursive` flag can be passed to sort all files in a directory.
+Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `::
+
+ git diff upstream/master --name-only -- "*.py" | xargs -r isort
+
+The same caveats apply if you are on OSX or Windows.
+
You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `.
+.. _contributing.pre-commit:
+
+Pre-Commit
+~~~~~~~~~~
+
+You can run many of these styling checks manually as we have described above. However,
+we encourage you to use `pre-commit hooks `_ instead
+to automatically run ``black``, ``flake8`` and ``isort`` when you make a git commit. This
+can be done by installing ``pre-commit``::
+
+ pip install pre-commit
+
+and then running::
+
+ pre-commit install
+
+from the root of the pandas repository. Now all of the styling checks will be
+run each time you commit changes without your needing to run each one manually.
+In addition, using this pre-commit hook will also allow you to more easily
+remain up-to-date with our code checks as they change.
+
+Note that if needed, you can skip these checks with ``git commit --no-verify``.
+
Backwards compatibility
~~~~~~~~~~~~~~~~~~~~~~~
@@ -1504,3 +1536,19 @@ The branch will still exist on GitHub, so to delete it there do::
git push origin --delete shiny-new-feature
.. _Gitter: https://gitter.im/pydata/pandas
+
+
+Tips for a successful Pull Request
+==================================
+
+If you have made it to the `Review your code`_ phase, one of the core contributors may
+take a look. Please note however that a handful of people are responsible for reviewing
+all of the contributions, which can often lead to bottlenecks.
+
+To improve the chances of your pull request being reviewed, you should:
+
+- **Reference an open issue** for non-trivial changes to clarify the PR's purpose
+- **Ensure you have appropriate tests**. These should be the first part of any PR
+- **Keep your pull requests as simple as possible**. Larger PRs take longer to review
+- **Ensure that CI is in a green state**. Reviewers may not even look otherwise
+- **Keep** `Updating your pull request`_, either by request or every few days
diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst
index 00598830e2fe9..fafe63d80249c 100644
--- a/doc/source/development/roadmap.rst
+++ b/doc/source/development/roadmap.rst
@@ -129,20 +129,6 @@ Some specific goals include
* Improve the overall organization of the documentation and specific subsections
of the documentation to make navigation and finding content easier.
-Package docstring validation
-----------------------------
-
-To improve the quality and consistency of pandas docstrings, we've developed
-tooling to check docstrings in a variety of ways.
-https://github.com/pandas-dev/pandas/blob/master/scripts/validate_docstrings.py
-contains the checks.
-
-Like many other projects, pandas uses the
-`numpydoc `__ style for writing
-docstrings. With the collaboration of the numpydoc maintainers, we'd like to
-move the checks to a package other than pandas so that other projects can easily
-use them as well.
-
Performance monitoring
----------------------
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index 7bd5ba7ecdf0b..90f839897ce4b 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -41,6 +41,16 @@ Pyjanitor provides a clean API for cleaning data, using method chaining.
Engarde is a lightweight library used to explicitly state assumptions about your datasets
and check that they're *actually* true.
+`pandas-path `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since Python 3.4, `pathlib `_ has been
+included in the Python standard library. Path objects provide a simple
+and delightful way to interact with the file system. The pandas-path package enables the
+Path API for pandas through a custom accessor ``.path``. Getting just the filenames from
+a series of full file paths is as simple as ``my_files.path.name``. Other convenient operations like
+joining paths, replacing file extensions, and checking if files exist are also available.
+
.. _ecosystem.stats:
Statistics and machine learning
@@ -112,16 +122,14 @@ also goes beyond matplotlib and pandas with the option to perform statistical
estimation while plotting, aggregating across observations and visualizing the
fit of statistical models to emphasize patterns in a dataset.
-`yhat/ggpy `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+`plotnine `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language.
Based on `"The Grammar of Graphics" `__ it
provides a powerful, declarative and extremely general way to generate bespoke plots of any kind of data.
-It's really quite incredible. Various implementations to other languages are available,
-but a faithful implementation for Python users has long been missing. Although still young
-(as of Jan-2014), the `yhat/ggpy `__ project has been
-progressing quickly in that direction.
+Various implementations in other languages are available.
+A good implementation for Python users is `has2k1/plotnine `__.
`IPython Vega `__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -386,12 +394,16 @@ A directory of projects providing
:ref:`extension accessors `. This is for users to
discover new accessors and for library authors to coordinate on the namespace.
-============== ========== =========================
-Library Accessor Classes
-============== ========== =========================
-`cyberpandas`_ ``ip`` ``Series``
-`pdvega`_ ``vgplot`` ``Series``, ``DataFrame``
-============== ========== =========================
+=============== ========== ========================= ===============================================================
+Library Accessor Classes Description
+=============== ========== ========================= ===============================================================
+`cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses.
+`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library.
+`pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series.
+=============== ========== ========================= ===============================================================
.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
.. _pdvega: https://altair-viz.github.io/pdvega/
+.. _Altair: https://altair-viz.github.io/
+.. _pandas_path: https://github.com/drivendataorg/pandas-path/
+.. _pathlib.Path: https://docs.python.org/3/library/pathlib.html
\ No newline at end of file
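As a rough illustration of the accessor described above (the registering import is an assumption here; check the pandas-path README for the exact form)::

    import pandas as pd
    from pandas_path import path  # noqa: F401  (assumed to register the .path accessor)

    files = pd.Series(["data/a.csv", "data/b.csv"])
    files.path.name  # just the filenames: "a.csv", "b.csv"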
diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst
index 8bd271815549d..81a2f0ae7d162 100644
--- a/doc/source/getting_started/dsintro.rst
+++ b/doc/source/getting_started/dsintro.rst
@@ -136,7 +136,7 @@ Like a NumPy array, a pandas Series has a :attr:`~Series.dtype`.
This is often a NumPy dtype. However, pandas and 3rd-party libraries
extend NumPy's type system in a few places, in which case the dtype would
-be a :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within
+be an :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within
pandas are :ref:`categorical` and :ref:`integer_na`. See :ref:`basics.dtypes`
for more.
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index b3fd443e662a9..8f5900a2a1ba6 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -264,6 +264,7 @@ pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and
pymysql 0.7.11 MySQL engine for sqlalchemy
pyreadstat SPSS files (.sav) reading
pytables 3.4.2 HDF5 reading / writing
+pyxlsb 1.0.6 Reading for xlsb files
qtpy Clipboard I/O
s3fs 0.3.0 Amazon S3 access
tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_)
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index 10705787dfedf..4ced92cbda81a 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library.
:hidden:
{% endif %}
{% if not single_doc %}
- What's New in 1.0.0
+ What's New in 1.1.0
getting_started/index
user_guide/index
{% endif -%}
@@ -51,7 +51,7 @@ See the :ref:`overview` for more detail about what's in the library.
whatsnew/index
{% endif %}
-* :doc:`whatsnew/v1.0.0`
+* :doc:`whatsnew/v1.1.0`
* :doc:`getting_started/index`
* :doc:`getting_started/install`
diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst
index a2150c207c0b0..aeb32db639ffb 100644
--- a/doc/source/user_guide/computation.rst
+++ b/doc/source/user_guide/computation.rst
@@ -348,6 +348,7 @@ Numba will be applied in potentially two routines:
1. If ``func`` is a standard Python function, the engine will `JIT `__
the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
+
2. The engine will JIT the for loop where the apply function is applied to each window.
The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 55bbf6848820b..d0780e4ab8dba 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -23,7 +23,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
text;`JSON `__;:ref:`read_json`;:ref:`to_json`
text;`HTML `__;:ref:`read_html`;:ref:`to_html`
text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard`
- binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel`
+ ;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel`
binary;`OpenDocument `__;:ref:`read_excel`;
binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf`
binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather`
@@ -2768,7 +2768,8 @@ Excel files
The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``)
files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files
-can be read using either ``xlrd`` or ``openpyxl``.
+can be read using either ``xlrd`` or ``openpyxl``. Binary Excel (``.xlsb``)
+files can be read using ``pyxlsb``.
The :meth:`~DataFrame.to_excel` instance method is used for
saving a ``DataFrame`` to Excel. Generally the semantics are
similar to working with :ref:`csv` data.
@@ -3229,6 +3230,30 @@ OpenDocument spreadsheets match what can be done for `Excel files`_ using
Currently pandas only supports *reading* OpenDocument spreadsheets. Writing
is not implemented.
+.. _io.xlsb:
+
+Binary Excel (.xlsb) files
+--------------------------
+
+.. versionadded:: 1.0.0
+
+The :func:`~pandas.read_excel` method can also read binary Excel files
+using the ``pyxlsb`` module by passing ``engine='pyxlsb'``. The semantics
+and features for reading binary Excel files mostly match what can be done
+for `Excel files`_. Note that ``pyxlsb`` does not recognize datetime types
+in files and will return floats instead.
+
+.. code-block:: python
+
+ # Returns a DataFrame
+ pd.read_excel('path_to_file.xlsb', engine='pyxlsb')
+
+.. note::
+
+ Currently pandas only supports *reading* binary Excel files. Writing
+ is not implemented.
+
+
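Because ``pyxlsb`` returns datetimes as floats, a follow-up conversion is usually needed; a sketch, assuming a hypothetical column ``when`` holding Excel serial dates::

    import pandas as pd

    df = pd.read_excel("path_to_file.xlsb", engine="pyxlsb")
    # Excel serial dates count days from the 1899-12-30 origin
    df["when"] = pd.to_datetime(df["when"], unit="D", origin="1899-12-30")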
.. _io.clipboard:
Clipboard
@@ -4220,46 +4245,49 @@ Compression
all kinds of stores, not just tables. Two parameters are used to
control compression: ``complevel`` and ``complib``.
-``complevel`` specifies if and how hard data is to be compressed.
- ``complevel=0`` and ``complevel=None`` disables
- compression and ``0<complevel<10`` enables compression.
-
-``complib`` specifies which compression library to use. If nothing is
- specified the default library ``zlib`` is used. The list of supported
- compression libraries:
-
- - `zlib `_: The default compression library. A classic in terms of compression, achieves good compression rates but is somewhat slow.
- - `lzo `_: Fast compression and decompression.
- - `bzip2 `_: Good compression rates.
- - `blosc `_: Fast compression and decompression.
-
- Support for alternative blosc compressors:
-
- - `blosc:blosclz `_ This is the
- default compressor for ``blosc``
- - `blosc:lz4
- `_:
- A compact, very popular and fast compressor.
- - `blosc:lz4hc
- `_:
- A tweaked version of LZ4, produces better
- compression ratios at the expense of speed.
- - `blosc:snappy `_:
- A popular compressor used in many places.
- - `blosc:zlib `_: A classic;
- somewhat slower than the previous ones, but
- achieving better compression ratios.
- - `blosc:zstd `_: An
- extremely well balanced codec; it provides the best
- compression ratios among the others above, and at
- reasonably fast speed.
-
- If ``complib`` is defined as something other than the
- listed libraries a ``ValueError`` exception is issued.
+* ``complevel`` specifies if and how hard data is to be compressed.
+ ``complevel=0`` and ``complevel=None`` disables compression and
+ ``0<complevel<10`` enables compression.
+
+* ``complib`` specifies which compression library to use. If nothing is
+ specified the default library ``zlib`` is used. The list of supported
+ compression libraries:
+
+ - `zlib `_: The default compression library.
+ A classic in terms of compression, achieves good compression
+ rates but is somewhat slow.
+ - `lzo `_: Fast
+ compression and decompression.
+ - `bzip2 `_: Good compression rates.
+ - `blosc `_: Fast compression and
+ decompression.
+
+ Support for alternative blosc compressors:
+
+ - `blosc:blosclz `_ This is the
+ default compressor for ``blosc``
+ - `blosc:lz4
+ `_:
+ A compact, very popular and fast compressor.
+ - `blosc:lz4hc
+ `_:
+ A tweaked version of LZ4, produces better
+ compression ratios at the expense of speed.
+ - `blosc:snappy `_:
+ A popular compressor used in many places.
+ - `blosc:zlib `_: A classic;
+ somewhat slower than the previous ones, but
+ achieving better compression ratios.
+ - `blosc:zstd `_: An
+ extremely well balanced codec; it provides the best
+ compression ratios among the others above, and at
+ reasonably fast speed.
+
+ If ``complib`` is defined as something other than the listed libraries a
+ ``ValueError`` exception is issued.
.. note::
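A compact illustration of the two knobs described above, assuming PyTables (with its blosc codecs) is installed::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(1000, 3), columns=list("abc"))
    # complevel sets how hard to compress (0-9); complib picks the library
    df.to_hdf("compressed.h5", key="df", complevel=9, complib="blosc:zstd")
    pd.read_hdf("compressed.h5", "df")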
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index abbb6feef6056..0f55980b3d015 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -825,14 +825,10 @@ For example, ``pd.NA`` propagates in arithmetic operations, similarly to
There are a few special cases when the result is known, even when one of the
operands is ``NA``.
+.. ipython:: python
-================ ======
-Operation Result
-================ ======
-``pd.NA ** 0`` 0
-``1 ** pd.NA`` 1
-``-1 ** pd.NA`` -1
-================ ======
+ pd.NA ** 0
+ 1 ** pd.NA
In equality and comparison operations, ``pd.NA`` also propagates. This deviates
from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always
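Outside a doc build, the same special cases can be checked directly; the exponent identities hold even when one operand is ``pd.NA``, while other arithmetic still propagates it::

    import pandas as pd

    pd.NA ** 0   # 1: anything to the power zero is one
    1 ** pd.NA   # 1: one to any power is one
    pd.NA + 1    # <NA>: ordinary arithmetic still propagates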
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index 08b2ae0a4a837..3fdab0fd26643 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -1951,6 +1951,10 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th
PeriodIndex partial string indexing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+PeriodIndex now supports partial string slicing with non-monotonic indexes.
+
+.. versionadded:: 1.1.0
+
You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodIndex``, in the same manner as ``DatetimeIndex``. For details, refer to :ref:`DatetimeIndex Partial String Indexing `.
.. ipython:: python
@@ -1981,6 +1985,7 @@ As with ``DatetimeIndex``, the endpoints will be included in the result. The exa
dfp['2013-01-01 10H':'2013-01-01 11H']
+
Frequency conversion and resampling with PeriodIndex
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The frequency of ``Period`` and ``PeriodIndex`` can be converted via the ``asfreq``
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 05c7f72882088..bc463d0ab22d8 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details,
see the commit logs at http://github.com/pandas-dev/pandas. For install and
upgrade instructions, see :ref:`install`.
+Version 1.1
+-----------
+
+.. toctree::
+ :maxdepth: 2
+
+ v1.1.0
+
Version 1.0
-----------
diff --git a/doc/source/whatsnew/v0.25.3.rst b/doc/source/whatsnew/v0.25.3.rst
index f73a3f956f42e..f7f54198a0f82 100644
--- a/doc/source/whatsnew/v0.25.3.rst
+++ b/doc/source/whatsnew/v0.25.3.rst
@@ -19,4 +19,4 @@ Groupby/resample/rolling
Contributors
~~~~~~~~~~~~
-.. contributors:: v0.25.2..HEAD
+.. contributors:: v0.25.2..v0.25.3
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 5f79accc5c679..b06ed684cd525 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -159,14 +159,14 @@ You can use the alias ``"boolean"`` as well.
.. _whatsnew_100.numba_rolling_apply:
-Using Numba in ``rolling.apply``
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Using Numba in ``rolling.apply`` and ``expanding.apply``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` that allows the user to execute the
-routine using `Numba `__ instead of Cython. Using the Numba engine
-can yield significant performance gains if the apply function can operate on numpy arrays and
+We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` and :meth:`~core.window.expanding.Expanding.apply`
+that allows the user to execute the routine using `Numba `__ instead of Cython.
+Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and
the data set is larger (1 million rows or greater). For more details, see
-:ref:`rolling apply documentation ` (:issue:`28987`)
+:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`)
.. _whatsnew_100.custom_window:
@@ -215,18 +215,17 @@ Other enhancements
- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)
- Roundtripping DataFrames with nullable integer, string and period data types to parquet
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
- now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`).
+ now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
+- :func:`read_excel` can now read binary Excel (``.xlsb``) files by passing ``engine='pyxlsb'``. For more details and example usage, see the :ref:`Binary Excel files documentation `. Closes :issue:`8540`.
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
- :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`)
-- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`)
- DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`)
- :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`)
- :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`)
- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`)
-- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`)
+- Added new writer for exporting Stata dta files in versions 118 and 119, ``StataWriterUTF8``. These files formats support exporting strings containing Unicode characters. Format 119 supports data sets with more than 32,767 variables (:issue:`23573`, :issue:`30959`)
- :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`)
-- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`)
- Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`)
- :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`)
- :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`)
@@ -485,6 +484,25 @@ Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead.
a.to_numpy(dtype="float", na_value=np.nan)
+**Reductions can return ``pd.NA``**
+
+When performing a reduction such as a sum with ``skipna=False``, the result
+will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values
+(:issue:`30958`).
+
+*pandas 0.25.x*
+
+.. code-block:: python
+
+ >>> pd.Series(a).sum(skipna=False)
+ nan
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+ pd.Series(a).sum(skipna=False)
+
**value_counts returns a nullable integer dtype**
:meth:`Series.value_counts` with a nullable integer dtype now returns a nullable
@@ -707,6 +725,8 @@ Deprecations
- ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`)
- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
+- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
+- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`)
**Selecting Columns from a Grouped DataFrame**
@@ -1112,6 +1132,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`)
- Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`)
- Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`)
+- Bug in :meth:`Rolling.count` and :meth:`Expanding.count` argument where ``min_periods`` was ignored (:issue:`26996`)
Reshaping
^^^^^^^^^
@@ -1177,3 +1198,5 @@ Other
Contributors
~~~~~~~~~~~~
+
+.. contributors:: v0.25.3..v1.0.0rc0
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
new file mode 100644
index 0000000000000..c8e811ce82b1f
--- /dev/null
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -0,0 +1,208 @@
+.. _whatsnew_110:
+
+What's new in 1.1.0 (??)
+------------------------
+
+These are the changes in pandas 1.1.0. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_110.period_index_partial_string_slicing:
+
+Nonmonotonic PeriodIndex Partial String Slicing
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`)
+
+For example:
+
+.. ipython:: python
+
+ dti = pd.date_range("2014-01-01", periods=30, freq="30D")
+ pi = dti.to_period("D")
+ ser_monotonic = pd.Series(np.arange(30), index=pi)
+ shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2))
+ ser = ser_monotonic[shuffler]
+ ser
+
+.. ipython:: python
+
+ ser["2014"]
+ ser.loc["May 2015"]
+
+.. _whatsnew_110.enhancements.other:
+
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+
+- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_110.api.other:
+
+Other API changes
+^^^^^^^^^^^^^^^^^
+
+- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes; the statistics ``first`` and ``last``
+ are now reported as ``min`` and ``max``, matching numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_110.deprecations:
+
+Deprecations
+~~~~~~~~~~~~
+
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+
+.. _whatsnew_110.performance:
+
+Performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_110.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+
+Categorical
+^^^^^^^^^^^
+
+-
+-
+
+Datetimelike
+^^^^^^^^^^^^
+- Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`)
+- :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`)
+-
+
+Timedelta
+^^^^^^^^^
+
+-
+-
+
+Timezones
+^^^^^^^^^
+
+-
+-
+
+
+Numeric
+^^^^^^^
+-
+-
+
+Conversion
+^^^^^^^^^^
+- Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`)
+-
+-
+
+Strings
+^^^^^^^
+
+-
+-
+
+
+Interval
+^^^^^^^^
+
+-
+-
+
+Indexing
+^^^^^^^^
+- Bug in slicing on a :class:`DatetimeIndex` with a partial-timestamp dropping high-resolution indices near the end of a year, quarter, or month (:issue:`31064`)
+-
+-
+
+Missing
+^^^^^^^
+
+-
+-
+
+MultiIndex
+^^^^^^^^^^
+
+-
+-
+
+I/O
+^^^
+
+-
+-
+
+Plotting
+^^^^^^^^
+
+-
+-
+
+Groupby/resample/rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
+
+Reshaping
+^^^^^^^^^
+
+-
+- Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns are set (:issue:`17038`)
+- Bug in :meth:`DataFrame.unstack` and :meth:`Series.unstack` where tuple names could not be used with MultiIndexed data (:issue:`19966`)
+- Bug in :meth:`DataFrame.pivot_table` when ``margins`` is ``True`` and only ``columns`` is defined (:issue:`31016`)
+- Fixed incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None`` (:issue:`30924`)
+- Bug in :func:`crosstab` when inputs are two Series with tuple names, where the output kept a dummy MultiIndex as columns (:issue:`18321`)
+- Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`)
+
+Sparse
+^^^^^^
+
+-
+-
+
+ExtensionArray
+^^^^^^^^^^^^^^
+
+-
+-
+
+
+Other
+^^^^^
+- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True``
+ instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_110.contributors:
+
+Contributors
+~~~~~~~~~~~~
diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py
index fdc5a6b283ba8..f394aac5c545b 100755
--- a/doc/sphinxext/announce.py
+++ b/doc/sphinxext/announce.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- encoding:utf-8 -*-
"""
Script to generate contributor and pull request lists
diff --git a/environment.yml b/environment.yml
index e244350a0bea0..5f1184e921119 100644
--- a/environment.yml
+++ b/environment.yml
@@ -27,7 +27,6 @@ dependencies:
# documentation
- gitpython # obtain contributors from git for whatsnew
- sphinx
- - numpydoc>=0.9.0
# documentation (jupyter notebooks)
- nbconvert>=5.4.1
@@ -105,3 +104,4 @@ dependencies:
- tabulate>=0.8.3 # DataFrame.to_markdown
- pip:
- git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
+ - git+https://github.com/numpy/numpydoc
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 491bcb21f245d..d526531b159b2 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -35,8 +35,7 @@
raise ImportError(
f"C extension: {module} not built. If you want to import "
"pandas from the source directory, you may need to run "
- "'python setup.py build_ext --inplace --force' to build "
- "the C extensions first."
+ "'python setup.py build_ext --inplace --force' to build the C extensions first."
)
from pandas._config import (
@@ -198,8 +197,7 @@ def __getattr__(name):
warnings.warn(
"The Panel class is removed from pandas. Accessing it "
- "from the top-level namespace will also be removed in "
- "the next version",
+ "from the top-level namespace will also be removed in the next version",
FutureWarning,
stacklevel=2,
)
@@ -238,8 +236,7 @@ class Panel:
elif name in {"SparseSeries", "SparseDataFrame"}:
warnings.warn(
f"The {name} class is removed from pandas. Accessing it from "
- "the top-level namespace will also be removed in the next "
- "version",
+ "the top-level namespace will also be removed in the next version",
FutureWarning,
stacklevel=2,
)
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 0a3009f74492f..cacd6f5454de7 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -51,7 +51,18 @@
from collections import namedtuple
from contextlib import contextmanager
import re
-from typing import Any, Dict, Iterable, List
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ Type,
+ TypeVar,
+ cast,
+)
import warnings
DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver")
@@ -80,7 +91,7 @@ class OptionError(AttributeError, KeyError):
# User API
-def _get_single_key(pat, silent):
+def _get_single_key(pat: str, silent: bool) -> str:
keys = _select_options(pat)
if len(keys) == 0:
if not silent:
@@ -98,7 +109,7 @@ def _get_single_key(pat, silent):
return key
-def _get_option(pat, silent=False):
+def _get_option(pat: str, silent: bool = False):
key = _get_single_key(pat, silent)
# walk the nested dict
@@ -106,7 +117,7 @@ def _get_option(pat, silent=False):
return root[k]
-def _set_option(*args, **kwargs):
+def _set_option(*args, **kwargs) -> None:
# must at least 1 arg deal with constraints later
nargs = len(args)
if not nargs or nargs % 2 != 0:
@@ -138,7 +149,7 @@ def _set_option(*args, **kwargs):
o.cb(key)
-def _describe_option(pat="", _print_desc=True):
+def _describe_option(pat: str = "", _print_desc: bool = True):
keys = _select_options(pat)
if len(keys) == 0:
@@ -154,7 +165,7 @@ def _describe_option(pat="", _print_desc=True):
return s
-def _reset_option(pat, silent=False):
+def _reset_option(pat: str, silent: bool = False) -> None:
keys = _select_options(pat)
@@ -165,15 +176,14 @@ def _reset_option(pat, silent=False):
raise ValueError(
"You must specify at least 4 characters when "
"resetting multiple keys, use the special keyword "
- '"all" to reset all the options to their default '
- "value"
+ '"all" to reset all the options to their default value'
)
for k in keys:
_set_option(k, _registered_options[k].defval, silent=silent)
-def get_default_val(pat):
+def get_default_val(pat: str):
key = _get_single_key(pat, silent=True)
return _get_registered_option(key).defval
@@ -181,11 +191,11 @@ def get_default_val(pat):
class DictWrapper:
""" provide attribute-style access to a nested dict"""
- def __init__(self, d, prefix=""):
+ def __init__(self, d: Dict[str, Any], prefix: str = ""):
object.__setattr__(self, "d", d)
object.__setattr__(self, "prefix", prefix)
- def __setattr__(self, key, val):
+ def __setattr__(self, key: str, val: Any) -> None:
prefix = object.__getattribute__(self, "prefix")
if prefix:
prefix += "."
@@ -211,7 +221,7 @@ def __getattr__(self, key: str):
else:
return _get_option(prefix)
- def __dir__(self):
+ def __dir__(self) -> Iterable[str]:
return list(self.d.keys())
@@ -412,23 +422,31 @@ def __exit__(self, *args):
_set_option(pat, val, silent=True)
-def register_option(key: str, defval: object, doc="", validator=None, cb=None):
- """Register an option in the package-wide pandas config object
+def register_option(
+ key: str,
+ defval: object,
+ doc: str = "",
+ validator: Optional[Callable[[Any], Any]] = None,
+ cb: Optional[Callable[[str], Any]] = None,
+) -> None:
+ """
+ Register an option in the package-wide pandas config object
Parameters
----------
- key - a fully-qualified key, e.g. "x.y.option - z".
- defval - the default value of the option
- doc - a string description of the option
- validator - a function of a single argument, should raise `ValueError` if
- called with a value which is not a legal value for the option.
- cb - a function of a single argument "key", which is called
- immediately after an option value is set/reset. key is
- the full name of the option.
-
- Returns
- -------
- Nothing.
+ key : str
+ Fully-qualified key, e.g. "x.y.option - z".
+ defval : object
+ Default value of the option.
+ doc : str
+ Description of the option.
+ validator : Callable, optional
+ Function of a single argument, should raise `ValueError` if
+ called with a value which is not a legal value for the option.
+ cb : Callable, optional
+ Function of a single argument "key", which is called
+ immediately after an option value is set/reset. `key` is
+ the full name of the option.
Raises
------
@@ -481,7 +499,9 @@ def register_option(key: str, defval: object, doc="", validator=None, cb=None):
)
-def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
+def deprecate_option(
+ key: str, msg: Optional[str] = None, rkey: Optional[str] = None, removal_ver=None
+) -> None:
"""
Mark option `key` as deprecated, if code attempts to access this option,
a warning will be produced, using `msg` if given, or a default message
@@ -494,32 +514,27 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
Parameters
----------
- key - the name of the option to be deprecated. must be a fully-qualified
- option name (e.g "x.y.z.rkey").
-
- msg - (Optional) a warning message to output when the key is referenced.
- if no message is given a default message will be emitted.
-
- rkey - (Optional) the name of an option to reroute access to.
- If specified, any referenced `key` will be re-routed to `rkey`
- including set/get/reset.
- rkey must be a fully-qualified option name (e.g "x.y.z.rkey").
- used by the default message if no `msg` is specified.
-
- removal_ver - (Optional) specifies the version in which this option will
- be removed. used by the default message if no `msg`
- is specified.
-
- Returns
- -------
- Nothing
+ key : str
+ Name of the option to be deprecated.
+ Must be a fully-qualified option name (e.g. "x.y.z.rkey").
+ msg : str, optional
+ Warning message to output when the key is referenced.
+ If no message is given, a default message will be emitted.
+ rkey : str, optional
+ Name of an option to reroute access to.
+ If specified, any referenced `key` will be
+ re-routed to `rkey` including set/get/reset.
+ rkey must be a fully-qualified option name (e.g. "x.y.z.rkey").
+ Used by the default message if no `msg` is specified.
+ removal_ver : optional
+ Specifies the version in which this option will
+ be removed. Used by the default message if no `msg` is specified.
Raises
------
- OptionError - if key has already been deprecated.
-
+ OptionError
+ If the specified key has already been deprecated.
"""
-
key = key.lower()
if key in _deprecated_options:
@@ -532,7 +547,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
# functions internal to the module
-def _select_options(pat):
+def _select_options(pat: str) -> List[str]:
"""returns a list of keys matching `pat`
if pat=="all", returns all registered options
@@ -550,7 +565,7 @@ def _select_options(pat):
return [k for k in keys if re.search(pat, k, re.I)]
-def _get_root(key):
+def _get_root(key: str) -> Tuple[Dict[str, Any], str]:
path = key.split(".")
cursor = _global_config
for p in path[:-1]:
@@ -558,14 +573,14 @@ def _get_root(key):
return cursor, path[-1]
-def _is_deprecated(key):
+def _is_deprecated(key: str) -> bool:
""" Returns True if the given option has been deprecated """
key = key.lower()
return key in _deprecated_options
-def _get_deprecated_option(key):
+def _get_deprecated_option(key: str):
"""
Retrieves the metadata for a deprecated option, if `key` is deprecated.
@@ -582,7 +597,7 @@ def _get_deprecated_option(key):
return d
-def _get_registered_option(key):
+def _get_registered_option(key: str):
"""
Retrieves the option metadata if `key` is a registered option.
@@ -593,7 +608,7 @@ def _get_registered_option(key):
return _registered_options.get(key)
-def _translate_key(key):
+def _translate_key(key: str) -> str:
"""
if key id deprecated and a replacement key defined, will return the
replacement key, otherwise returns `key` as - is
@@ -606,7 +621,7 @@ def _translate_key(key):
return key
-def _warn_if_deprecated(key):
+def _warn_if_deprecated(key: str) -> bool:
"""
Checks if `key` is a deprecated option and if so, prints a warning.
@@ -634,7 +649,7 @@ def _warn_if_deprecated(key):
return False
-def _build_option_description(k):
+def _build_option_description(k: str) -> str:
""" Builds a formatted description of a registered option and prints it """
o = _get_registered_option(k)
@@ -659,7 +674,7 @@ def _build_option_description(k):
return s
-def pp_options_list(keys, width=80, _print=False):
+def pp_options_list(keys: Iterable[str], width=80, _print: bool = False):
""" Builds a concise listing of available options, grouped by prefix """
from textwrap import wrap
@@ -697,6 +712,9 @@ def pp(name: str, ks: Iterable[str]) -> List[str]:
#
# helpers
+FuncType = Callable[..., Any]
+F = TypeVar("F", bound=FuncType)
+
@contextmanager
def config_prefix(prefix):
@@ -728,12 +746,12 @@ def config_prefix(prefix):
global register_option, get_option, set_option, reset_option
- def wrap(func):
- def inner(key, *args, **kwds):
+ def wrap(func: F) -> F:
+ def inner(key: str, *args, **kwds):
pkey = f"{prefix}.{key}"
return func(pkey, *args, **kwds)
- return inner
+ return cast(F, inner)
_register_option = register_option
_get_option = get_option
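The `FuncType`/`F` aliases added above exist so that decorators like `wrap` can be typed as signature-preserving; a small self-contained sketch of the same pattern (the decorator name and behaviour here are illustrative, not pandas code):

from typing import Any, Callable, TypeVar, cast

FuncType = Callable[..., Any]
F = TypeVar("F", bound=FuncType)

def with_prefix(prefix: str) -> Callable[[F], F]:
    # Hypothetical decorator factory: prepend `prefix` to the key argument.
    def wrap(func: F) -> F:
        def inner(key: str, *args, **kwds):
            return func(f"{prefix}.{key}", *args, **kwds)
        # cast() is a no-op at runtime; it only tells the type checker to
        # treat `inner` as having the same type as `func`.
        return cast(F, inner)
    return wrap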
@@ -751,7 +769,7 @@ def inner(key, *args, **kwds):
# arg in register_option
-def is_type_factory(_type):
+def is_type_factory(_type: Type[Any]) -> Callable[[Any], None]:
"""
Parameters
@@ -765,14 +783,14 @@ def is_type_factory(_type):
"""
- def inner(x):
+ def inner(x) -> None:
if type(x) != _type:
raise ValueError(f"Value must have type '{_type}'")
return inner
-def is_instance_factory(_type):
+def is_instance_factory(_type) -> Callable[[Any], None]:
"""
Parameters
@@ -792,19 +810,19 @@ def is_instance_factory(_type):
else:
type_repr = f"'{_type}'"
- def inner(x):
+ def inner(x) -> None:
if not isinstance(x, _type):
raise ValueError(f"Value must be an instance of {type_repr}")
return inner
-def is_one_of_factory(legal_values):
+def is_one_of_factory(legal_values) -> Callable[[Any], None]:
callables = [c for c in legal_values if callable(c)]
legal_values = [c for c in legal_values if not callable(c)]
- def inner(x):
+ def inner(x) -> None:
if x not in legal_values:
if not any(c(x) for c in callables):
@@ -818,7 +836,7 @@ def inner(x):
return inner
-def is_nonnegative_int(value):
+def is_nonnegative_int(value: Optional[int]) -> None:
"""
Verify that value is None or a non-negative int.
@@ -853,7 +871,7 @@ def is_nonnegative_int(value):
is_text = is_instance_factory((str, bytes))
-def is_callable(obj):
+def is_callable(obj) -> bool:
"""
Parameters
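The `is_*_factory` helpers return validators that raise `ValueError` for bad values and return None otherwise; a hedged usage sketch, again assuming the private `pandas._config.config` module is importable as in this tree:

from pandas._config import config as cf

# Legal values may be literals and/or callables; callables act as predicates.
validator = cf.is_one_of_factory(["warn", "raise", None])

validator("warn")      # returns None -> value accepted
try:
    validator("explode")
except ValueError as err:
    print(err)         # explains which values are legal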
diff --git a/pandas/_config/display.py b/pandas/_config/display.py
index 067b7c503baab..ef319f4447565 100644
--- a/pandas/_config/display.py
+++ b/pandas/_config/display.py
@@ -1,6 +1,7 @@
"""
Unopinionated display configuration.
"""
+
import locale
import sys
@@ -11,7 +12,7 @@
_initial_defencoding = None
-def detect_console_encoding():
+def detect_console_encoding() -> str:
"""
Try to find the most capable encoding supported by the console.
Slightly modified from the way IPython handles the same issue.
diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py
index dd1d4948aa6e3..0d68e78372d8a 100644
--- a/pandas/_config/localization.py
+++ b/pandas/_config/localization.py
@@ -12,7 +12,7 @@
@contextmanager
-def set_locale(new_locale, lc_var=locale.LC_ALL):
+def set_locale(new_locale, lc_var: int = locale.LC_ALL):
"""
Context manager for temporarily setting a locale.
@@ -44,7 +44,7 @@ def set_locale(new_locale, lc_var=locale.LC_ALL):
locale.setlocale(lc_var, current_locale)
-def can_set_locale(lc, lc_var=locale.LC_ALL):
+def can_set_locale(lc: str, lc_var: int = locale.LC_ALL) -> bool:
"""
Check to see if we can set a locale, and subsequently get the locale,
without raising an Exception.
@@ -58,7 +58,7 @@ def can_set_locale(lc, lc_var=locale.LC_ALL):
Returns
-------
- is_valid : bool
+ bool
Whether the passed locale can be set
"""
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 7a2fc9dc7845a..dd1f38ce3a842 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -914,8 +914,7 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
ranks[argsorted[j]] = i + 1
elif tiebreak == TIEBREAK_FIRST:
if rank_t is object:
- raise ValueError('first not supported for '
- 'non-numeric data')
+ raise ValueError('first not supported for non-numeric data')
else:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = j + 1
@@ -971,8 +970,7 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
ranks[argsorted[j]] = i + 1
elif tiebreak == TIEBREAK_FIRST:
if rank_t is object:
- raise ValueError('first not supported for '
- 'non-numeric data')
+ raise ValueError('first not supported for non-numeric data')
else:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = j + 1
@@ -1137,8 +1135,7 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average',
ranks[i, argsorted[i, z]] = j + 1
elif tiebreak == TIEBREAK_FIRST:
if rank_t is object:
- raise ValueError('first not supported '
- 'for non-numeric data')
+ raise ValueError('first not supported for non-numeric data')
else:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = z + 1
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index abb8a6d388d26..93ea94f7b18fc 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -686,8 +686,7 @@ def _group_ohlc(floating[:, :] out,
raise ValueError('Output array must have 4 columns')
if K > 1:
- raise NotImplementedError("Argument 'values' must have only "
- "one dimension")
+ raise NotImplementedError("Argument 'values' must have only one dimension")
out[:] = np.nan
with nogil:
diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx
index 5298d8c5ed34e..878da670b2f68 100644
--- a/pandas/_libs/hashing.pyx
+++ b/pandas/_libs/hashing.pyx
@@ -51,8 +51,9 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'):
k = key.encode(encoding)
kb = k
if len(k) != 16:
- raise ValueError("key should be a 16-byte string encoded, "
- f"got {k} (len {len(k)})")
+ raise ValueError(
+ f"key should be a 16-byte string encoded, got {k} (len {len(k)})"
+ )
n = len(arr)
@@ -77,8 +78,10 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'):
hash(val)
data = str(val).encode(encoding)
else:
- raise TypeError(f"{val} of type {type(val)} is not a valid type "
- "for hashing, must be string or null")
+ raise TypeError(
+ f"{val} of type {type(val)} is not a valid type for hashing, "
+ "must be string or null"
+ )
l = len(data)
lens[i] = l
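The 16-byte key checked above is what the public hashing helpers pass down via `hash_key`; a hedged example of the user-facing entry point (the key shown is the library's documented default):

import pandas as pd

s = pd.Series(["a", "b", None])

# hash_key must encode to exactly 16 bytes, otherwise object data raises ValueError.
hashed = pd.util.hash_pandas_object(s, hash_key="0123456789123456")
print(hashed.dtype)  # uint64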
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 59ba1705d2dbb..884db9ee931d4 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -13,26 +13,45 @@ cnp.import_array()
cdef extern from "numpy/npy_math.h":
float64_t NAN "NPY_NAN"
-
from pandas._libs.khash cimport (
khiter_t,
-
- kh_str_t, kh_init_str, kh_put_str, kh_exist_str,
- kh_get_str, kh_destroy_str, kh_resize_str,
-
- kh_put_strbox, kh_get_strbox, kh_init_strbox,
-
- kh_int64_t, kh_init_int64, kh_resize_int64, kh_destroy_int64,
- kh_get_int64, kh_exist_int64, kh_put_int64,
-
- kh_float64_t, kh_exist_float64, kh_put_float64, kh_init_float64,
- kh_get_float64, kh_destroy_float64, kh_resize_float64,
-
- kh_resize_uint64, kh_exist_uint64, kh_destroy_uint64, kh_put_uint64,
- kh_get_uint64, kh_init_uint64,
-
- kh_destroy_pymap, kh_exist_pymap, kh_init_pymap, kh_get_pymap,
- kh_put_pymap, kh_resize_pymap)
+ kh_str_t,
+ kh_init_str,
+ kh_put_str,
+ kh_exist_str,
+ kh_get_str,
+ kh_destroy_str,
+ kh_resize_str,
+ kh_put_strbox,
+ kh_get_strbox,
+ kh_init_strbox,
+ kh_int64_t,
+ kh_init_int64,
+ kh_resize_int64,
+ kh_destroy_int64,
+ kh_get_int64,
+ kh_exist_int64,
+ kh_put_int64,
+ kh_float64_t,
+ kh_exist_float64,
+ kh_put_float64,
+ kh_init_float64,
+ kh_get_float64,
+ kh_destroy_float64,
+ kh_resize_float64,
+ kh_resize_uint64,
+ kh_exist_uint64,
+ kh_destroy_uint64,
+ kh_put_uint64,
+ kh_get_uint64,
+ kh_init_uint64,
+ kh_destroy_pymap,
+ kh_exist_pymap,
+ kh_init_pymap,
+ kh_get_pymap,
+ kh_put_pymap,
+ kh_resize_pymap,
+)
cimport pandas._libs.util as util
@@ -63,8 +82,9 @@ cdef class Factorizer:
def get_count(self):
return self.count
- def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1,
- na_value=None):
+ def factorize(
+ self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
+ ):
"""
Factorize values with nans replaced by na_sentinel
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
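Factorization like this is exposed publicly through `pandas.factorize`; a short illustrative sketch of the user-visible behaviour, including the default missing-value sentinel:

import numpy as np
import pandas as pd

codes, uniques = pd.factorize(np.array(["b", "a", "b", np.nan], dtype=object))
print(codes)    # [ 0  1  0 -1] -- missing values get the sentinel, -1 by default
print(uniques)  # ['b' 'a']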
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index ac8172146d351..e4ec9db560b80 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -17,8 +17,8 @@ cnp.import_array()
cimport pandas._libs.util as util
-from pandas._libs.tslibs.conversion cimport maybe_datetimelike_to_i8
from pandas._libs.tslibs.nattype cimport c_NaT as NaT
+from pandas._libs.tslibs.c_timestamp cimport _Timestamp
from pandas._libs.hashtable cimport HashTable
@@ -72,9 +72,10 @@ cdef class IndexEngine:
self.over_size_threshold = n >= _SIZE_CUTOFF
self.clear_mapping()
- def __contains__(self, object val):
+ def __contains__(self, val: object) -> bool:
+ # We assume before we get here:
+ # - val is hashable
self._ensure_mapping_populated()
- hash(val)
return val in self.mapping
cpdef get_value(self, ndarray arr, object key, object tz=None):
@@ -85,7 +86,6 @@ cdef class IndexEngine:
"""
cdef:
object loc
- void* data_ptr
loc = self.get_loc(key)
if isinstance(loc, slice) or util.is_array(loc):
@@ -101,7 +101,6 @@ cdef class IndexEngine:
"""
cdef:
object loc
- void* data_ptr
loc = self.get_loc(key)
value = convert_scalar(arr, value)
@@ -215,7 +214,8 @@ cdef class IndexEngine:
return self.monotonic_dec == 1
cdef inline _do_monotonic_check(self):
- cdef object is_unique
+ cdef:
+ bint is_unique
try:
values = self._get_index_values()
self.monotonic_inc, self.monotonic_dec, is_unique = \
@@ -238,10 +238,10 @@ cdef class IndexEngine:
cdef _call_monotonic(self, values):
return algos.is_monotonic(values, timelike=False)
- def get_backfill_indexer(self, other, limit=None):
+ def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
return algos.backfill(self._get_index_values(), other, limit=limit)
- def get_pad_indexer(self, other, limit=None):
+ def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
return algos.pad(self._get_index_values(), other, limit=limit)
cdef _make_hash_table(self, Py_ssize_t n):
@@ -409,20 +409,29 @@ cdef class DatetimeEngine(Int64Engine):
cdef _get_box_dtype(self):
return 'M8[ns]'
- def __contains__(self, object val):
+ cdef int64_t _unbox_scalar(self, scalar) except? -1:
+ # NB: caller is responsible for ensuring tzawareness compat
+ # before we get here
+ if not (isinstance(scalar, _Timestamp) or scalar is NaT):
+ raise TypeError(scalar)
+ return scalar.value
+
+ def __contains__(self, val: object) -> bool:
+ # We assume before we get here:
+ # - val is hashable
cdef:
- int64_t loc
+ int64_t loc, conv
+ conv = self._unbox_scalar(val)
if self.over_size_threshold and self.is_monotonic_increasing:
if not self.is_unique:
- return self._get_loc_duplicates(val)
+ return self._get_loc_duplicates(conv)
values = self._get_index_values()
- conv = maybe_datetimelike_to_i8(val)
loc = values.searchsorted(conv, side='left')
return values[loc] == conv
self._ensure_mapping_populated()
- return maybe_datetimelike_to_i8(val) in self.mapping
+ return conv in self.mapping
cdef _get_index_values(self):
return self.vgetter().view('i8')
@@ -431,24 +440,26 @@ cdef class DatetimeEngine(Int64Engine):
return algos.is_monotonic(values, timelike=True)
cpdef get_loc(self, object val):
+ # NB: the caller is responsible for ensuring that we are called
+ # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)
+
cdef:
int64_t loc
if is_definitely_invalid_key(val):
raise TypeError
+ try:
+ conv = self._unbox_scalar(val)
+ except TypeError:
+ raise KeyError(val)
+
# Welcome to the spaghetti factory
if self.over_size_threshold and self.is_monotonic_increasing:
if not self.is_unique:
- val = maybe_datetimelike_to_i8(val)
- return self._get_loc_duplicates(val)
+ return self._get_loc_duplicates(conv)
values = self._get_index_values()
- try:
- conv = maybe_datetimelike_to_i8(val)
- loc = values.searchsorted(conv, side='left')
- except TypeError:
- self._date_check_type(val)
- raise KeyError(val)
+ loc = values.searchsorted(conv, side='left')
if loc == len(values) or values[loc] != conv:
raise KeyError(val)
@@ -456,27 +467,12 @@ cdef class DatetimeEngine(Int64Engine):
self._ensure_mapping_populated()
if not self.unique:
- val = maybe_datetimelike_to_i8(val)
- return self._get_loc_duplicates(val)
+ return self._get_loc_duplicates(conv)
try:
- return self.mapping.get_item(val.value)
+ return self.mapping.get_item(conv)
except KeyError:
raise KeyError(val)
- except AttributeError:
- pass
-
- try:
- val = maybe_datetimelike_to_i8(val)
- return self.mapping.get_item(val)
- except (TypeError, ValueError):
- self._date_check_type(val)
- raise KeyError(val)
-
- cdef inline _date_check_type(self, object val):
- hash(val)
- if not util.is_integer_object(val):
- raise KeyError(val)
def get_indexer(self, values):
self._ensure_mapping_populated()
@@ -485,13 +481,13 @@ cdef class DatetimeEngine(Int64Engine):
values = np.asarray(values).view('i8')
return self.mapping.lookup(values)
- def get_pad_indexer(self, other, limit=None):
+ def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
if other.dtype != self._get_box_dtype():
return np.repeat(-1, len(other)).astype('i4')
other = np.asarray(other).view('i8')
return algos.pad(self._get_index_values(), other, limit=limit)
- def get_backfill_indexer(self, other, limit=None):
+ def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
if other.dtype != self._get_box_dtype():
return np.repeat(-1, len(other)).astype('i4')
other = np.asarray(other).view('i8')
@@ -503,22 +499,24 @@ cdef class TimedeltaEngine(DatetimeEngine):
cdef _get_box_dtype(self):
return 'm8[ns]'
+ cdef int64_t _unbox_scalar(self, scalar) except? -1:
+ if not (isinstance(scalar, Timedelta) or scalar is NaT):
+ raise TypeError(scalar)
+ return scalar.value
+
cdef class PeriodEngine(Int64Engine):
cdef _get_index_values(self):
- return super(PeriodEngine, self).vgetter()
-
- cdef void _call_map_locations(self, values):
- # super(...) pattern doesn't seem to work with `cdef`
- Int64Engine._call_map_locations(self, values.view('i8'))
+ return super(PeriodEngine, self).vgetter().view("i8")
cdef _call_monotonic(self, values):
# super(...) pattern doesn't seem to work with `cdef`
return Int64Engine._call_monotonic(self, values.view('i8'))
def get_indexer(self, values):
- cdef ndarray[int64_t, ndim=1] ordinals
+ cdef:
+ ndarray[int64_t, ndim=1] ordinals
super(PeriodEngine, self)._ensure_mapping_populated()
@@ -527,14 +525,14 @@ cdef class PeriodEngine(Int64Engine):
return self.mapping.lookup(ordinals)
- def get_pad_indexer(self, other, limit=None):
+ def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
freq = super(PeriodEngine, self).vgetter().freq
ordinal = periodlib.extract_ordinals(other, freq)
return algos.pad(self._get_index_values(),
np.asarray(ordinal), limit=limit)
- def get_backfill_indexer(self, other, limit=None):
+ def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
freq = super(PeriodEngine, self).vgetter().freq
ordinal = periodlib.extract_ordinals(other, freq)
@@ -717,7 +715,9 @@ cdef class BaseMultiIndexCodesEngine:
return indexer
- def __contains__(self, object val):
+ def __contains__(self, val: object) -> bool:
+ # We assume before we get here:
+ # - val is hashable
# Default __contains__ looks in the underlying mapping, which in this
# case only contains integer representations.
try:
diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in
index 093cca4fe7ed5..cd2b9fbe7d6d6 100644
--- a/pandas/_libs/index_class_helper.pxi.in
+++ b/pandas/_libs/index_class_helper.pxi.in
@@ -53,10 +53,7 @@ cdef class {{name}}Engine(IndexEngine):
ndarray[{{ctype}}] values
int count = 0
- {{if name not in {'Float64', 'Float32'} }}
- if not util.is_integer_object(val):
- raise KeyError(val)
- {{endif}}
+ self._check_type(val)
# A view is needed for some subclasses, such as PeriodEngine:
values = self._get_index_values().view('{{dtype}}')
diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx
index 01f4fb060d982..cdccdb504571c 100644
--- a/pandas/_libs/indexing.pyx
+++ b/pandas/_libs/indexing.pyx
@@ -18,6 +18,7 @@ cdef class _NDFrameIndexerBase:
if ndim is None:
ndim = self._ndim = self.obj.ndim
if ndim > 2:
- raise ValueError("NDFrameIndexer does not support "
- "NDFrame objects with ndim > 2")
+ raise ValueError(
+ "NDFrameIndexer does not support NDFrame objects with ndim > 2"
+ )
return ndim
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 719db5c03f07f..acd74591134bc 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1624,6 +1624,10 @@ cdef class StringValidator(Validator):
cdef inline bint is_array_typed(self) except -1:
return issubclass(self.dtype.type, np.str_)
+ cdef bint is_valid_null(self, object value) except -1:
+ # We deliberately exclude None / NaN here since StringArray uses NA
+ return value is C_NA
+
cpdef bint is_string_array(ndarray values, bint skipna=False):
cdef:
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index 26653438356b1..4d17a6f883c1c 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -417,12 +417,12 @@ class NAType(C_NAType):
if other is C_NA:
return NA
elif isinstance(other, (numbers.Number, np.bool_)):
- if other == 1 or other == -1:
+ if other == 1:
return other
else:
return NA
elif isinstance(other, np.ndarray):
- return np.where((other == 1) | (other == -1), other, NA)
+ return np.where(other == 1, other, NA)
return NotImplemented
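The narrowed branch above only lets base 1 bypass `NA` propagation, since `1 ** x` is 1 for any exponent while `(-1) ** x` depends on it; a scalar illustration of the resulting semantics:

import pandas as pd

print(1 ** pd.NA)     # 1 -- value-independent, so the base is returned
print((-1) ** pd.NA)  # <NA> -- with this change, -1 no longer passes through
print(2 ** pd.NA)     # <NA>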
diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
index ee83901040b36..3a6dd506b2428 100644
--- a/pandas/_libs/sparse.pyx
+++ b/pandas/_libs/sparse.pyx
@@ -72,9 +72,9 @@ cdef class IntIndex(SparseIndex):
"""
if self.npoints > self.length:
- msg = (f"Too many indices. Expected "
- f"{self.length} but found {self.npoints}")
- raise ValueError(msg)
+ raise ValueError(
+ f"Too many indices. Expected {self.length} but found {self.npoints}"
+ )
# Indices are vacuously ordered and non-negative
# if the sequence of indices is empty.
diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c
new file mode 100644
index 0000000000000..fc4bdef8463af
--- /dev/null
+++ b/pandas/_libs/src/ujson/python/date_conversions.c
@@ -0,0 +1,118 @@
+// Conversion routines that are useful for serialization,
+// but which don't interact with JSON objects directly
+
+#include "date_conversions.h"
+#include <../../../tslibs/src/datetime/np_datetime.h>
+#include <../../../tslibs/src/datetime/np_datetime_strings.h>
+
+/*
+ * Function: scaleNanosecToUnit
+ * -----------------------------
+ *
+ * Scales an integer value representing time in nanoseconds to the provided unit.
+ *
+ * Mutates the provided value directly. Returns 0 on success, non-zero on error.
+ */
+int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
+ switch (unit) {
+ case NPY_FR_ns:
+ break;
+ case NPY_FR_us:
+ *value /= 1000LL;
+ break;
+ case NPY_FR_ms:
+ *value /= 1000000LL;
+ break;
+ case NPY_FR_s:
+ *value /= 1000000000LL;
+ break;
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Converts the int64_t representation of a datetime to ISO; mutates len */
+char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
+ npy_datetimestruct dts;
+ int ret_code;
+
+ pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
+
+ *len = (size_t)get_datetime_iso_8601_strlen(0, base);
+ char *result = PyObject_Malloc(*len);
+
+ if (result == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ ret_code = make_iso_8601_datetime(&dts, result, *len, base);
+ if (ret_code != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert datetime value to string");
+ PyObject_Free(result);
+ }
+
+ // Note that get_datetime_iso_8601_strlen just gives a generic size
+ // for ISO string conversion, not the actual size used
+ *len = strlen(result);
+ return result;
+}
+
+npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
+ scaleNanosecToUnit(&dt, base);
+ return dt;
+}
+
+/* Convert PyDateTime to ISO C-string; mutates len */
+char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base,
+ size_t *len) {
+ npy_datetimestruct dts;
+ int ret;
+
+ ret = convert_pydatetime_to_datetimestruct(obj, &dts);
+ if (ret != 0) {
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert PyDateTime to numpy datetime");
+ }
+ return NULL;
+ }
+
+ *len = (size_t)get_datetime_iso_8601_strlen(0, base);
+ char *result = PyObject_Malloc(*len);
+ ret = make_iso_8601_datetime(&dts, result, *len, base);
+
+ if (ret != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert datetime value to string");
+ PyObject_Free(result);
+ return NULL;
+ }
+
+ // Note that get_datetime_iso_8601_strlen just gives a generic size
+ // for ISO string conversion, not the actual size used
+ *len = strlen(result);
+ return result;
+}
+
+npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) {
+ npy_datetimestruct dts;
+ int ret;
+
+ ret = convert_pydatetime_to_datetimestruct(dt, &dts);
+ if (ret != 0) {
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert PyDateTime to numpy datetime");
+ }
+ // TODO: is setting errMsg required?
+ //((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ // return NULL;
+ }
+
+ npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
+ return NpyDateTimeToEpoch(npy_dt, base);
+}
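For reference, the arithmetic in `scaleNanosecToUnit`/`NpyDateTimeToEpoch` is plain integer division of a nanosecond epoch value by the target resolution; a rough Python equivalent (illustrative only, not the code pandas runs):

# Nanoseconds per unit for the resolutions handled by the C switch above.
_NS_PER_UNIT = {"ns": 1, "us": 1_000, "ms": 1_000_000, "s": 1_000_000_000}

def scale_nanosec_to_unit(value_ns: int, unit: str) -> int:
    # The C version signals an unsupported unit by returning -1; raise instead.
    if unit not in _NS_PER_UNIT:
        raise ValueError(f"unsupported unit: {unit}")
    # C division truncates toward zero, so this floor division only matches
    # exactly for non-negative (post-1970) timestamps.
    return value_ns // _NS_PER_UNIT[unit]

ns = 1_577_836_800_000_000_000  # 2020-01-01T00:00:00Z in nanoseconds
print(scale_nanosec_to_unit(ns, "s"))   # 1577836800
print(scale_nanosec_to_unit(ns, "ms"))  # 1577836800000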
diff --git a/pandas/_libs/src/ujson/python/date_conversions.h b/pandas/_libs/src/ujson/python/date_conversions.h
new file mode 100644
index 0000000000000..45455f4d6128b
--- /dev/null
+++ b/pandas/_libs/src/ujson/python/date_conversions.h
@@ -0,0 +1,31 @@
+#ifndef PANDAS__LIBS_SRC_UJSON_DATE_CONVERSIONS
+#define PANDAS__LIBS_SRC_UJSON_DATE_CONVERSIONS
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <numpy/ndarraytypes.h>
+#include "datetime.h"
+
+// Scales value inplace from nanosecond resolution to unit resolution
+int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
+
+// Converts an int64 object representing a date to ISO format
+// up to precision `base` e.g. base="s" yields "2020-01-01T00:00:00Z"
+// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
+// len is mutated to save the length of the returned string
+char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
+
+// TODO: this function doesn't do a lot; should augment or replace with
+// scaleNanosecToUnit
+npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base);
+
+// Converts a Python object representing a Date / Datetime to ISO format
+// up to precision `base` e.g. base="s" yields "2020-01-01T00:00:00Z"
+// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
+// len is mutated to save the length of the returned string
+char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len);
+
+// Convert a Python Date/Datetime to Unix epoch with resolution base
+npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base);
+
+#endif
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
index c413a16f8d5f0..0367661e5c554 100644
--- a/pandas/_libs/src/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -45,8 +45,7 @@ Numeric decoder derived from from TCL library
#include
#include
#include
-#include <../../../tslibs/src/datetime/np_datetime.h>
-#include <../../../tslibs/src/datetime/np_datetime_strings.h>
+#include "date_conversions.h"
#include "datetime.h"
static PyTypeObject *type_decimal;
@@ -209,34 +208,6 @@ static TypeContext *createTypeContext(void) {
return pc;
}
-/*
- * Function: scaleNanosecToUnit
- * -----------------------------
- *
- * Scales an integer value representing time in nanoseconds to provided unit.
- *
- * Mutates the provided value directly. Returns 0 on success, non-zero on error.
- */
-static int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
- switch (unit) {
- case NPY_FR_ns:
- break;
- case NPY_FR_us:
- *value /= 1000LL;
- break;
- case NPY_FR_ms:
- *value /= 1000000LL;
- break;
- case NPY_FR_s:
- *value /= 1000000000LL;
- break;
- default:
- return -1;
- }
-
- return 0;
-}
-
static PyObject *get_values(PyObject *obj) {
PyObject *values = NULL;
@@ -379,34 +350,6 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
}
-/* Converts the int64_t representation of a datetime to ISO; mutates len */
-static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
- npy_datetimestruct dts;
- int ret_code;
-
- pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
-
- *len = (size_t)get_datetime_iso_8601_strlen(0, base);
- char *result = PyObject_Malloc(*len);
-
- if (result == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
-
- ret_code = make_iso_8601_datetime(&dts, result, *len, base);
- if (ret_code != 0) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert datetime value to string");
- PyObject_Free(result);
- }
-
- // Note that get_datetime_iso_8601_strlen just gives a generic size
- // for ISO string conversion, not the actual size used
- *len = strlen(result);
- return result;
-}
-
/* JSON callback. returns a char* and mutates the pointer to *len */
static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
JSONTypeContext *tc, size_t *len) {
@@ -414,50 +357,12 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
return int64ToIso(GET_TC(tc)->longValue, base, len);
}
-static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
- scaleNanosecToUnit(&dt, base);
- return dt;
-}
-
-/* Convert PyDatetime To ISO C-string. mutates len */
-static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base,
- size_t *len) {
- npy_datetimestruct dts;
- int ret;
-
- ret = convert_pydatetime_to_datetimestruct(obj, &dts);
- if (ret != 0) {
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert PyDateTime to numpy datetime");
- }
- return NULL;
- }
-
- *len = (size_t)get_datetime_iso_8601_strlen(0, base);
- char *result = PyObject_Malloc(*len);
- ret = make_iso_8601_datetime(&dts, result, *len, base);
-
- if (ret != 0) {
- PRINTMARK();
- PyErr_SetString(PyExc_ValueError,
- "Could not convert datetime value to string");
- PyObject_Free(result);
- return NULL;
- }
-
- // Note that get_datetime_iso_8601_strlen just gives a generic size
- // for ISO string conversion, not the actual size used
- *len = strlen(result);
- return result;
-}
-
/* JSON callback */
static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
size_t *len) {
- if (!PyDateTime_Check(obj)) {
- PyErr_SetString(PyExc_TypeError, "Expected datetime object");
+ if (!PyDate_Check(obj)) {
+ PyErr_SetString(PyExc_TypeError, "Expected date object");
return NULL;
}
@@ -465,30 +370,6 @@ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
return PyDateTimeToIso(obj, base, len);
}
-static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) {
- npy_datetimestruct dts;
- int ret;
-
- if (!PyDateTime_Check(obj)) {
- // TODO: raise TypeError
- }
- PyDateTime_Date *dt = (PyDateTime_Date *)obj;
-
- ret = convert_pydatetime_to_datetimestruct(dt, &dts);
- if (ret != 0) {
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert PyDateTime to numpy datetime");
- }
- // TODO: is setting errMsg required?
- //((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
- // return NULL;
- }
-
- npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
- return NpyDateTimeToEpoch(npy_dt, base);
-}
-
static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) {
PyObject *obj = (PyObject *)_obj;
PyObject *str;
@@ -1504,6 +1385,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
char **ret;
char *dataptr, *cLabel;
int type_num;
+ NPY_DATETIMEUNIT base = enc->datetimeUnit;
PRINTMARK();
if (!labels) {
@@ -1541,32 +1423,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
break;
}
- // TODO: vectorized timedelta solution
- if (enc->datetimeIso &&
- (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) {
- PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item);
- if (td == NULL) {
- Py_DECREF(item);
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
- }
-
- PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL);
- Py_DECREF(td);
- if (iso == NULL) {
- Py_DECREF(item);
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
- }
-
- cLabel = (char *)PyUnicode_AsUTF8(iso);
- Py_DECREF(iso);
- len = strlen(cLabel);
- } else if (PyTypeNum_ISDATETIME(type_num)) {
- NPY_DATETIMEUNIT base = enc->datetimeUnit;
- npy_int64 longVal;
+ int is_datetimelike = 0;
+ npy_int64 nanosecVal;
+ if (PyTypeNum_ISDATETIME(type_num)) {
+ is_datetimelike = 1;
PyArray_VectorUnaryFunc *castfunc =
PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64);
if (!castfunc) {
@@ -1574,27 +1434,74 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
"Cannot cast numpy dtype %d to long",
enc->npyType);
}
- castfunc(dataptr, &longVal, 1, NULL, NULL);
- if (enc->datetimeIso) {
- cLabel = int64ToIso(longVal, base, &len);
+ castfunc(dataptr, &nanosecVal, 1, NULL, NULL);
+ } else if (PyDate_Check(item) || PyDelta_Check(item)) {
+ is_datetimelike = 1;
+ if (PyObject_HasAttrString(item, "value")) {
+ nanosecVal = get_long_attr(item, "value");
} else {
- if (!scaleNanosecToUnit(&longVal, base)) {
- // TODO: This gets hit but somehow doesn't cause errors
- // need to clean up (elsewhere in module as well)
+ if (PyDelta_Check(item)) {
+ nanosecVal = total_seconds(item) *
+ 1000000000LL; // nanoseconds per second
+ } else {
+ // datetime.* objects don't follow above rules
+ nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
}
- cLabel = PyObject_Malloc(21); // 21 chars for int64
- sprintf(cLabel, "%" NPY_INT64_FMT, longVal);
- len = strlen(cLabel);
}
- } else if (PyDateTime_Check(item) || PyDate_Check(item)) {
- NPY_DATETIMEUNIT base = enc->datetimeUnit;
- if (enc->datetimeIso) {
- cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len);
+ }
+
+ if (is_datetimelike) {
+ if (nanosecVal == get_nat()) {
+ len = 5; // TODO: shouldn't require extra space for terminator
+ cLabel = PyObject_Malloc(len);
+ strncpy(cLabel, "null", len);
} else {
- cLabel = PyObject_Malloc(21); // 21 chars for int64
- sprintf(cLabel, "%" NPY_DATETIME_FMT,
- PyDateTimeToEpoch(item, base));
- len = strlen(cLabel);
+ if (enc->datetimeIso) {
+ // TODO: Vectorized Timedelta function
+ if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
+ PyObject *td =
+ PyObject_CallFunction(cls_timedelta, "(O)", item);
+ if (td == NULL) {
+ Py_DECREF(item);
+ NpyArr_freeLabels(ret, num);
+ ret = 0;
+ break;
+ }
+
+ PyObject *iso =
+ PyObject_CallMethod(td, "isoformat", NULL);
+ Py_DECREF(td);
+ if (iso == NULL) {
+ Py_DECREF(item);
+ NpyArr_freeLabels(ret, num);
+ ret = 0;
+ break;
+ }
+
+ len = strlen(PyUnicode_AsUTF8(iso));
+ cLabel = PyObject_Malloc(len + 1);
+ memcpy(cLabel, PyUnicode_AsUTF8(iso), len + 1);
+ Py_DECREF(iso);
+ } else {
+ if (type_num == NPY_DATETIME) {
+ cLabel = int64ToIso(nanosecVal, base, &len);
+ } else {
+ cLabel = PyDateTimeToIso((PyDateTime_Date *)item,
+ base, &len);
+ }
+ }
+ if (cLabel == NULL) {
+ Py_DECREF(item);
+ NpyArr_freeLabels(ret, num);
+ ret = 0;
+ break;
+ }
+ } else {
+ cLabel = PyObject_Malloc(21); // 21 chars for int64
+ sprintf(cLabel, "%" NPY_DATETIME_FMT,
+ NpyDateTimeToEpoch(nanosecVal, base));
+ len = strlen(cLabel);
+ }
}
} else { // Fallback to string representation
PyObject *str = PyObject_Str(item);
@@ -1615,6 +1522,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
ret[i] = PyObject_Malloc(len + 1);
memcpy(ret[i], cLabel, len + 1);
+ if (is_datetimelike) {
+ PyObject_Free(cLabel);
+ }
+
if (PyErr_Occurred()) {
NpyArr_freeLabels(ret, num);
ret = 0;
@@ -1784,7 +1695,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PRINTMARK();
NPY_DATETIMEUNIT base =
((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
+ GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base);
tc->type = JT_LONG;
}
return;
@@ -1810,7 +1721,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PRINTMARK();
NPY_DATETIMEUNIT base =
((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
+ GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base);
tc->type = JT_LONG;
}
return;
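The encoding changes above govern how datetime-like values and index labels end up in JSON output; a hedged end-to-end example via the public API (exact output strings depend on the chosen format and unit):

import pandas as pd

df = pd.DataFrame({"x": [1, 2]}, index=pd.to_datetime(["2020-01-01", "2020-01-02"]))

# Index labels serialized as ISO-8601 strings ...
print(df.to_json(date_format="iso"))
# ... or as epoch integers at the requested resolution.
print(df.to_json(date_format="epoch", date_unit="s"))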
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
index 5a30b71a6fea1..0e57b563d4d25 100644
--- a/pandas/_libs/testing.pyx
+++ b/pandas/_libs/testing.pyx
@@ -127,9 +127,9 @@ cpdef assert_almost_equal(a, b,
# classes can't be the same, to raise error
assert_class_equal(a, b, obj=obj)
- assert has_length(a) and has_length(b), ("Can't compare objects without "
- "length, one or both is invalid: "
- f"({a}, {b})")
+ assert has_length(a) and has_length(b), (
+ f"Can't compare objects without length, one or both is invalid: ({a}, {b})"
+ )
if a_is_ndarray and b_is_ndarray:
na, nb = a.size, b.size
diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx
index 6e6b809b9b5a6..ed1df5f4fa595 100644
--- a/pandas/_libs/tslibs/c_timestamp.pyx
+++ b/pandas/_libs/tslibs/c_timestamp.pyx
@@ -57,11 +57,12 @@ def integer_op_not_supported(obj):
# the caller; mypy finds this more palatable.
cls = type(obj).__name__
+ # GH#30886: using an f-string here raises SystemError
int_addsub_msg = (
- f"Addition/subtraction of integers and integer-arrays with {cls} is "
+ "Addition/subtraction of integers and integer-arrays with {cls} is "
"no longer supported. Instead of adding/subtracting `n`, "
"use `n * obj.freq`"
- )
+ ).format(cls=cls)
return TypeError(int_addsub_msg)
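The message built here is what users see when adding a bare integer to a datetime-like scalar; a small sketch of the behaviour it guards (assuming integer addition to Timestamp still routes through this helper):

import pandas as pd

ts = pd.Timestamp("2020-01-01")
try:
    ts + 1
except TypeError as err:
    print(err)  # suggests using `n * obj.freq` instead of a bare integer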
diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd
index 36e6b14be182a..d4ae3fa8c5b99 100644
--- a/pandas/_libs/tslibs/conversion.pxd
+++ b/pandas/_libs/tslibs/conversion.pxd
@@ -25,6 +25,4 @@ cdef int64_t get_datetime64_nanos(object val) except? -1
cpdef int64_t pydt_to_i8(object pydt) except? -1
-cdef maybe_datetimelike_to_i8(object val)
-
cpdef datetime localize_pydatetime(datetime dt, object tz)
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index 2988d7bae9a5e..77f46016ee846 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -29,7 +29,7 @@ from pandas._libs.tslibs.util cimport (
from pandas._libs.tslibs.timedeltas cimport cast_from_unit
from pandas._libs.tslibs.timezones cimport (
is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info,
- get_timezone, maybe_get_tz, tz_compare)
+ get_timezone, maybe_get_tz, tz_compare, treat_tz_as_dateutil)
from pandas._libs.tslibs.timezones import UTC
from pandas._libs.tslibs.parsing import parse_datetime_string
@@ -99,6 +99,11 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True):
shape = (