From ac281d601e78c57abbc37f6da940bbcb001b4d62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?O=C4=9Fuzhan=20=C3=96=C4=9Freden?=
Date: Sat, 15 Jun 2019 23:23:38 +0200
Subject: [PATCH 1/4] Add docstring to the insertion method & fix #21364
Credit for empty result documentation goes to @MagnetarAlex
---
doc/source/user_guide/io.rst | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 4aacb6fa1e278..9ce9bec1441d9 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5012,6 +5012,18 @@ Example of a callable using PostgreSQL `COPY clause
from io import StringIO
def psql_insert_copy(table, conn, keys, data_iter):
+ """
+ Execute SQL statement inserting data
+
+ Parameters
+ ----------
+ table : pandas.io.sql.SQLTable
+ conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
+ keys : list of str
+ Column names
+ data_iter : generator of list
+ Each item contains a list of values to be inserted
+ """
# gets a DBAPI connection that can provide a cursor
dbapi_conn = conn.connection
with dbapi_conn.cursor() as cur:
@@ -5045,6 +5057,16 @@ table name and optionally a subset of columns to read.
pd.read_sql_table('data', engine)
+Note that pandas infers column dtypes from query outputs, and not by looking
+up data types in the physical database schema. For example, assume ``userid``
+is an integer column in a table. Then, intuitively, ``select userid ...`` will
+return integer-valued series, while ``select cast(userid as text) ...`` will
+return object-valued (str) series. Accordingly, if the query output is empty,
+then all resulting columns will be returned as object-valued (since they are
+most general). If you foresee that your query will sometimes generate an empty
+result, you may want to explicitly typecast afterwards to ensure dtype
+integrity.
+
You can also specify the name of the column as the ``DataFrame`` index,
and specify a subset of columns to be read.
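For reference, a minimal usage sketch of the callable documented in the hunk above (the ``csv``/``StringIO`` body is only partially shown in the diff and is reconstructed here; ``df`` and ``engine`` are an assumed ``DataFrame`` and SQLAlchemy engine)::

    import csv
    from io import StringIO

    def psql_insert_copy(table, conn, keys, data_iter):
        # gets a DBAPI connection that can provide a cursor
        dbapi_conn = conn.connection
        with dbapi_conn.cursor() as cur:
            # write the incoming rows into an in-memory CSV buffer
            s_buf = StringIO()
            writer = csv.writer(s_buf)
            writer.writerows(data_iter)
            s_buf.seek(0)

            # bulk-load the buffer with PostgreSQL's COPY ... FROM STDIN
            columns = ', '.join('"{}"'.format(k) for k in keys)
            table_name = '{}.{}'.format(table.schema, table.name) if table.schema else table.name
            sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(table_name, columns)
            cur.copy_expert(sql=sql, file=s_buf)

    # pass the callable via the ``method`` argument of ``to_sql``
    df.to_sql('data', engine, method=psql_insert_copy)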
From 9b575816ebc6a836683e09ebebe6b77ee36f01c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?O=C4=9Fuzhan=20=C3=96=C4=9Freden?=
Date: Thu, 29 Aug 2019 19:22:16 +0200
Subject: [PATCH 2/4] Generator -> iterable & add note section
---
doc/source/user_guide/io.rst | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 9ce9bec1441d9..838018eda0a34 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5021,8 +5021,7 @@ Example of a callable using PostgreSQL `COPY clause
conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
keys : list of str
Column names
- data_iter : generator of list
- Each item contains a list of values to be inserted
+    data_iter : Iterable that iterates over the values to be inserted
"""
# gets a DBAPI connection that can provide a cursor
dbapi_conn = conn.connection
@@ -5057,15 +5056,17 @@ table name and optionally a subset of columns to read.
pd.read_sql_table('data', engine)
-Note that pandas infers column dtypes from query outputs, and not by looking
-up data types in the physical database schema. For example, assume ``userid``
-is an integer column in a table. Then, intuitively, ``select userid ...`` will
-return integer-valued series, while ``select cast(userid as text) ...`` will
-return object-valued (str) series. Accordingly, if the query output is empty,
-then all resulting columns will be returned as object-valued (since they are
-most general). If you foresee that your query will sometimes generate an empty
-result, you may want to explicitly typecast afterwards to ensure dtype
-integrity.
+.. note::
+
+   pandas infers column dtypes from query outputs, not by looking up
+   data types in the physical database schema. For example, assume ``userid``
+   is an integer column in a table. Then, intuitively, ``select userid ...`` will
+   return integer-valued series, while ``select cast(userid as text) ...`` will
+   return object-valued (str) series. Accordingly, if the query output is empty,
+   all resulting columns will be returned as object-valued (since they are the
+   most general). If you foresee that your query will sometimes generate an empty
+   result, you may want to typecast explicitly afterwards to ensure dtype
+   integrity.
You can also specify the name of the column as the ``DataFrame`` index,
and specify a subset of columns to be read.
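To make the note above concrete, a small sketch of typecasting after an empty result (assuming, hypothetically, that the ``data`` table has an integer ``userid`` column)::

    # a query matching no rows: the empty result comes back with object-dtype columns
    empty = pd.read_sql_query('SELECT userid FROM data WHERE 1 = 0', engine)

    # cast explicitly afterwards to restore the intended dtype
    empty['userid'] = empty['userid'].astype('int64')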
From 7b1a7964377dd60b22ec6af31a7e6c69b48ca4c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?O=C4=9Fuzhan=20=C3=96=C4=9Freden?=
Date: Thu, 29 Aug 2019 19:29:15 +0200
Subject: [PATCH 3/4] merge origin/master
---
.github/FUNDING.yml | 1 +
.github/PULL_REQUEST_TEMPLATE.md | 1 +
.github/SECURITY.md | 1 +
.gitignore | 3 +
.pre-commit-config.yaml | 17 +
.travis.yml | 20 +-
LICENSES/HAVEN_LICENSE | 2 +
LICENSES/HAVEN_MIT | 32 +
Makefile | 10 +-
README.md | 4 +-
asv_bench/asv.conf.json | 2 +-
asv_bench/benchmarks/algorithms.py | 146 +-
asv_bench/benchmarks/attrs_caching.py | 6 +-
asv_bench/benchmarks/binary_ops.py | 51 +-
asv_bench/benchmarks/categoricals.py | 125 +-
asv_bench/benchmarks/ctors.py | 55 +-
asv_bench/benchmarks/dtypes.py | 32 +-
asv_bench/benchmarks/eval.py | 31 +-
asv_bench/benchmarks/frame_ctor.py | 26 +-
asv_bench/benchmarks/frame_methods.py | 249 +-
asv_bench/benchmarks/gil.py | 136 +-
asv_bench/benchmarks/groupby.py | 515 +-
.../benchmarks/index_cached_properties.py | 75 +
asv_bench/benchmarks/index_object.py | 101 +-
asv_bench/benchmarks/indexing.py | 186 +-
asv_bench/benchmarks/indexing_engines.py | 59 +-
asv_bench/benchmarks/inference.py | 75 +-
asv_bench/benchmarks/io/csv.py | 322 +-
asv_bench/benchmarks/io/excel.py | 18 +-
asv_bench/benchmarks/io/hdf.py | 106 +-
asv_bench/benchmarks/io/json.py | 240 +-
asv_bench/benchmarks/io/msgpack.py | 17 +-
asv_bench/benchmarks/io/parsers.py | 22 +-
asv_bench/benchmarks/io/pickle.py | 13 +-
asv_bench/benchmarks/io/sas.py | 20 +-
asv_bench/benchmarks/io/sql.py | 161 +-
asv_bench/benchmarks/io/stata.py | 38 +-
asv_bench/benchmarks/join_merge.py | 274 +-
asv_bench/benchmarks/multiindex_object.py | 86 +-
asv_bench/benchmarks/offset.py | 75 +-
asv_bench/benchmarks/pandas_vb_common.py | 39 +-
asv_bench/benchmarks/period.py | 92 +-
asv_bench/benchmarks/plotting.py | 47 +-
asv_bench/benchmarks/reindex.py | 70 +-
asv_bench/benchmarks/replace.py | 46 +-
asv_bench/benchmarks/reshape.py | 166 +-
asv_bench/benchmarks/rolling.py | 80 +-
asv_bench/benchmarks/series_methods.py | 149 +-
asv_bench/benchmarks/sparse.py | 73 +-
asv_bench/benchmarks/stat_ops.py | 47 +-
asv_bench/benchmarks/strings.py | 84 +-
asv_bench/benchmarks/timedelta.py | 59 +-
asv_bench/benchmarks/timeseries.py | 215 +-
asv_bench/benchmarks/timestamp.py | 35 +-
azure-pipelines.yml | 85 +-
ci/azure/posix.yml | 18 +-
ci/azure/windows.yml | 18 +-
ci/build_docs.sh | 56 -
ci/check_git_tags.sh | 28 +
ci/code_checks.sh | 17 +-
ci/deps/azure-35-compat.yaml | 18 +-
ci/deps/azure-36-32bit.yaml | 20 +
ci/deps/azure-36-locale.yaml | 16 +-
ci/deps/azure-36-locale_slow.yaml | 2 +-
ci/deps/azure-37-locale.yaml | 8 +-
ci/deps/azure-37-numpydev.yaml | 6 +-
ci/deps/azure-macos-35.yaml | 12 +-
ci/deps/azure-windows-36.yaml | 6 +-
ci/deps/azure-windows-37.yaml | 9 +-
ci/deps/travis-36-cov.yaml | 8 +-
ci/deps/travis-36-doc.yaml | 47 -
ci/deps/travis-36-locale.yaml | 21 +-
ci/deps/travis-36-slow.yaml | 5 +-
ci/deps/travis-37.yaml | 6 +-
ci/print_skipped.py | 46 +-
ci/run_tests.sh | 11 +-
ci/setup_env.sh | 6 +
codecov.yml | 4 +-
doc/logo/pandas_logo.py | 16 +-
doc/make.py | 275 +-
doc/source/conf.py | 409 +-
doc/source/development/contributing.rst | 232 +-
.../development/contributing_docstring.rst | 4 +-
doc/source/development/developer.rst | 60 +-
doc/source/development/extending.rst | 77 +-
doc/source/development/index.rst | 1 +
doc/source/development/internals.rst | 2 +-
doc/source/development/roadmap.rst | 193 +
doc/source/ecosystem.rst | 19 +-
doc/source/getting_started/10min.rst | 44 +-
doc/source/getting_started/basics.rst | 61 +-
.../comparison/comparison_with_r.rst | 14 +-
.../comparison/comparison_with_sas.rst | 38 +-
.../comparison/comparison_with_stata.rst | 46 +-
doc/source/getting_started/dsintro.rst | 67 +-
doc/source/getting_started/overview.rst | 10 +-
doc/source/getting_started/tutorials.rst | 8 +-
doc/source/index.rst.template | 10 +-
doc/source/install.rst | 189 +-
doc/source/reference/arrays.rst | 48 +-
doc/source/reference/extensions.rst | 39 +
doc/source/reference/frame.rst | 15 +-
doc/source/reference/groupby.rst | 2 +-
doc/source/reference/index.rst | 2 +-
doc/source/reference/indexing.rst | 42 +-
doc/source/reference/io.rst | 11 +-
doc/source/reference/offset_frequency.rst | 2 +-
doc/source/reference/resampling.rst | 2 +-
doc/source/reference/series.rst | 30 +-
doc/source/reference/style.rst | 10 +-
doc/source/reference/window.rst | 7 +-
.../themes/nature_with_gtoc/layout.html | 18 +-
doc/source/user_guide/advanced.rst | 90 +-
doc/source/user_guide/categorical.rst | 30 +-
doc/source/user_guide/computation.rst | 36 +-
doc/source/user_guide/cookbook.rst | 35 +-
doc/source/user_guide/enhancingperf.rst | 48 +-
doc/source/user_guide/gotchas.rst | 4 +-
doc/source/user_guide/groupby.rst | 54 +-
doc/source/user_guide/indexing.rst | 120 +-
doc/source/user_guide/integer_na.rst | 2 +-
doc/source/user_guide/io.rst | 328 +-
doc/source/user_guide/merging.rst | 28 +-
doc/source/user_guide/missing_data.rst | 34 +-
doc/source/user_guide/options.rst | 44 +-
doc/source/user_guide/reshaping.rst | 80 +-
doc/source/user_guide/sparse.rst | 10 +-
doc/source/user_guide/style.ipynb | 16 +-
doc/source/user_guide/text.rst | 29 +-
doc/source/user_guide/timedeltas.rst | 6 +-
doc/source/user_guide/timeseries.rst | 195 +-
doc/source/user_guide/visualization.rst | 75 +-
doc/source/whatsnew/index.rst | 18 +
doc/source/whatsnew/v0.10.0.rst | 8 +-
doc/source/whatsnew/v0.10.1.rst | 4 +-
doc/source/whatsnew/v0.11.0.rst | 237 +-
doc/source/whatsnew/v0.12.0.rst | 8 +-
doc/source/whatsnew/v0.13.0.rst | 62 +-
doc/source/whatsnew/v0.13.1.rst | 6 +-
doc/source/whatsnew/v0.14.0.rst | 16 +-
doc/source/whatsnew/v0.14.1.rst | 4 +-
doc/source/whatsnew/v0.15.0.rst | 27 +-
doc/source/whatsnew/v0.15.1.rst | 2 +-
doc/source/whatsnew/v0.15.2.rst | 15 +-
doc/source/whatsnew/v0.16.0.rst | 48 +-
doc/source/whatsnew/v0.16.1.rst | 14 +-
doc/source/whatsnew/v0.16.2.rst | 8 +-
doc/source/whatsnew/v0.17.0.rst | 54 +-
doc/source/whatsnew/v0.17.1.rst | 12 +-
doc/source/whatsnew/v0.18.0.rst | 34 +-
doc/source/whatsnew/v0.18.1.rst | 20 +-
doc/source/whatsnew/v0.19.0.rst | 20 +-
doc/source/whatsnew/v0.19.1.rst | 4 +-
doc/source/whatsnew/v0.19.2.rst | 4 +-
doc/source/whatsnew/v0.20.0.rst | 104 +-
doc/source/whatsnew/v0.20.2.rst | 6 +-
doc/source/whatsnew/v0.20.3.rst | 2 +-
doc/source/whatsnew/v0.21.0.rst | 68 +-
doc/source/whatsnew/v0.21.1.rst | 10 +-
doc/source/whatsnew/v0.22.0.rst | 6 +-
doc/source/whatsnew/v0.23.0.rst | 78 +-
doc/source/whatsnew/v0.23.1.rst | 16 +-
doc/source/whatsnew/v0.23.2.rst | 12 +-
doc/source/whatsnew/v0.23.3.rst | 2 +-
doc/source/whatsnew/v0.23.4.rst | 10 +-
doc/source/whatsnew/v0.24.0.rst | 134 +-
doc/source/whatsnew/v0.24.1.rst | 12 +-
doc/source/whatsnew/v0.24.2.rst | 10 +-
doc/source/whatsnew/v0.25.0.rst | 681 ++-
doc/source/whatsnew/v0.25.1.rst | 119 +
doc/source/whatsnew/v0.25.2.rst | 109 +
doc/source/whatsnew/v0.4.x.rst | 4 +-
doc/source/whatsnew/v0.5.0.rst | 4 +-
doc/source/whatsnew/v0.6.0.rst | 4 +-
doc/source/whatsnew/v0.7.0.rst | 4 +-
doc/source/whatsnew/v0.7.3.rst | 10 +-
doc/source/whatsnew/v0.8.0.rst | 2 +-
doc/source/whatsnew/v1.0.0.rst | 217 +
doc/sphinxext/announce.py | 48 +-
doc/sphinxext/contributors.py | 25 +-
environment.yml | 11 +-
mypy.ini | 9 -
pandas/__init__.py | 207 +-
pandas/_config/__init__.py | 21 +-
pandas/_config/config.py | 153 +-
pandas/_config/dates.py | 12 +-
pandas/_config/display.py | 11 +-
pandas/_config/localization.py | 22 +-
pandas/_libs/__init__.py | 9 +-
pandas/_libs/algos.pyx | 25 -
pandas/_libs/algos_take_helper.pxi.in | 8 +-
pandas/_libs/groupby.pyx | 7 +-
pandas/_libs/groupby_helper.pxi.in | 12 +-
pandas/_libs/hashtable.pxd | 2 +-
pandas/_libs/hashtable.pyx | 4 +-
pandas/_libs/hashtable_class_helper.pxi.in | 10 +-
pandas/_libs/hashtable_func_helper.pxi.in | 2 +-
pandas/_libs/index.pyx | 17 +-
pandas/_libs/internals.pyx | 2 +-
pandas/_libs/interval.pyx | 53 +
pandas/_libs/join.pyx | 23 +-
pandas/_libs/lib.pxd | 1 +
pandas/_libs/lib.pyx | 184 +-
pandas/_libs/missing.pyx | 7 +-
pandas/_libs/ops.pyx | 4 +-
pandas/_libs/parsers.pyx | 29 +-
pandas/_libs/reduction.pyx | 2 +-
pandas/_libs/reshape.pyx | 63 +-
pandas/_libs/src/klib/khash_python.h | 2 +-
pandas/_libs/src/parser/tokenizer.c | 43 +-
pandas/_libs/src/parser/tokenizer.h | 18 +-
pandas/_libs/src/ujson/lib/ultrajson.h | 7 -
pandas/_libs/src/ujson/lib/ultrajsonenc.c | 6 +
pandas/_libs/src/ujson/python/objToJSON.c | 381 +-
pandas/_libs/tslib.pyx | 22 +-
pandas/_libs/tslibs/__init__.py | 6 +-
pandas/_libs/tslibs/c_timestamp.pyx | 18 +-
pandas/_libs/tslibs/conversion.pyx | 9 +
pandas/_libs/tslibs/fields.pyx | 12 +-
pandas/_libs/tslibs/nattype.pyx | 99 +-
pandas/_libs/tslibs/parsing.pyx | 13 +-
pandas/_libs/tslibs/period.pyx | 13 +-
pandas/_libs/tslibs/timedeltas.pyx | 58 +-
pandas/_libs/tslibs/tzconversion.pyx | 4 +
pandas/_libs/tslibs/util.pxd | 3 +-
pandas/_libs/window.pyx | 92 +-
pandas/_typing.py | 41 +-
pandas/_version.py | 123 +-
pandas/api/__init__.py | 2 +-
pandas/api/extensions/__init__.py | 19 +-
pandas/api/types/__init__.py | 17 +-
pandas/arrays/__init__.py | 25 +-
pandas/compat/__init__.py | 57 +-
pandas/compat/_optional.py | 111 +
pandas/compat/chainmap.py | 7 -
pandas/compat/numpy/__init__.py | 52 +-
pandas/compat/numpy/function.py | 292 +-
pandas/compat/pickle_compat.py | 141 +-
pandas/conftest.py | 380 +-
pandas/core/accessor.py | 66 +-
pandas/core/algorithms.py | 629 +-
pandas/core/api.py | 23 +-
pandas/core/apply.py | 164 +-
pandas/core/arrays/__init__.py | 26 +-
pandas/core/arrays/_ranges.py | 73 +-
pandas/core/arrays/array_.py | 276 -
pandas/core/arrays/base.py | 249 +-
pandas/core/arrays/categorical.py | 843 +--
pandas/core/arrays/datetimelike.py | 494 +-
pandas/core/arrays/datetimes.py | 755 ++-
pandas/core/arrays/integer.py | 385 +-
pandas/core/arrays/interval.py | 471 +-
pandas/core/arrays/numpy_.py | 209 +-
pandas/core/arrays/period.py | 351 +-
pandas/core/arrays/sparse.py | 602 +-
pandas/core/arrays/timedeltas.py | 274 +-
pandas/core/base.py | 422 +-
pandas/core/common.py | 85 +-
pandas/core/computation/align.py | 40 +-
pandas/core/computation/check.py | 30 +-
pandas/core/computation/common.py | 4 +-
pandas/core/computation/engines.py | 37 +-
pandas/core/computation/eval.py | 114 +-
pandas/core/computation/expr.py | 356 +-
pandas/core/computation/expressions.py | 157 +-
pandas/core/computation/ops.py | 247 +-
pandas/core/computation/pytables.py | 213 +-
pandas/core/computation/scope.py | 103 +-
pandas/core/config_init.py | 362 +-
pandas/core/construction.py | 549 ++
pandas/core/dtypes/api.py | 55 +-
pandas/core/dtypes/base.py | 34 +-
pandas/core/dtypes/cast.py | 528 +-
pandas/core/dtypes/common.py | 349 +-
pandas/core/dtypes/concat.py | 272 +-
pandas/core/dtypes/dtypes.py | 318 +-
pandas/core/dtypes/generic.py | 126 +-
pandas/core/dtypes/inference.py | 82 +-
pandas/core/dtypes/missing.py | 139 +-
pandas/core/frame.py | 2783 +++++----
pandas/core/generic.py | 3952 +++++++-----
pandas/core/groupby/__init__.py | 5 +-
pandas/core/groupby/base.py | 233 +-
pandas/core/groupby/categorical.py | 15 +-
pandas/core/groupby/generic.py | 779 ++-
pandas/core/groupby/groupby.py | 725 ++-
pandas/core/groupby/grouper.py | 253 +-
pandas/core/groupby/ops.py | 405 +-
pandas/core/index.py | 28 +-
pandas/core/indexers.py | 236 +
pandas/core/indexes/accessors.py | 112 +-
pandas/core/indexes/api.py | 83 +-
pandas/core/indexes/base.py | 1793 +++---
pandas/core/indexes/category.py | 296 +-
pandas/core/indexes/datetimelike.py | 238 +-
pandas/core/indexes/datetimes.py | 678 +-
pandas/core/indexes/frozen.py | 27 +-
pandas/core/indexes/interval.py | 847 +--
pandas/core/indexes/multi.py | 1277 ++--
pandas/core/indexes/numeric.py | 234 +-
pandas/core/indexes/period.py | 443 +-
pandas/core/indexes/range.py | 354 +-
pandas/core/indexes/timedeltas.py | 326 +-
pandas/core/indexing.py | 1200 ++--
pandas/core/internals/__init__.py | 40 +-
pandas/core/internals/arrays.py | 55 -
pandas/core/internals/blocks.py | 1959 +++---
pandas/core/internals/concat.py | 161 +-
pandas/core/internals/construction.py | 432 +-
pandas/core/internals/managers.py | 836 ++-
pandas/core/missing.py | 436 +-
pandas/core/nanops.py | 346 +-
pandas/core/ops.py | 2354 -------
pandas/core/ops/__init__.py | 1234 ++++
pandas/core/ops/array_ops.py | 128 +
pandas/core/ops/docstrings.py | 675 ++
pandas/core/ops/invalid.py | 61 +
pandas/core/ops/methods.py | 249 +
pandas/core/ops/missing.py | 194 +
pandas/core/ops/roperator.py | 61 +
pandas/core/panel.py | 1576 -----
pandas/core/resample.py | 543 +-
pandas/core/reshape/concat.py | 269 +-
pandas/core/reshape/melt.py | 97 +-
pandas/core/reshape/merge.py | 989 +--
pandas/core/reshape/pivot.py | 300 +-
pandas/core/reshape/reshape.py | 349 +-
pandas/core/reshape/tile.py | 186 +-
pandas/core/reshape/util.py | 9 +-
pandas/core/series.py | 1633 +++--
pandas/core/sorting.py | 148 +-
pandas/core/sparse/frame.py | 455 +-
pandas/core/sparse/scipy_sparse.py | 51 +-
pandas/core/sparse/series.py | 390 +-
pandas/core/strings.py | 977 +--
pandas/core/tools/datetimes.py | 524 +-
pandas/core/tools/numeric.py | 50 +-
pandas/core/tools/timedeltas.py | 59 +-
pandas/core/util/hashing.py | 142 +-
pandas/core/window.py | 2656 --------
pandas/core/window/__init__.py | 3 +
pandas/core/window/common.py | 276 +
pandas/core/window/ewm.py | 388 ++
pandas/core/window/expanding.py | 260 +
pandas/core/window/rolling.py | 1939 ++++++
pandas/errors/__init__.py | 11 +-
pandas/io/api.py | 1 +
pandas/io/clipboard/__init__.py | 44 +-
pandas/io/clipboard/clipboards.py | 64 +-
pandas/io/clipboard/exceptions.py | 1 -
pandas/io/clipboard/windows.py | 49 +-
pandas/io/clipboards.py | 64 +-
pandas/io/common.py | 335 +-
pandas/io/date_converters.py | 15 +-
pandas/io/excel/__init__.py | 2 +-
pandas/io/excel/_base.py | 389 +-
pandas/io/excel/_odfreader.py | 180 +
pandas/io/excel/_openpyxl.py | 165 +-
pandas/io/excel/_util.py | 47 +-
pandas/io/excel/_xlrd.py | 52 +-
pandas/io/excel/_xlsxwriter.py | 240 +-
pandas/io/excel/_xlwt.py | 59 +-
pandas/io/feather_format.py | 100 +-
pandas/io/formats/console.py | 17 +-
pandas/io/formats/css.py | 156 +-
pandas/io/formats/csvs.py | 196 +-
pandas/io/formats/excel.py | 456 +-
pandas/io/formats/format.py | 1062 ++--
pandas/io/formats/html.py | 366 +-
pandas/io/formats/latex.py | 168 +-
pandas/io/formats/printing.py | 330 +-
pandas/io/formats/style.py | 463 +-
pandas/io/gbq.py | 100 +-
pandas/io/gcs.py | 19 +-
pandas/io/html.py | 317 +-
pandas/io/json/__init__.py | 15 +-
pandas/io/json/{json.py => _json.py} | 668 +-
.../io/json/{normalize.py => _normalize.py} | 187 +-
.../{table_schema.py => _table_schema.py} | 158 +-
pandas/io/msgpack/__init__.py | 4 +-
pandas/io/msgpack/_packer.pyi | 22 +
pandas/io/msgpack/_packer.pyx | 2 +-
pandas/io/msgpack/_unpacker.pyi | 59 +
pandas/io/msgpack/_unpacker.pyx | 19 +-
pandas/io/msgpack/exceptions.py | 1 -
pandas/io/packers.py | 832 +--
pandas/io/parquet.py | 204 +-
pandas/io/parsers.py | 1729 +++---
pandas/io/pickle.py | 18 +-
pandas/io/pytables.py | 2212 ++++---
pandas/io/s3.py | 40 +-
pandas/io/sas/sas.pyx | 9 +-
pandas/io/sas/sas7bdat.py | 326 +-
pandas/io/sas/sas_constants.py | 140 +-
pandas/io/sas/sas_xport.py | 190 +-
pandas/io/sas/sasreader.py | 52 +-
pandas/io/spss.py | 44 +
pandas/io/sql.py | 679 +-
pandas/io/stata.py | 1263 ++--
pandas/plotting/__init__.py | 113 +-
pandas/plotting/_compat.py | 20 -
pandas/plotting/_core.py | 3471 +++--------
pandas/plotting/_matplotlib/__init__.py | 85 +
pandas/plotting/_matplotlib/boxplot.py | 416 ++
pandas/plotting/_matplotlib/compat.py | 22 +
.../converter.py} | 474 +-
pandas/plotting/_matplotlib/core.py | 1527 +++++
pandas/plotting/_matplotlib/hist.py | 421 ++
pandas/plotting/_matplotlib/misc.py | 431 ++
.../{_style.py => _matplotlib/style.py} | 100 +-
.../timeseries.py} | 155 +-
.../{_tools.py => _matplotlib/tools.py} | 152 +-
pandas/plotting/_misc.py | 635 +-
pandas/testing.py | 5 +-
pandas/tests/api/test_api.py | 231 +-
pandas/tests/api/test_types.py | 72 +-
pandas/tests/arithmetic/conftest.py | 156 +-
pandas/tests/arithmetic/test_datetime64.py | 1869 +++---
pandas/tests/arithmetic/test_numeric.py | 697 ++-
pandas/tests/arithmetic/test_object.py | 193 +-
pandas/tests/arithmetic/test_period.py | 723 ++-
pandas/tests/arithmetic/test_timedelta64.py | 1185 ++--
pandas/tests/arrays/categorical/common.py | 6 +-
pandas/tests/arrays/categorical/test_algos.py | 78 +-
.../arrays/categorical/test_analytics.py | 144 +-
pandas/tests/arrays/categorical/test_api.py | 286 +-
.../arrays/categorical/test_constructors.py | 293 +-
.../tests/arrays/categorical/test_dtypes.py | 140 +-
.../tests/arrays/categorical/test_indexing.py | 173 +-
.../tests/arrays/categorical/test_missing.py | 42 +-
.../arrays/categorical/test_operators.py | 247 +-
pandas/tests/arrays/categorical/test_repr.py | 116 +-
.../tests/arrays/categorical/test_sorting.py | 38 +-
.../tests/arrays/categorical/test_subclass.py | 13 +-
.../tests/arrays/categorical/test_warnings.py | 12 +-
pandas/tests/arrays/interval/test_interval.py | 73 +-
pandas/tests/arrays/interval/test_ops.py | 56 +-
pandas/tests/arrays/sparse/test_accessor.py | 94 +-
.../tests/arrays/sparse/test_arithmetics.py | 594 +-
pandas/tests/arrays/sparse/test_array.py | 472 +-
pandas/tests/arrays/sparse/test_dtype.py | 182 +-
pandas/tests/arrays/sparse/test_libsparse.py | 266 +-
pandas/tests/arrays/test_array.py | 381 +-
pandas/tests/arrays/test_datetimelike.py | 268 +-
pandas/tests/arrays/test_datetimes.py | 217 +-
pandas/tests/arrays/test_integer.py | 384 +-
pandas/tests/arrays/test_numpy.py | 93 +-
pandas/tests/arrays/test_period.py | 194 +-
pandas/tests/arrays/test_timedeltas.py | 93 +-
pandas/tests/computation/test_compat.py | 18 +-
pandas/tests/computation/test_eval.py | 1209 ++--
pandas/tests/config/test_config.py | 384 +-
pandas/tests/config/test_localization.py | 8 +-
.../dtypes/cast/test_construct_from_scalar.py | 6 +-
.../dtypes/cast/test_construct_ndarray.py | 17 +-
.../dtypes/cast/test_construct_object_arr.py | 6 +-
.../tests/dtypes/cast/test_convert_objects.py | 7 +-
pandas/tests/dtypes/cast/test_downcast.py | 30 +-
.../dtypes/cast/test_find_common_type.py | 154 +-
.../dtypes/cast/test_infer_datetimelike.py | 13 +-
pandas/tests/dtypes/cast/test_infer_dtype.py | 93 +-
pandas/tests/dtypes/cast/test_promote.py | 862 +++
pandas/tests/dtypes/cast/test_upcast.py | 90 +-
pandas/tests/dtypes/test_common.py | 499 +-
pandas/tests/dtypes/test_concat.py | 100 +-
pandas/tests/dtypes/test_dtypes.py | 711 ++-
pandas/tests/dtypes/test_generic.py | 38 +-
pandas/tests/dtypes/test_inference.py | 1025 ++--
pandas/tests/dtypes/test_missing.py | 348 +-
.../extension/arrow/{bool.py => arrays.py} | 90 +-
pandas/tests/extension/arrow/test_bool.py | 20 +-
pandas/tests/extension/arrow/test_string.py | 13 +
pandas/tests/extension/base/__init__.py | 10 +-
pandas/tests/extension/base/base.py | 5 +-
pandas/tests/extension/base/constructors.py | 5 +-
pandas/tests/extension/base/dtype.py | 27 +-
pandas/tests/extension/base/getitem.py | 64 +-
pandas/tests/extension/base/groupby.py | 53 +-
pandas/tests/extension/base/interface.py | 32 +-
pandas/tests/extension/base/io.py | 13 +-
pandas/tests/extension/base/methods.py | 153 +-
pandas/tests/extension/base/missing.py | 55 +-
pandas/tests/extension/base/ops.py | 21 +-
pandas/tests/extension/base/printing.py | 6 +-
pandas/tests/extension/base/reduce.py | 13 +-
pandas/tests/extension/base/reshaping.py | 208 +-
pandas/tests/extension/base/setitem.py | 46 +-
pandas/tests/extension/conftest.py | 27 +-
pandas/tests/extension/decimal/__init__.py | 5 +-
pandas/tests/extension/decimal/array.py | 65 +-
.../tests/extension/decimal/test_decimal.py | 198 +-
pandas/tests/extension/json/__init__.py | 2 +-
pandas/tests/extension/json/array.py | 53 +-
pandas/tests/extension/json/test_json.py | 84 +-
pandas/tests/extension/test_categorical.py | 34 +-
pandas/tests/extension/test_common.py | 31 +-
pandas/tests/extension/test_datetime.py | 106 +-
pandas/tests/extension/test_external_block.py | 34 +-
pandas/tests/extension/test_integer.py | 59 +-
pandas/tests/extension/test_interval.py | 12 +-
pandas/tests/extension/test_numpy.py | 66 +-
pandas/tests/extension/test_period.py | 30 +-
pandas/tests/extension/test_sparse.py | 103 +-
pandas/tests/frame/common.py | 114 +-
pandas/tests/frame/conftest.py | 83 +-
pandas/tests/frame/test_alter_axes.py | 1317 ++--
pandas/tests/frame/test_analytics.py | 2009 +++---
pandas/tests/frame/test_api.py | 277 +-
pandas/tests/frame/test_apply.py | 1013 +--
pandas/tests/frame/test_arithmetic.py | 322 +-
pandas/tests/frame/test_asof.py | 79 +-
.../tests/frame/test_axis_select_reindex.py | 972 +--
pandas/tests/frame/test_block_internals.py | 464 +-
pandas/tests/frame/test_combine_concat.py | 802 +--
pandas/tests/frame/test_constructors.py | 2021 +++---
pandas/tests/frame/test_convert_to.py | 634 +-
pandas/tests/frame/test_dtypes.py | 1285 ++--
pandas/tests/frame/test_duplicates.py | 307 +-
pandas/tests/frame/test_explode.py | 120 +
pandas/tests/frame/test_indexing.py | 2454 ++++----
pandas/tests/frame/test_join.py | 183 +-
pandas/tests/frame/test_missing.py | 766 ++-
pandas/tests/frame/test_mutate_columns.py | 228 +-
pandas/tests/frame/test_nonunique_indexes.py | 497 +-
pandas/tests/frame/test_operators.py | 544 +-
pandas/tests/frame/test_period.py | 101 +-
pandas/tests/frame/test_quantile.py | 424 +-
pandas/tests/frame/test_query_eval.py | 711 +--
pandas/tests/frame/test_rank.py | 215 +-
pandas/tests/frame/test_replace.py | 1157 ++--
pandas/tests/frame/test_repr_info.py | 319 +-
pandas/tests/frame/test_reshape.py | 1021 +--
.../frame/test_sort_values_level_as_str.py | 71 +-
pandas/tests/frame/test_sorting.py | 622 +-
pandas/tests/frame/test_subclass.py | 586 +-
pandas/tests/frame/test_timeseries.py | 590 +-
pandas/tests/frame/test_timezones.py | 176 +-
pandas/tests/frame/test_to_csv.py | 929 +--
pandas/tests/frame/test_validate.py | 19 +-
pandas/tests/generic/test_frame.py | 220 +-
pandas/tests/generic/test_generic.py | 356 +-
.../generic/test_label_or_level_utils.py | 99 +-
pandas/tests/generic/test_series.py | 151 +-
.../tests/groupby/aggregate/test_aggregate.py | 455 +-
pandas/tests/groupby/aggregate/test_cython.py | 216 +-
pandas/tests/groupby/aggregate/test_other.py | 605 +-
pandas/tests/groupby/conftest.py | 116 +-
pandas/tests/groupby/test_apply.py | 451 +-
pandas/tests/groupby/test_bin_groupby.py | 69 +-
pandas/tests/groupby/test_categorical.py | 1120 ++--
pandas/tests/groupby/test_counting.py | 128 +-
pandas/tests/groupby/test_filters.py | 338 +-
pandas/tests/groupby/test_function.py | 1364 +++--
pandas/tests/groupby/test_groupby.py | 1218 ++--
pandas/tests/groupby/test_grouping.py | 675 +-
pandas/tests/groupby/test_index_as_string.py | 70 +-
pandas/tests/groupby/test_nth.py | 524 +-
pandas/tests/groupby/test_rank.py | 566 +-
pandas/tests/groupby/test_timegrouper.py | 851 +--
pandas/tests/groupby/test_transform.py | 823 ++-
pandas/tests/groupby/test_value_counts.py | 48 +-
pandas/tests/groupby/test_whitelist.py | 371 +-
pandas/tests/indexes/common.py | 215 +-
pandas/tests/indexes/conftest.py | 44 +-
pandas/tests/indexes/datetimelike.py | 17 +-
.../indexes/datetimes/test_arithmetic.py | 93 +-
pandas/tests/indexes/datetimes/test_astype.py | 283 +-
.../indexes/datetimes/test_construction.py | 842 ++-
.../indexes/datetimes/test_date_range.py | 706 ++-
.../tests/indexes/datetimes/test_datetime.py | 221 +-
.../indexes/datetimes/test_datetimelike.py | 9 +-
.../tests/indexes/datetimes/test_formats.py | 264 +-
.../tests/indexes/datetimes/test_indexing.py | 605 +-
pandas/tests/indexes/datetimes/test_misc.py | 297 +-
.../tests/indexes/datetimes/test_missing.py | 82 +-
pandas/tests/indexes/datetimes/test_ops.py | 312 +-
.../indexes/datetimes/test_partial_slicing.py | 394 +-
.../indexes/datetimes/test_scalar_compat.py | 272 +-
pandas/tests/indexes/datetimes/test_setops.py | 263 +-
.../tests/indexes/datetimes/test_timezones.py | 964 +--
pandas/tests/indexes/datetimes/test_tools.py | 2089 ++++---
pandas/tests/indexes/interval/test_astype.py | 139 +-
.../indexes/interval/test_construction.py | 259 +-
.../tests/indexes/interval/test_interval.py | 844 ++-
.../indexes/interval/test_interval_new.py | 310 +-
.../indexes/interval/test_interval_range.py | 229 +-
.../indexes/interval/test_interval_tree.py | 117 +-
pandas/tests/indexes/interval/test_setops.py | 61 +-
pandas/tests/indexes/multi/conftest.py | 59 +-
pandas/tests/indexes/multi/test_analytics.py | 203 +-
pandas/tests/indexes/multi/test_astype.py | 8 +-
pandas/tests/indexes/multi/test_compat.py | 10 +-
.../tests/indexes/multi/test_constructor.py | 481 +-
pandas/tests/indexes/multi/test_contains.py | 66 +-
pandas/tests/indexes/multi/test_conversion.py | 164 +-
pandas/tests/indexes/multi/test_copy.py | 35 +-
pandas/tests/indexes/multi/test_drop.py | 86 +-
pandas/tests/indexes/multi/test_duplicates.py | 156 +-
.../tests/indexes/multi/test_equivalence.py | 38 +-
pandas/tests/indexes/multi/test_format.py | 184 +-
pandas/tests/indexes/multi/test_get_set.py | 154 +-
pandas/tests/indexes/multi/test_indexing.py | 224 +-
pandas/tests/indexes/multi/test_integrity.py | 127 +-
pandas/tests/indexes/multi/test_join.py | 50 +-
pandas/tests/indexes/multi/test_missing.py | 61 +-
pandas/tests/indexes/multi/test_monotonic.py | 131 +-
pandas/tests/indexes/multi/test_names.py | 57 +-
.../indexes/multi/test_partial_indexing.py | 46 +-
pandas/tests/indexes/multi/test_reindex.py | 43 +-
pandas/tests/indexes/multi/test_reshape.py | 100 +-
pandas/tests/indexes/multi/test_set_ops.py | 77 +-
pandas/tests/indexes/multi/test_sorting.py | 140 +-
.../tests/indexes/period/test_arithmetic.py | 87 +-
pandas/tests/indexes/period/test_asfreq.py | 203 +-
pandas/tests/indexes/period/test_astype.py | 78 +-
.../tests/indexes/period/test_construction.py | 398 +-
pandas/tests/indexes/period/test_formats.py | 191 +-
pandas/tests/indexes/period/test_indexing.py | 502 +-
pandas/tests/indexes/period/test_ops.py | 220 +-
.../indexes/period/test_partial_slicing.py | 121 +-
pandas/tests/indexes/period/test_period.py | 358 +-
.../tests/indexes/period/test_period_range.py | 70 +-
.../indexes/period/test_scalar_compat.py | 10 +-
pandas/tests/indexes/period/test_setops.py | 381 +-
pandas/tests/indexes/period/test_tools.py | 322 +-
pandas/tests/indexes/test_base.py | 1835 +++---
pandas/tests/indexes/test_category.py | 709 ++-
pandas/tests/indexes/test_common.py | 90 +-
pandas/tests/indexes/test_frozen.py | 4 +-
pandas/tests/indexes/test_numeric.py | 549 +-
pandas/tests/indexes/test_numpy_compat.py | 68 +-
pandas/tests/indexes/test_range.py | 458 +-
pandas/tests/indexes/test_setops.py | 67 +-
.../indexes/timedeltas/test_arithmetic.py | 185 +-
.../tests/indexes/timedeltas/test_astype.py | 79 +-
.../indexes/timedeltas/test_construction.py | 145 +-
.../tests/indexes/timedeltas/test_formats.py | 108 +-
.../tests/indexes/timedeltas/test_indexing.py | 265 +-
pandas/tests/indexes/timedeltas/test_ops.py | 152 +-
.../timedeltas/test_partial_slicing.py | 59 +-
.../indexes/timedeltas/test_scalar_compat.py | 50 +-
.../tests/indexes/timedeltas/test_setops.py | 98 +-
.../indexes/timedeltas/test_timedelta.py | 189 +-
.../timedeltas/test_timedelta_range.py | 51 +-
pandas/tests/indexes/timedeltas/test_tools.py | 156 +-
pandas/tests/indexing/common.py | 158 +-
pandas/tests/indexing/conftest.py | 27 +-
.../tests/indexing/interval/test_interval.py | 226 +-
.../indexing/interval/test_interval_new.py | 110 +-
pandas/tests/indexing/multiindex/conftest.py | 23 +-
.../multiindex/test_chaining_and_caching.py | 29 +-
.../indexing/multiindex/test_datetime.py | 8 +-
.../tests/indexing/multiindex/test_getitem.py | 210 +-
pandas/tests/indexing/multiindex/test_iloc.py | 82 +-
.../indexing/multiindex/test_indexing_slow.py | 53 +-
pandas/tests/indexing/multiindex/test_ix.py | 63 +-
pandas/tests/indexing/multiindex/test_loc.py | 289 +-
.../indexing/multiindex/test_multiindex.py | 82 +-
.../tests/indexing/multiindex/test_partial.py | 143 +-
.../tests/indexing/multiindex/test_set_ops.py | 25 +-
.../tests/indexing/multiindex/test_setitem.py | 454 +-
.../tests/indexing/multiindex/test_slice.py | 574 +-
.../tests/indexing/multiindex/test_sorted.py | 55 +-
pandas/tests/indexing/multiindex/test_xs.py | 192 +-
pandas/tests/indexing/test_callable.py | 167 +-
pandas/tests/indexing/test_categorical.py | 521 +-
.../indexing/test_chaining_and_caching.py | 268 +-
pandas/tests/indexing/test_coercion.py | 919 +--
pandas/tests/indexing/test_datetime.py | 253 +-
pandas/tests/indexing/test_floats.py | 852 ++-
pandas/tests/indexing/test_iloc.py | 437 +-
pandas/tests/indexing/test_indexing.py | 1086 ++--
.../tests/indexing/test_indexing_engines.py | 33 +-
pandas/tests/indexing/test_indexing_slow.py | 5 +-
pandas/tests/indexing/test_ix.py | 305 +-
pandas/tests/indexing/test_loc.py | 952 ++-
pandas/tests/indexing/test_partial.py | 324 +-
pandas/tests/indexing/test_scalar.py | 123 +-
pandas/tests/indexing/test_timedelta.py | 111 +-
pandas/tests/internals/test_internals.py | 941 +--
pandas/tests/io/conftest.py | 41 +-
pandas/tests/io/data/blank.ods | Bin 0 -> 2813 bytes
pandas/tests/io/data/blank_with_header.ods | Bin 0 -> 2893 bytes
pandas/tests/io/data/invalid_value_type.ods | Bin 0 -> 8502 bytes
pandas/tests/io/data/labelled-num-na.sav | Bin 0 -> 535 bytes
pandas/tests/io/data/labelled-num.sav | Bin 0 -> 507 bytes
pandas/tests/io/data/labelled-str.sav | Bin 0 -> 525 bytes
.../0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack | Bin 4445 -> 0 bytes
.../0.16.2_AMD64_windows_2.7.10.msgpack | Bin 4745 -> 0 bytes
.../0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack | Bin 4745 -> 0 bytes
.../0.16.2_x86_64_darwin_2.7.10.msgpack | Bin 6196 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack | Bin 4745 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack | Bin 6196 -> 0 bytes
.../0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack | Bin 4684 -> 0 bytes
.../0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack | Bin 4684 -> 0 bytes
.../0.17.0_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack | Bin 9300 -> 0 bytes
.../0.17.0_x86_64_darwin_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack | Bin 9300 -> 0 bytes
.../0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack | Bin 9300 -> 0 bytes
.../0.17.1_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack | Bin 9300 -> 0 bytes
.../0.17.1_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack | Bin 9300 -> 0 bytes
.../0.17.1_x86_64_darwin_2.7.11.msgpack | Bin 11323 -> 0 bytes
.../0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack | Bin 9300 -> 0 bytes
.../0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack | Bin 10307 -> 0 bytes
.../0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack | Bin 9300 -> 0 bytes
.../0.18.0_AMD64_windows_2.7.11.msgpack | Bin 8386 -> 0 bytes
.../0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack | Bin 8341 -> 0 bytes
.../0.18.0_x86_64_darwin_2.7.11.msgpack | Bin 8386 -> 0 bytes
.../0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack | Bin 8341 -> 0 bytes
.../0.18.1_x86_64_darwin_2.7.12.msgpack | Bin 119258 -> 0 bytes
.../0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack | Bin 119206 -> 0 bytes
.../0.19.2_x86_64_darwin_2.7.12.msgpack | Bin 12325 -> 0 bytes
.../0.20.3_x86_64_darwin_3.5.2.msgpack} | Bin 119196 -> 118654 bytes
.../0.10.1/AMD64_windows_2.7.3.pickle | Bin 4381 -> 0 bytes
.../0.10.1/x86_64_linux_2.7.3.pickle | Bin 4338 -> 0 bytes
.../0.11.0/0.11.0_x86_64_linux_3.3.0.pickle | Bin 8978 -> 0 bytes
.../0.11.0/x86_64_linux_2.7.3.pickle | Bin 4338 -> 0 bytes
.../0.11.0/x86_64_linux_3.3.0.pickle | Bin 5822 -> 0 bytes
.../0.12.0/0.12.0_AMD64_windows_2.7.3.pickle | Bin 8692 -> 0 bytes
.../0.12.0/0.12.0_x86_64_linux_2.7.3.pickle | Bin 8768 -> 0 bytes
.../0.13.0/0.13.0_AMD64_windows_2.7.3.pickle | Bin 7208 -> 0 bytes
.../0.13.0/0.13.0_i686_linux_2.6.5.pickle | Bin 7143 -> 0 bytes
.../0.13.0/0.13.0_i686_linux_2.7.3.pickle | Bin 7123 -> 0 bytes
.../0.13.0/0.13.0_i686_linux_3.2.3.pickle | Bin 10019 -> 0 bytes
.../0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle | Bin 7278 -> 0 bytes
.../0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle | Bin 7445 -> 0 bytes
.../0.13.0/0.13.0_x86_64_linux_2.7.3.pickle | Bin 7278 -> 0 bytes
.../0.13.0/0.13.0_x86_64_linux_2.7.8.pickle | Bin 7639 -> 0 bytes
.../0.13.0/0.13.0_x86_64_linux_3.3.0.pickle | Bin 10049 -> 0 bytes
.../0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle | Bin 8159 -> 0 bytes
.../0.14.0/0.14.0_x86_64_linux_2.7.8.pickle | Bin 9309 -> 0 bytes
.../0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle | Bin 191074 -> 0 bytes
.../0.14.1/0.14.1_x86_64_linux_2.7.8.pickle | Bin 11930 -> 0 bytes
.../0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle | Bin 127687 -> 0 bytes
.../0.15.0/0.15.0_x86_64_linux_2.7.8.pickle | Bin 15162 -> 0 bytes
.../0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle | Bin 14892 -> 0 bytes
.../0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle | Bin 15013 -> 0 bytes
.../0.16.2/0.16.2_AMD64_windows_2.7.10.pickle | Bin 15173 -> 0 bytes
.../0.16.2/0.16.2_AMD64_windows_2.7.14.pickle | Bin 132692 -> 0 bytes
.../0.16.2/0.16.2_AMD64_windows_3.4.3.pickle | Bin 13766 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle | Bin 16598 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle | Bin 15013 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle | Bin 15444 -> 0 bytes
.../0.16.2/0.16.2_x86_64_linux_2.7.10.pickle | Bin 14893 -> 0 bytes
.../0.16.2/0.16.2_x86_64_linux_3.4.3.pickle | Bin 14116 -> 0 bytes
.../0.17.0/0.17.0_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes
.../0.17.0/0.17.0_AMD64_windows_3.4.4.pickle | Bin 16236 -> 0 bytes
.../0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle | Bin 18089 -> 0 bytes
.../0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle | Bin 16026 -> 0 bytes
.../0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle | Bin 129175 -> 0 bytes
.../0.17.0/0.17.0_x86_64_linux_2.7.11.pickle | Bin 18089 -> 0 bytes
.../0.17.0/0.17.0_x86_64_linux_3.4.4.pickle | Bin 16581 -> 0 bytes
.../0.17.0/0.17.1_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes
.../0.17.1/0.17.1_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes
.../0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle | Bin 18089 -> 0 bytes
.../0.18.0/0.18.0_AMD64_windows_2.7.11.pickle | Bin 16875 -> 0 bytes
.../0.18.0/0.18.0_AMD64_windows_3.5.1.pickle | Bin 14674 -> 0 bytes
.../0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle | Bin 16718 -> 0 bytes
.../0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle | Bin 14671 -> 0 bytes
.../0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle | Bin 127584 -> 0 bytes
.../0.19.2/0.19.2_AMD64_windows_2.7.14.pickle | Bin 133468 -> 0 bytes
.../0.19.2/0.19.2_x86_64_darwin_2.7.12.pickle | Bin 127525 -> 0 bytes
.../0.19.2/0.19.2_x86_64_darwin_2.7.14.pickle | Bin 132762 -> 0 bytes
.../0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle | Bin 126076 -> 0 bytes
.../0.20.3/0.20.3_x86_64_darwin_2.7.14.pickle | Bin 132857 -> 0 bytes
.../0.20.3_x86_64_darwin_3.5.2.pickle} | Bin 127853 -> 127923 bytes
pandas/tests/io/data/test1.ods | Bin 0 -> 4440 bytes
pandas/tests/io/data/test1.xlsm | Bin 13967 -> 12091 bytes
pandas/tests/io/data/test1.xlsx | Bin 13878 -> 12074 bytes
pandas/tests/io/data/test2.ods | Bin 0 -> 2877 bytes
pandas/tests/io/data/test3.ods | Bin 0 -> 2889 bytes
pandas/tests/io/data/test4.ods | Bin 0 -> 2992 bytes
pandas/tests/io/data/test4.xls | Bin 25600 -> 25600 bytes
pandas/tests/io/data/test4.xlsm | Bin 8022 -> 8360 bytes
pandas/tests/io/data/test4.xlsx | Bin 28216 -> 8344 bytes
pandas/tests/io/data/test5.ods | Bin 0 -> 2906 bytes
pandas/tests/io/data/test5.xls | Bin 20480 -> 20480 bytes
pandas/tests/io/data/test5.xlsm | Bin 8017 -> 8642 bytes
pandas/tests/io/data/test5.xlsx | Bin 8002 -> 8626 bytes
pandas/tests/io/data/test_converters.ods | Bin 0 -> 3287 bytes
.../tests/io/data/test_index_name_pre17.ods | Bin 0 -> 3699 bytes
pandas/tests/io/data/test_multisheet.ods | Bin 0 -> 3797 bytes
pandas/tests/io/data/test_multisheet.xls | Bin 24576 -> 24576 bytes
pandas/tests/io/data/test_multisheet.xlsm | Bin 11148 -> 11313 bytes
pandas/tests/io/data/test_multisheet.xlsx | Bin 11131 -> 11296 bytes
pandas/tests/io/data/test_squeeze.ods | Bin 0 -> 3218 bytes
pandas/tests/io/data/test_squeeze.xls | Bin 26112 -> 26112 bytes
pandas/tests/io/data/test_squeeze.xlsm | Bin 8990 -> 9122 bytes
pandas/tests/io/data/test_squeeze.xlsx | Bin 8972 -> 9106 bytes
pandas/tests/io/data/test_types.ods | Bin 0 -> 3489 bytes
pandas/tests/io/data/test_types.xlsm | Bin 8733 -> 9042 bytes
pandas/tests/io/data/test_types.xlsx | Bin 33769 -> 9010 bytes
pandas/tests/io/data/testdateoverflow.ods | Bin 0 -> 3422 bytes
pandas/tests/io/data/testdtype.ods | Bin 0 -> 3196 bytes
pandas/tests/io/data/testmultiindex.ods | Bin 0 -> 5575 bytes
pandas/tests/io/data/testskiprows.ods | Bin 0 -> 3235 bytes
pandas/tests/io/data/times_1900.ods | Bin 0 -> 3181 bytes
pandas/tests/io/data/times_1904.ods | Bin 0 -> 3215 bytes
pandas/tests/io/data/umlauts.sav | Bin 0 -> 567 bytes
pandas/tests/io/data/writertable.odt | Bin 0 -> 10313 bytes
pandas/tests/io/excel/__init__.py | 0
pandas/tests/io/excel/conftest.py | 37 +
pandas/tests/io/excel/test_odf.py | 38 +
pandas/tests/io/excel/test_openpyxl.py | 103 +
pandas/tests/io/excel/test_readers.py | 952 +++
pandas/tests/io/excel/test_style.py | 169 +
pandas/tests/io/excel/test_writers.py | 1287 ++++
pandas/tests/io/excel/test_xlrd.py | 42 +
pandas/tests/io/excel/test_xlsxwriter.py | 64 +
pandas/tests/io/excel/test_xlwt.py | 67 +
pandas/tests/io/formats/test_console.py | 50 +-
pandas/tests/io/formats/test_css.py | 297 +-
.../tests/io/formats/test_eng_formatting.py | 161 +-
pandas/tests/io/formats/test_format.py | 2820 +++++----
pandas/tests/io/formats/test_printing.py | 111 +-
pandas/tests/io/formats/test_style.py | 1794 +++---
pandas/tests/io/formats/test_to_csv.py | 519 +-
pandas/tests/io/formats/test_to_excel.py | 471 +-
pandas/tests/io/formats/test_to_html.py | 640 +-
pandas/tests/io/formats/test_to_latex.py | 214 +-
.../tests/io/generate_legacy_storage_files.py | 439 +-
pandas/tests/io/json/test_compression.py | 43 +-
.../tests/io/json/test_json_table_schema.py | 854 +--
pandas/tests/io/json/test_normalize.py | 734 ++-
pandas/tests/io/json/test_pandas.py | 1407 +++--
pandas/tests/io/json/test_readlines.py | 83 +-
pandas/tests/io/json/test_ujson.py | 512 +-
pandas/tests/io/msgpack/test_buffer.py | 11 +-
pandas/tests/io/msgpack/test_case.py | 100 +-
pandas/tests/io/msgpack/test_except.py | 11 +-
pandas/tests/io/msgpack/test_extension.py | 50 +-
pandas/tests/io/msgpack/test_format.py | 101 +-
pandas/tests/io/msgpack/test_limits.py | 15 +-
pandas/tests/io/msgpack/test_newspec.py | 58 +-
pandas/tests/io/msgpack/test_obj.py | 33 +-
pandas/tests/io/msgpack/test_pack.py | 86 +-
pandas/tests/io/msgpack/test_read_size.py | 42 +-
pandas/tests/io/msgpack/test_seq.py | 2 +-
pandas/tests/io/msgpack/test_sequnpack.py | 77 +-
pandas/tests/io/msgpack/test_subtype.py | 2 +-
pandas/tests/io/msgpack/test_unpack.py | 20 +-
pandas/tests/io/msgpack/test_unpack_raw.py | 10 +-
pandas/tests/io/parser/conftest.py | 12 +-
pandas/tests/io/parser/test_c_parser_only.py | 289 +-
pandas/tests/io/parser/test_comment.py | 41 +-
pandas/tests/io/parser/test_common.py | 1114 ++--
pandas/tests/io/parser/test_compression.py | 22 +-
pandas/tests/io/parser/test_converters.py | 42 +-
pandas/tests/io/parser/test_dialect.py | 65 +-
pandas/tests/io/parser/test_dtypes.py | 319 +-
pandas/tests/io/parser/test_header.py | 404 +-
pandas/tests/io/parser/test_index_col.py | 111 +-
pandas/tests/io/parser/test_mangle_dupes.py | 104 +-
pandas/tests/io/parser/test_multi_thread.py | 37 +-
pandas/tests/io/parser/test_na_values.py | 384 +-
pandas/tests/io/parser/test_network.py | 126 +-
pandas/tests/io/parser/test_parse_dates.py | 1358 ++--
.../io/parser/test_python_parser_only.py | 96 +-
pandas/tests/io/parser/test_quoting.py | 88 +-
pandas/tests/io/parser/test_read_fwf.py | 237 +-
pandas/tests/io/parser/test_skiprows.py | 182 +-
pandas/tests/io/parser/test_textreader.py | 259 +-
pandas/tests/io/parser/test_unsupported.py | 61 +-
pandas/tests/io/parser/test_usecols.py | 374 +-
pandas/tests/io/pytables/__init__.py | 0
pandas/tests/io/pytables/test_compat.py | 76 +
pandas/tests/io/pytables/test_pytables.py | 5448 +++++++++++++++++
.../io/pytables/test_pytables_missing.py | 14 +
pandas/tests/io/sas/test_sas.py | 11 +-
pandas/tests/io/sas/test_sas7bdat.py | 88 +-
pandas/tests/io/sas/test_xport.py | 17 +-
pandas/tests/io/test_clipboard.py | 191 +-
pandas/tests/io/test_common.py | 291 +-
pandas/tests/io/test_compression.py | 125 +-
pandas/tests/io/test_date_converters.py | 13 +-
pandas/tests/io/test_excel.py | 2481 --------
pandas/tests/io/test_feather.py | 106 +-
pandas/tests/io/test_gbq.py | 70 +-
pandas/tests/io/test_gcs.py | 65 +-
pandas/tests/io/test_html.py | 665 +-
pandas/tests/io/test_packers.py | 591 +-
pandas/tests/io/test_parquet.py | 413 +-
pandas/tests/io/test_pickle.py | 149 +-
pandas/tests/io/test_pytables.py | 5154 ----------------
pandas/tests/io/test_s3.py | 8 +-
pandas/tests/io/test_spss.py | 73 +
pandas/tests/io/test_sql.py | 1766 +++---
pandas/tests/io/test_stata.py | 1482 +++--
pandas/tests/plotting/common.py | 171 +-
pandas/tests/plotting/test_backend.py | 89 +
pandas/tests/plotting/test_boxplot_method.py | 307 +-
pandas/tests/plotting/test_converter.py | 186 +-
pandas/tests/plotting/test_datetimelike.py | 653 +-
pandas/tests/plotting/test_frame.py | 1905 +++---
pandas/tests/plotting/test_groupby.py | 38 +-
pandas/tests/plotting/test_hist_method.py | 162 +-
pandas/tests/plotting/test_misc.py | 357 +-
pandas/tests/plotting/test_series.py | 362 +-
pandas/tests/reductions/test_reductions.py | 535 +-
.../tests/reductions/test_stat_reductions.py | 145 +-
pandas/tests/resample/conftest.py | 36 +-
pandas/tests/resample/test_base.py | 95 +-
pandas/tests/resample/test_datetime_index.py | 1277 ++--
pandas/tests/resample/test_period_index.py | 819 +--
pandas/tests/resample/test_resample_api.py | 522 +-
.../tests/resample/test_resampler_grouper.py | 238 +-
pandas/tests/resample/test_time_grouper.py | 241 +-
pandas/tests/resample/test_timedelta.py | 110 +-
pandas/tests/reshape/merge/test_join.py | 759 +--
pandas/tests/reshape/merge/test_merge.py | 2279 ++++---
pandas/tests/reshape/merge/test_merge_asof.py | 1582 +++--
.../merge/test_merge_index_as_string.py | 99 +-
.../tests/reshape/merge/test_merge_ordered.py | 100 +-
pandas/tests/reshape/merge/test_multi.py | 914 +--
pandas/tests/reshape/test_concat.py | 2042 +++---
pandas/tests/reshape/test_cut.py | 366 +-
pandas/tests/reshape/test_melt.py | 1269 ++--
pandas/tests/reshape/test_pivot.py | 2926 +++++----
pandas/tests/reshape/test_qcut.py | 131 +-
pandas/tests/reshape/test_reshape.py | 579 +-
.../tests/reshape/test_union_categoricals.py | 228 +-
pandas/tests/reshape/test_util.py | 16 +-
pandas/tests/scalar/interval/test_interval.py | 123 +-
pandas/tests/scalar/interval/test_ops.py | 26 +-
pandas/tests/scalar/period/test_asfreq.py | 1139 ++--
pandas/tests/scalar/period/test_period.py | 1224 ++--
pandas/tests/scalar/test_nat.py | 328 +-
.../tests/scalar/timedelta/test_arithmetic.py | 243 +-
.../scalar/timedelta/test_construction.py | 285 +-
pandas/tests/scalar/timedelta/test_formats.py | 49 +-
.../tests/scalar/timedelta/test_timedelta.py | 655 +-
.../tests/scalar/timestamp/test_arithmetic.py | 63 +-
.../scalar/timestamp/test_comparisons.py | 51 +-
.../tests/scalar/timestamp/test_rendering.py | 45 +-
.../tests/scalar/timestamp/test_timestamp.py | 640 +-
.../tests/scalar/timestamp/test_timezones.py | 350 +-
.../tests/scalar/timestamp/test_unary_ops.py | 299 +-
pandas/tests/series/common.py | 7 +-
pandas/tests/series/conftest.py | 6 +-
pandas/tests/series/indexing/conftest.py | 2 +-
.../tests/series/indexing/test_alter_index.py | 248 +-
pandas/tests/series/indexing/test_boolean.py | 196 +-
pandas/tests/series/indexing/test_callable.py | 16 +-
pandas/tests/series/indexing/test_datetime.py | 347 +-
pandas/tests/series/indexing/test_indexing.py | 413 +-
pandas/tests/series/indexing/test_loc.py | 32 +-
pandas/tests/series/indexing/test_numeric.py | 155 +-
pandas/tests/series/test_alter_axes.py | 234 +-
pandas/tests/series/test_analytics.py | 957 +--
pandas/tests/series/test_api.py | 323 +-
pandas/tests/series/test_apply.py | 562 +-
pandas/tests/series/test_arithmetic.py | 76 +-
pandas/tests/series/test_asof.py | 58 +-
pandas/tests/series/test_block_internals.py | 12 +-
pandas/tests/series/test_combine_concat.py | 296 +-
pandas/tests/series/test_constructors.py | 790 +--
pandas/tests/series/test_datetime_values.py | 539 +-
pandas/tests/series/test_dtypes.py | 370 +-
pandas/tests/series/test_duplicates.py | 82 +-
pandas/tests/series/test_explode.py | 113 +
pandas/tests/series/test_internals.py | 119 +-
pandas/tests/series/test_io.py | 143 +-
pandas/tests/series/test_missing.py | 1253 ++--
pandas/tests/series/test_operators.py | 324 +-
pandas/tests/series/test_period.py | 129 +-
pandas/tests/series/test_quantile.py | 122 +-
pandas/tests/series/test_rank.py | 463 +-
pandas/tests/series/test_replace.py | 140 +-
pandas/tests/series/test_repr.py | 155 +-
pandas/tests/series/test_sorting.py | 100 +-
pandas/tests/series/test_subclass.py | 60 +-
pandas/tests/series/test_timeseries.py | 572 +-
pandas/tests/series/test_timezones.py | 235 +-
pandas/tests/series/test_ufunc.py | 305 +
pandas/tests/series/test_validate.py | 9 +-
pandas/tests/sparse/frame/conftest.py | 33 +-
pandas/tests/sparse/frame/test_analytics.py | 4 +-
pandas/tests/sparse/frame/test_apply.py | 41 +-
pandas/tests/sparse/frame/test_frame.py | 1075 ++--
pandas/tests/sparse/frame/test_indexing.py | 74 +-
pandas/tests/sparse/frame/test_to_csv.py | 10 +-
.../tests/sparse/frame/test_to_from_scipy.py | 61 +-
pandas/tests/sparse/series/test_indexing.py | 82 +-
pandas/tests/sparse/series/test_series.py | 839 +--
pandas/tests/sparse/test_combine_concat.py | 231 +-
pandas/tests/sparse/test_format.py | 127 +-
pandas/tests/sparse/test_groupby.py | 56 +-
pandas/tests/sparse/test_indexing.py | 754 ++-
pandas/tests/sparse/test_pivot.py | 67 +-
pandas/tests/sparse/test_reshape.py | 4 +-
pandas/tests/test_algos.py | 1363 +++--
pandas/tests/test_base.py | 798 ++-
pandas/tests/test_common.py | 68 +-
pandas/tests/test_downstream.py | 73 +-
pandas/tests/test_errors.py | 24 +-
pandas/tests/test_expressions.py | 298 +-
pandas/tests/test_join.py | 196 +-
pandas/tests/test_lib.py | 37 +-
pandas/tests/test_multilevel.py | 1656 ++---
pandas/tests/test_nanops.py | 815 ++-
pandas/tests/test_optional_dependency.py | 52 +
pandas/tests/test_register_accessor.py | 46 +-
pandas/tests/test_sorting.py | 275 +-
pandas/tests/test_strings.py | 2606 ++++----
pandas/tests/test_take.py | 204 +-
pandas/tests/test_window.py | 4109 -------------
pandas/tests/tools/test_numeric.py | 371 +-
.../tseries/frequencies/test_freq_code.py | 184 +-
.../tseries/frequencies/test_inference.py | 326 +-
.../tseries/frequencies/test_to_offset.py | 184 +-
pandas/tests/tseries/holiday/test_calendar.py | 38 +-
pandas/tests/tseries/holiday/test_federal.py | 34 +-
pandas/tests/tseries/holiday/test_holiday.py | 289 +-
.../tests/tseries/holiday/test_observance.py | 70 +-
pandas/tests/tseries/offsets/common.py | 18 +-
pandas/tests/tseries/offsets/conftest.py | 10 +-
pandas/tests/tseries/offsets/test_fiscal.py | 613 +-
pandas/tests/tseries/offsets/test_offsets.py | 3989 +++++++-----
.../offsets/test_offsets_properties.py | 68 +-
pandas/tests/tseries/offsets/test_ticks.py | 174 +-
.../tests/tseries/offsets/test_yqm_offsets.py | 1624 +++--
pandas/tests/tslibs/test_api.py | 66 +-
pandas/tests/tslibs/test_array_to_datetime.py | 101 +-
pandas/tests/tslibs/test_ccalendar.py | 15 +-
pandas/tests/tslibs/test_conversion.py | 41 +-
pandas/tests/tslibs/test_fields.py | 31 +
pandas/tests/tslibs/test_libfrequencies.py | 146 +-
pandas/tests/tslibs/test_liboffsets.py | 162 +-
pandas/tests/tslibs/test_normalize_date.py | 25 +-
pandas/tests/tslibs/test_parse_iso8601.py | 76 +-
pandas/tests/tslibs/test_parsing.py | 194 +-
pandas/tests/tslibs/test_period_asfreq.py | 125 +-
pandas/tests/tslibs/test_timedeltas.py | 21 +-
pandas/tests/tslibs/test_timezones.py | 30 +-
pandas/tests/util/test_assert_almost_equal.py | 176 +-
.../util/test_assert_categorical_equal.py | 12 +-
.../util/test_assert_extension_array_equal.py | 25 +-
pandas/tests/util/test_assert_frame_equal.py | 164 +-
pandas/tests/util/test_assert_index_equal.py | 42 +-
.../util/test_assert_interval_array_equal.py | 13 +-
.../util/test_assert_numpy_array_equal.py | 46 +-
.../util/test_assert_produces_warning.py | 9 +-
pandas/tests/util/test_assert_series_equal.py | 75 +-
pandas/tests/util/test_deprecate.py | 27 +-
pandas/tests/util/test_deprecate_kwarg.py | 6 +-
pandas/tests/util/test_hashing.py | 176 +-
pandas/tests/util/test_move.py | 1 +
pandas/tests/util/test_safe_import.py | 13 +-
pandas/tests/util/test_util.py | 23 +-
pandas/tests/util/test_validate_args.py | 27 +-
.../util/test_validate_args_and_kwargs.py | 56 +-
pandas/tests/util/test_validate_kwargs.py | 18 +-
pandas/tests/window/__init__.py | 0
pandas/tests/window/common.py | 23 +
pandas/tests/window/conftest.py | 49 +
pandas/tests/window/test_api.py | 367 ++
pandas/tests/window/test_dtypes.py | 242 +
pandas/tests/window/test_ewm.py | 70 +
pandas/tests/window/test_expanding.py | 115 +
pandas/tests/window/test_grouper.py | 176 +
pandas/tests/window/test_moments.py | 2562 ++++++++
pandas/tests/window/test_pairwise.py | 183 +
pandas/tests/window/test_rolling.py | 336 +
pandas/tests/window/test_timeseries_window.py | 692 +++
pandas/tests/window/test_window.py | 76 +
pandas/tseries/converter.py | 34 +-
pandas/tseries/frequencies.py | 155 +-
pandas/tseries/holiday.py | 135 +-
pandas/tseries/offsets.py | 1204 ++--
pandas/tseries/plotting.py | 2 +-
pandas/util/__init__.py | 3 +-
pandas/util/_decorators.py | 238 +-
pandas/util/_depr_module.py | 32 +-
pandas/util/_doctools.py | 72 +-
pandas/util/_print_versions.py | 157 +-
pandas/util/_test_decorators.py | 150 +-
pandas/util/_tester.py | 6 +-
pandas/util/_validators.py | 96 +-
pandas/util/testing.py | 1185 ++--
requirements-dev.txt | 12 +-
scripts/download_wheels.py | 20 +-
scripts/find_commits_touching_func.py | 131 +-
scripts/generate_pip_deps_from_conda.py | 61 +-
scripts/merge-pr.py | 146 +-
scripts/tests/conftest.py | 7 +-
scripts/tests/test_validate_docstrings.py | 719 ++-
scripts/validate_docstrings.py | 714 ++-
setup.cfg | 45 +-
setup.py | 828 +--
versioneer.py | 213 +-
1094 files changed, 160096 insertions(+), 123771 deletions(-)
create mode 100644 .github/SECURITY.md
create mode 100644 .pre-commit-config.yaml
create mode 100644 LICENSES/HAVEN_LICENSE
create mode 100644 LICENSES/HAVEN_MIT
create mode 100644 asv_bench/benchmarks/index_cached_properties.py
delete mode 100755 ci/build_docs.sh
create mode 100755 ci/check_git_tags.sh
create mode 100644 ci/deps/azure-36-32bit.yaml
delete mode 100644 ci/deps/travis-36-doc.yaml
create mode 100644 doc/source/development/roadmap.rst
create mode 100644 doc/source/whatsnew/v0.25.1.rst
create mode 100644 doc/source/whatsnew/v0.25.2.rst
create mode 100644 doc/source/whatsnew/v1.0.0.rst
delete mode 100644 mypy.ini
create mode 100644 pandas/_libs/lib.pxd
create mode 100644 pandas/compat/_optional.py
delete mode 100644 pandas/core/arrays/array_.py
create mode 100644 pandas/core/construction.py
create mode 100644 pandas/core/indexers.py
delete mode 100644 pandas/core/internals/arrays.py
delete mode 100644 pandas/core/ops.py
create mode 100644 pandas/core/ops/__init__.py
create mode 100644 pandas/core/ops/array_ops.py
create mode 100644 pandas/core/ops/docstrings.py
create mode 100644 pandas/core/ops/invalid.py
create mode 100644 pandas/core/ops/methods.py
create mode 100644 pandas/core/ops/missing.py
create mode 100644 pandas/core/ops/roperator.py
delete mode 100644 pandas/core/panel.py
delete mode 100644 pandas/core/window.py
create mode 100644 pandas/core/window/__init__.py
create mode 100644 pandas/core/window/common.py
create mode 100644 pandas/core/window/ewm.py
create mode 100644 pandas/core/window/expanding.py
create mode 100644 pandas/core/window/rolling.py
create mode 100644 pandas/io/excel/_odfreader.py
rename pandas/io/json/{json.py => _json.py} (62%)
rename pandas/io/json/{normalize.py => _normalize.py} (56%)
rename pandas/io/json/{table_schema.py => _table_schema.py} (69%)
create mode 100644 pandas/io/msgpack/_packer.pyi
create mode 100644 pandas/io/msgpack/_unpacker.pyi
create mode 100644 pandas/io/spss.py
delete mode 100644 pandas/plotting/_compat.py
create mode 100644 pandas/plotting/_matplotlib/__init__.py
create mode 100644 pandas/plotting/_matplotlib/boxplot.py
create mode 100644 pandas/plotting/_matplotlib/compat.py
rename pandas/plotting/{_converter.py => _matplotlib/converter.py} (72%)
create mode 100644 pandas/plotting/_matplotlib/core.py
create mode 100644 pandas/plotting/_matplotlib/hist.py
create mode 100644 pandas/plotting/_matplotlib/misc.py
rename pandas/plotting/{_style.py => _matplotlib/style.py} (50%)
rename pandas/plotting/{_timeseries.py => _matplotlib/timeseries.py} (66%)
rename pandas/plotting/{_tools.py => _matplotlib/tools.py} (77%)
create mode 100644 pandas/tests/dtypes/cast/test_promote.py
rename pandas/tests/extension/arrow/{bool.py => arrays.py} (67%)
create mode 100644 pandas/tests/extension/arrow/test_string.py
create mode 100644 pandas/tests/frame/test_explode.py
create mode 100644 pandas/tests/io/data/blank.ods
create mode 100644 pandas/tests/io/data/blank_with_header.ods
create mode 100644 pandas/tests/io/data/invalid_value_type.ods
create mode 100755 pandas/tests/io/data/labelled-num-na.sav
create mode 100755 pandas/tests/io/data/labelled-num.sav
create mode 100755 pandas/tests/io/data/labelled-str.sav
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.19.2/0.19.2_x86_64_darwin_2.7.12.msgpack
rename pandas/tests/io/data/legacy_msgpack/{0.19.2/0.19.2_x86_64_darwin_3.6.1.msgpack => 0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack} (92%)
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.14.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_AMD64_windows_2.7.14.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_2.7.12.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_2.7.14.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_2.7.14.pickle
rename pandas/tests/io/data/legacy_pickle/{0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle => 0.20.3/0.20.3_x86_64_darwin_3.5.2.pickle} (85%)
create mode 100644 pandas/tests/io/data/test1.ods
create mode 100644 pandas/tests/io/data/test2.ods
create mode 100644 pandas/tests/io/data/test3.ods
create mode 100644 pandas/tests/io/data/test4.ods
create mode 100644 pandas/tests/io/data/test5.ods
create mode 100644 pandas/tests/io/data/test_converters.ods
create mode 100644 pandas/tests/io/data/test_index_name_pre17.ods
create mode 100644 pandas/tests/io/data/test_multisheet.ods
create mode 100644 pandas/tests/io/data/test_squeeze.ods
create mode 100644 pandas/tests/io/data/test_types.ods
create mode 100644 pandas/tests/io/data/testdateoverflow.ods
create mode 100644 pandas/tests/io/data/testdtype.ods
create mode 100644 pandas/tests/io/data/testmultiindex.ods
create mode 100644 pandas/tests/io/data/testskiprows.ods
create mode 100644 pandas/tests/io/data/times_1900.ods
create mode 100644 pandas/tests/io/data/times_1904.ods
create mode 100755 pandas/tests/io/data/umlauts.sav
create mode 100644 pandas/tests/io/data/writertable.odt
create mode 100644 pandas/tests/io/excel/__init__.py
create mode 100644 pandas/tests/io/excel/conftest.py
create mode 100644 pandas/tests/io/excel/test_odf.py
create mode 100644 pandas/tests/io/excel/test_openpyxl.py
create mode 100644 pandas/tests/io/excel/test_readers.py
create mode 100644 pandas/tests/io/excel/test_style.py
create mode 100644 pandas/tests/io/excel/test_writers.py
create mode 100644 pandas/tests/io/excel/test_xlrd.py
create mode 100644 pandas/tests/io/excel/test_xlsxwriter.py
create mode 100644 pandas/tests/io/excel/test_xlwt.py
create mode 100644 pandas/tests/io/pytables/__init__.py
create mode 100644 pandas/tests/io/pytables/test_compat.py
create mode 100644 pandas/tests/io/pytables/test_pytables.py
create mode 100644 pandas/tests/io/pytables/test_pytables_missing.py
delete mode 100644 pandas/tests/io/test_excel.py
delete mode 100644 pandas/tests/io/test_pytables.py
create mode 100644 pandas/tests/io/test_spss.py
create mode 100644 pandas/tests/plotting/test_backend.py
create mode 100644 pandas/tests/series/test_explode.py
create mode 100644 pandas/tests/series/test_ufunc.py
create mode 100644 pandas/tests/test_optional_dependency.py
delete mode 100644 pandas/tests/test_window.py
create mode 100644 pandas/tests/tslibs/test_fields.py
create mode 100644 pandas/tests/window/__init__.py
create mode 100644 pandas/tests/window/common.py
create mode 100644 pandas/tests/window/conftest.py
create mode 100644 pandas/tests/window/test_api.py
create mode 100644 pandas/tests/window/test_dtypes.py
create mode 100644 pandas/tests/window/test_ewm.py
create mode 100644 pandas/tests/window/test_expanding.py
create mode 100644 pandas/tests/window/test_grouper.py
create mode 100644 pandas/tests/window/test_moments.py
create mode 100644 pandas/tests/window/test_pairwise.py
create mode 100644 pandas/tests/window/test_rolling.py
create mode 100644 pandas/tests/window/test_timeseries_window.py
create mode 100644 pandas/tests/window/test_window.py
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 6912d15abf3d6..944ce9b4fb1f6 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1 +1,2 @@
custom: https://pandas.pydata.org/donate.html
+tidelift: pypi/pandas
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 4e1e9ce017408..7c3870470f074 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,4 +1,5 @@
- [ ] closes #xxxx
- [ ] tests added / passed
+- [ ] passes `black pandas`
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
- [ ] whatsnew entry
diff --git a/.github/SECURITY.md b/.github/SECURITY.md
new file mode 100644
index 0000000000000..f3b059a5d4f13
--- /dev/null
+++ b/.github/SECURITY.md
@@ -0,0 +1 @@
+To report a security vulnerability to pandas, please go to https://tidelift.com/security and see the instructions there.
diff --git a/.gitignore b/.gitignore
index 56828fa1d9331..e85da9c9b976b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,9 @@ coverage_html_report
# hypothesis test database
.hypothesis/
__pycache__
+# pytest-monkeytype
+monkeytype.sqlite3
+
# OS generated files #
######################
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000000..32ffb3330564c
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+repos:
+ - repo: https://github.com/python/black
+ rev: stable
+ hooks:
+ - id: black
+ language_version: python3.7
+ - repo: https://gitlab.com/pycqa/flake8
+ rev: 3.7.7
+ hooks:
+ - id: flake8
+ language: python_venv
+ additional_dependencies: [flake8-comprehensions]
+ - repo: https://github.com/pre-commit/mirrors-isort
+ rev: v4.3.20
+ hooks:
+ - id: isort
+ language: python_venv
diff --git a/.travis.yml b/.travis.yml
index ce8817133a477..79fecc41bec0d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,3 @@
-sudo: false
language: python
python: 3.5
@@ -22,7 +21,7 @@ env:
git:
# for cloning
- depth: 2000
+ depth: false
matrix:
fast_finish: true
@@ -48,17 +47,10 @@ matrix:
env:
- JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
- # In allow_failures
- - dist: trusty
- env:
- - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
allow_failures:
- dist: trusty
env:
- JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
- - dist: trusty
- env:
- - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
before_install:
- echo "before_install"
@@ -71,7 +63,7 @@ before_install:
- pwd
- uname -a
- git --version
- - git tag
+ - ./ci/check_git_tags.sh
# Because travis runs on Google Cloud and has a /etc/boto.cfg,
# it breaks moto import, see:
# https://github.com/spulec/moto/issues/1771
@@ -97,16 +89,10 @@ before_script:
script:
- echo "script start"
- source activate pandas-dev
- - ci/build_docs.sh
- ci/run_tests.sh
after_script:
- echo "after_script start"
- source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
- - if [ -e test-data-single.xml ]; then
- ci/print_skipped.py test-data-single.xml;
- fi
- - if [ -e test-data-multiple.xml ]; then
- ci/print_skipped.py test-data-multiple.xml;
- fi
+ - ci/print_skipped.py
- echo "after_script done"
diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE
new file mode 100644
index 0000000000000..2f444cb44d505
--- /dev/null
+++ b/LICENSES/HAVEN_LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2013-2016
+COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller
diff --git a/LICENSES/HAVEN_MIT b/LICENSES/HAVEN_MIT
new file mode 100644
index 0000000000000..b03d0e640627a
--- /dev/null
+++ b/LICENSES/HAVEN_MIT
@@ -0,0 +1,32 @@
+Based on http://opensource.org/licenses/MIT
+
+This is a template. Complete and ship as file LICENSE the following 2
+lines (only)
+
+YEAR:
+COPYRIGHT HOLDER:
+
+and specify as
+
+License: MIT + file LICENSE
+
+Copyright (c) ,
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Makefile b/Makefile
index 956ff52338839..9e69eb7922925 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,6 @@
-tseries: pandas/_libs/lib.pyx pandas/_libs/tslib.pyx pandas/_libs/hashtable.pyx
- python setup.py build_ext --inplace
+.PHONY : develop build clean clean_pyc doc lint-diff black
-.PHONY : develop build clean clean_pyc tseries doc
+all: develop
clean:
-python setup.py clean
@@ -15,8 +14,11 @@ build: clean_pyc
lint-diff:
git diff upstream/master --name-only -- "*.py" | xargs flake8
+black:
+ black . --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)'
+
develop: build
- -python setup.py develop
+ python setup.py develop
doc:
-rm -rf doc/build doc/source/generated
diff --git a/README.md b/README.md
index e8bfd28cc8208..3cde98d3145f2 100644
--- a/README.md
+++ b/README.md
@@ -224,7 +224,7 @@ Most development discussion is taking place on github in this repo. Further, the
All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
-A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas-docs.github.io/pandas-docs-travis/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
+A detailed overview on how to contribute can be found in the **[contributing guide](https://dev.pandas.io/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.
@@ -233,3 +233,5 @@ You can also triage issues which may include reproducing bug reports, or asking
Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!
Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
+
+As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/pandas/blob/master/.github/CODE_OF_CONDUCT.md)
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index fa098e2455683..571ede1a21134 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -107,7 +107,7 @@
// `asv` will cache wheels of the recent builds in each
// environment, making them faster to install next time. This is
// number of builds to keep, per environment.
- "wheel_cache_size": 8,
+ "build_cache_size": 8,
// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 45ef47fde0a56..7d97f2c740acb 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -2,10 +2,12 @@
import numpy as np
+from pandas._libs import lib
+
import pandas as pd
from pandas.util import testing as tm
-for imp in ['pandas.util', 'pandas.tools.hashing']:
+for imp in ["pandas.util", "pandas.tools.hashing"]:
try:
hashing = import_module(imp)
break
@@ -13,17 +15,32 @@
pass
+class MaybeConvertObjects:
+ def setup(self):
+ N = 10 ** 5
+
+ data = list(range(N))
+ data[0] = pd.NaT
+ data = np.array(data)
+ self.data = data
+
+ def time_maybe_convert_objects(self):
+ lib.maybe_convert_objects(self.data)
+
+
class Factorize:
- params = [[True, False], ['int', 'uint', 'float', 'string']]
- param_names = ['sort', 'dtype']
+ params = [[True, False], ["int", "uint", "float", "string"]]
+ param_names = ["sort", "dtype"]
def setup(self, sort, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
- 'uint': pd.UInt64Index(np.arange(N).repeat(5)),
- 'float': pd.Float64Index(np.random.randn(N).repeat(5)),
- 'string': tm.makeStringIndex(N).repeat(5)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N).repeat(5)),
+ "uint": pd.UInt64Index(np.arange(N).repeat(5)),
+ "float": pd.Float64Index(np.random.randn(N).repeat(5)),
+ "string": tm.makeStringIndex(N).repeat(5),
+ }
self.idx = data[dtype]
def time_factorize(self, sort, dtype):
@@ -32,15 +49,17 @@ def time_factorize(self, sort, dtype):
class FactorizeUnique:
- params = [[True, False], ['int', 'uint', 'float', 'string']]
- param_names = ['sort', 'dtype']
+ params = [[True, False], ["int", "uint", "float", "string"]]
+ param_names = ["sort", "dtype"]
def setup(self, sort, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N)),
- 'uint': pd.UInt64Index(np.arange(N)),
- 'float': pd.Float64Index(np.arange(N)),
- 'string': tm.makeStringIndex(N)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N)),
+ "uint": pd.UInt64Index(np.arange(N)),
+ "float": pd.Float64Index(np.arange(N)),
+ "string": tm.makeStringIndex(N),
+ }
self.idx = data[dtype]
assert self.idx.is_unique
@@ -50,15 +69,17 @@ def time_factorize(self, sort, dtype):
class Duplicated:
- params = [['first', 'last', False], ['int', 'uint', 'float', 'string']]
- param_names = ['keep', 'dtype']
+ params = [["first", "last", False], ["int", "uint", "float", "string"]]
+ param_names = ["keep", "dtype"]
def setup(self, keep, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
- 'uint': pd.UInt64Index(np.arange(N).repeat(5)),
- 'float': pd.Float64Index(np.random.randn(N).repeat(5)),
- 'string': tm.makeStringIndex(N).repeat(5)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N).repeat(5)),
+ "uint": pd.UInt64Index(np.arange(N).repeat(5)),
+ "float": pd.Float64Index(np.random.randn(N).repeat(5)),
+ "string": tm.makeStringIndex(N).repeat(5),
+ }
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique
@@ -69,15 +90,17 @@ def time_duplicated(self, keep, dtype):
class DuplicatedUniqueIndex:
- params = ['int', 'uint', 'float', 'string']
- param_names = ['dtype']
+ params = ["int", "uint", "float", "string"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N)),
- 'uint': pd.UInt64Index(np.arange(N)),
- 'float': pd.Float64Index(np.random.randn(N)),
- 'string': tm.makeStringIndex(N)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N)),
+ "uint": pd.UInt64Index(np.arange(N)),
+ "float": pd.Float64Index(np.random.randn(N)),
+ "string": tm.makeStringIndex(N),
+ }
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique
@@ -87,18 +110,21 @@ def time_duplicated_unique(self, dtype):
class Hashing:
-
def setup_cache(self):
- N = 10**5
+ N = 10 ** 5
df = pd.DataFrame(
- {'strings': pd.Series(tm.makeStringIndex(10000).take(
- np.random.randint(0, 10000, size=N))),
- 'floats': np.random.randn(N),
- 'ints': np.arange(N),
- 'dates': pd.date_range('20110101', freq='s', periods=N),
- 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
- df['categories'] = df['strings'].astype('category')
+ {
+ "strings": pd.Series(
+ tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
+ ),
+ "floats": np.random.randn(N),
+ "ints": np.arange(N),
+ "dates": pd.date_range("20110101", freq="s", periods=N),
+ "timedeltas": pd.timedelta_range("1 day", freq="s", periods=N),
+ }
+ )
+ df["categories"] = df["strings"].astype("category")
df.iloc[10:20] = np.nan
return df
@@ -106,39 +132,55 @@ def time_frame(self, df):
hashing.hash_pandas_object(df)
def time_series_int(self, df):
- hashing.hash_pandas_object(df['ints'])
+ hashing.hash_pandas_object(df["ints"])
def time_series_string(self, df):
- hashing.hash_pandas_object(df['strings'])
+ hashing.hash_pandas_object(df["strings"])
def time_series_float(self, df):
- hashing.hash_pandas_object(df['floats'])
+ hashing.hash_pandas_object(df["floats"])
def time_series_categorical(self, df):
- hashing.hash_pandas_object(df['categories'])
+ hashing.hash_pandas_object(df["categories"])
def time_series_timedeltas(self, df):
- hashing.hash_pandas_object(df['timedeltas'])
+ hashing.hash_pandas_object(df["timedeltas"])
def time_series_dates(self, df):
- hashing.hash_pandas_object(df['dates'])
+ hashing.hash_pandas_object(df["dates"])
class Quantile:
- params = [[0, 0.5, 1],
- ['linear', 'nearest', 'lower', 'higher', 'midpoint'],
- ['float', 'int', 'uint']]
- param_names = ['quantile', 'interpolation', 'dtype']
+ params = [
+ [0, 0.5, 1],
+ ["linear", "nearest", "lower", "higher", "midpoint"],
+ ["float", "int", "uint"],
+ ]
+ param_names = ["quantile", "interpolation", "dtype"]
def setup(self, quantile, interpolation, dtype):
- N = 10**5
- data = {'int': np.arange(N),
- 'uint': np.arange(N).astype(np.uint64),
- 'float': np.random.randn(N)}
+ N = 10 ** 5
+ data = {
+ "int": np.arange(N),
+ "uint": np.arange(N).astype(np.uint64),
+ "float": np.random.randn(N),
+ }
self.idx = pd.Series(data[dtype].repeat(5))
def time_quantile(self, quantile, interpolation, dtype):
self.idx.quantile(quantile, interpolation=interpolation)
+class SortIntegerArray:
+ params = [10 ** 3, 10 ** 5]
+
+ def setup(self, N):
+ data = np.arange(N, dtype=float)
+ data[40] = np.nan
+ self.array = pd.array(data, dtype="Int64")
+
+ def time_argsort(self, N):
+ self.array.argsort()
+
+
from .pandas_vb_common import setup # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py
index dd316a2bc88d0..c43e5dfd729aa 100644
--- a/asv_bench/benchmarks/attrs_caching.py
+++ b/asv_bench/benchmarks/attrs_caching.py
@@ -1,5 +1,6 @@
import numpy as np
from pandas import DataFrame
+
try:
from pandas.util import cache_readonly
except ImportError:
@@ -7,7 +8,6 @@
class DataFrameAttributes:
-
def setup(self):
self.df = DataFrame(np.random.randn(10, 6))
self.cur_index = self.df.index
@@ -20,14 +20,12 @@ def time_set_index(self):
class CacheReadonly:
-
def setup(self):
-
class Foo:
-
@cache_readonly
def prop(self):
return 5
+
self.obj = Foo()
def time_cache_readonly(self):
diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py
index 26cd66284c41e..fd3324b78f1c3 100644
--- a/asv_bench/benchmarks/binary_ops.py
+++ b/asv_bench/benchmarks/binary_ops.py
@@ -1,6 +1,7 @@
import numpy as np
from pandas import DataFrame, Series, date_range
from pandas.core.algorithms import checked_add_with_arr
+
try:
import pandas.core.computation.expressions as expr
except ImportError:
@@ -9,14 +10,14 @@
class Ops:
- params = [[True, False], ['default', 1]]
- param_names = ['use_numexpr', 'threads']
+ params = [[True, False], ["default", 1]]
+ param_names = ["use_numexpr", "threads"]
def setup(self, use_numexpr, threads):
self.df = DataFrame(np.random.randn(20000, 100))
self.df2 = DataFrame(np.random.randn(20000, 100))
- if threads != 'default':
+ if threads != "default":
expr.set_numexpr_threads(threads)
if not use_numexpr:
expr.set_use_numexpr(False)
@@ -39,18 +40,21 @@ def teardown(self, use_numexpr, threads):
class Ops2:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N, N))
self.df2 = DataFrame(np.random.randn(N, N))
- self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max,
- size=(N, N)))
- self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max,
- size=(N, N)))
+ self.df_int = DataFrame(
+ np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N)
+ )
+ )
+ self.df2_int = DataFrame(
+ np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N)
+ )
+ )
self.s = Series(np.random.randn(N))
@@ -90,16 +94,16 @@ def time_frame_series_dot(self):
class Timeseries:
- params = [None, 'US/Eastern']
- param_names = ['tz']
+ params = [None, "US/Eastern"]
+ param_names = ["tz"]
def setup(self, tz):
- N = 10**6
+ N = 10 ** 6
halfway = (N // 2) - 1
- self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz))
+ self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz))
self.ts = self.s[halfway]
- self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz))
+ self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz))
def time_series_timestamp_compare(self, tz):
self.s <= self.ts
@@ -117,10 +121,10 @@ def time_timestamp_ops_diff_with_shift(self, tz):
class AddOverflowScalar:
params = [1, -1, 0]
- param_names = ['scalar']
+ param_names = ["scalar"]
def setup(self, scalar):
- N = 10**6
+ N = 10 ** 6
self.arr = np.arange(N)
def time_add_overflow_scalar(self, scalar):
@@ -128,9 +132,8 @@ def time_add_overflow_scalar(self, scalar):
class AddOverflowArray:
-
def setup(self):
- N = 10**6
+ N = 10 ** 6
self.arr = np.arange(N)
self.arr_rev = np.arange(-N, 0)
self.arr_mixed = np.array([1, -1]).repeat(N / 2)
@@ -144,12 +147,12 @@ def time_add_overflow_arr_mask_nan(self):
checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)
def time_add_overflow_b_mask_nan(self):
- checked_add_with_arr(self.arr, self.arr_mixed,
- b_mask=self.arr_nan_1)
+ checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1)
def time_add_overflow_both_arg_nan(self):
- checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1,
- b_mask=self.arr_nan_2)
+ checked_add_with_arr(
+ self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2
+ )
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 790157497ca36..8097118a79d20 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -1,8 +1,8 @@
-import warnings
-
import numpy as np
import pandas as pd
import pandas.util.testing as tm
+import warnings
+
try:
from pandas.api.types import union_categoricals
except ImportError:
@@ -13,13 +13,12 @@
class Concat:
-
def setup(self):
- N = 10**5
- self.s = pd.Series(list('aabbcd') * N).astype('category')
+ N = 10 ** 5
+ self.s = pd.Series(list("aabbcd") * N).astype("category")
- self.a = pd.Categorical(list('aabbcd') * N)
- self.b = pd.Categorical(list('bbcdjk') * N)
+ self.a = pd.Categorical(list("aabbcd") * N)
+ self.b = pd.Categorical(list("bbcdjk") * N)
def time_concat(self):
pd.concat([self.s, self.s])
@@ -29,23 +28,22 @@ def time_union(self):
class Constructor:
-
def setup(self):
- N = 10**5
- self.categories = list('abcde')
+ N = 10 ** 5
+ self.categories = list("abcde")
self.cat_idx = pd.Index(self.categories)
self.values = np.tile(self.categories, N)
self.codes = np.tile(range(len(self.categories)), N)
- self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00',
- periods=N / 10,
- freq='s'))
+ self.datetimes = pd.Series(
+ pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s")
+ )
self.datetimes_with_nat = self.datetimes.copy()
self.datetimes_with_nat.iloc[-1] = pd.NaT
self.values_some_nan = list(np.tile(self.categories + [np.nan], N))
self.values_all_nan = [np.nan] * len(self.values)
- self.values_all_int8 = np.ones(N, 'int8')
+ self.values_all_int8 = np.ones(N, "int8")
self.categorical = pd.Categorical(self.values, self.categories)
self.series = pd.Series(self.categorical)
@@ -80,68 +78,61 @@ def time_existing_series(self):
class ValueCounts:
params = [True, False]
- param_names = ['dropna']
+ param_names = ["dropna"]
def setup(self, dropna):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_value_counts(self, dropna):
self.ts.value_counts(dropna=dropna)
class Repr:
-
def setup(self):
- self.sel = pd.Series(['s1234']).astype('category')
+ self.sel = pd.Series(["s1234"]).astype("category")
def time_rendering(self):
str(self.sel)
class SetCategories:
-
def setup(self):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_set_categories(self):
self.ts.cat.set_categories(self.ts.cat.categories[::2])
class RemoveCategories:
-
def setup(self):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_remove_categories(self):
self.ts.cat.remove_categories(self.ts.cat.categories[::2])
class Rank:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
ncats = 100
self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
- self.s_str_cat = self.s_str.astype('category')
+ self.s_str_cat = pd.Series(self.s_str, dtype="category")
with warnings.catch_warnings(record=True):
- self.s_str_cat_ordered = self.s_str.astype('category',
- ordered=True)
+ str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True)
+ self.s_str_cat_ordered = self.s_str.astype(str_cat_type)
self.s_int = pd.Series(np.random.randint(0, ncats, size=N))
- self.s_int_cat = self.s_int.astype('category')
+ self.s_int_cat = pd.Series(self.s_int, dtype="category")
with warnings.catch_warnings(record=True):
- self.s_int_cat_ordered = self.s_int.astype('category',
- ordered=True)
+ int_cat_type = pd.CategoricalDtype(set(self.s_int), ordered=True)
+ self.s_int_cat_ordered = self.s_int.astype(int_cat_type)
def time_rank_string(self):
self.s_str.rank()
@@ -164,28 +155,27 @@ def time_rank_int_cat_ordered(self):
class Isin:
- params = ['object', 'int64']
- param_names = ['dtype']
+ params = ["object", "int64"]
+ param_names = ["dtype"]
def setup(self, dtype):
np.random.seed(1234)
- n = 5 * 10**5
+ n = 5 * 10 ** 5
sample_size = 100
arr = [i for i in np.random.randint(0, n // 10, size=n)]
- if dtype == 'object':
- arr = ['s{:04d}'.format(i) for i in arr]
+ if dtype == "object":
+ arr = ["s{:04d}".format(i) for i in arr]
self.sample = np.random.choice(arr, sample_size)
- self.series = pd.Series(arr).astype('category')
+ self.series = pd.Series(arr).astype("category")
def time_isin_categorical(self, dtype):
self.series.isin(self.sample)
class IsMonotonic:
-
def setup(self):
N = 1000
- self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N))
+ self.c = pd.CategoricalIndex(list("a" * N + "b" * N + "c" * N))
self.s = pd.Series(self.c)
def time_categorical_index_is_monotonic_increasing(self):
@@ -202,9 +192,8 @@ def time_categorical_series_is_monotonic_decreasing(self):
class Contains:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
self.ci = tm.makeCategoricalIndex(N)
self.c = self.ci.values
self.key = self.ci.categories[0]
@@ -218,34 +207,33 @@ def time_categorical_contains(self):
class CategoricalSlicing:
- params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
- param_names = ['index']
+ params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]
+ param_names = ["index"]
def setup(self, index):
- N = 10**6
- categories = ['a', 'b', 'c']
+ N = 10 ** 6
+ categories = ["a", "b", "c"]
values = [0] * N + [1] * N + [2] * N
- if index == 'monotonic_incr':
- self.data = pd.Categorical.from_codes(values,
- categories=categories)
- elif index == 'monotonic_decr':
- self.data = pd.Categorical.from_codes(list(reversed(values)),
- categories=categories)
- elif index == 'non_monotonic':
- self.data = pd.Categorical.from_codes([0, 1, 2] * N,
- categories=categories)
+ if index == "monotonic_incr":
+ self.data = pd.Categorical.from_codes(values, categories=categories)
+ elif index == "monotonic_decr":
+ self.data = pd.Categorical.from_codes(
+ list(reversed(values)), categories=categories
+ )
+ elif index == "non_monotonic":
+ self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
else:
- raise ValueError('Invalid index param: {}'.format(index))
+ raise ValueError("Invalid index param: {}".format(index))
self.scalar = 10000
self.list = list(range(10000))
- self.cat_scalar = 'b'
+ self.cat_scalar = "b"
def time_getitem_scalar(self, index):
self.data[self.scalar]
def time_getitem_slice(self, index):
- self.data[:self.scalar]
+ self.data[: self.scalar]
def time_getitem_list_like(self, index):
self.data[[self.scalar]]
@@ -258,9 +246,8 @@ def time_getitem_bool_array(self, index):
class Indexing:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
self.index = pd.CategoricalIndex(range(N), range(N))
self.series = pd.Series(range(N), index=self.index).sort_index()
self.category = self.index[500]
@@ -275,7 +262,7 @@ def time_shallow_copy(self):
self.index._shallow_copy()
def time_align(self):
- pd.DataFrame({'a': self.series, 'b': self.series[:500]})
+ pd.DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index[:750].intersection(self.index[250:])
@@ -287,7 +274,7 @@ def time_reindex(self):
self.index.reindex(self.index[:500])
def time_reindex_missing(self):
- self.index.reindex(['a', 'b', 'c', 'd'])
+ self.index.reindex(["a", "b", "c", "d"])
def time_sort_values(self):
self.index.sort_values(ascending=False)
diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py
index 1c6841a296377..654075292cdf6 100644
--- a/asv_bench/benchmarks/ctors.py
+++ b/asv_bench/benchmarks/ctors.py
@@ -42,22 +42,34 @@ def list_of_lists_with_none(arr):
class SeriesConstructors:
param_names = ["data_fmt", "with_index", "dtype"]
- params = [[no_change,
- list,
- list_of_str,
- gen_of_str,
- arr_dict,
- list_of_tuples,
- gen_of_tuples,
- list_of_lists,
- list_of_tuples_with_none,
- list_of_lists_with_none],
- [False, True],
- ['float', 'int']]
+ params = [
+ [
+ no_change,
+ list,
+ list_of_str,
+ gen_of_str,
+ arr_dict,
+ list_of_tuples,
+ gen_of_tuples,
+ list_of_lists,
+ list_of_tuples_with_none,
+ list_of_lists_with_none,
+ ],
+ [False, True],
+ ["float", "int"],
+ ]
+
+ # Generators get exhausted on use, so run setup before every call
+ number = 1
+ repeat = (3, 250, 10)
def setup(self, data_fmt, with_index, dtype):
- N = 10**4
- if dtype == 'float':
+ if data_fmt in (gen_of_str, gen_of_tuples) and with_index:
+ raise NotImplementedError(
+ "Series constructors do not support " "using generators with indexes"
+ )
+ N = 10 ** 4
+ if dtype == "float":
arr = np.random.randn(N)
else:
arr = np.arange(N)
@@ -69,13 +81,15 @@ def time_series_constructor(self, data_fmt, with_index, dtype):
class SeriesDtypesConstructors:
-
def setup(self):
- N = 10**4
+ N = 10 ** 4
self.arr = np.random.randn(N)
- self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object)
- self.s = Series([Timestamp('20110101'), Timestamp('20120101'),
- Timestamp('20130101')] * N * 10)
+ self.arr_str = np.array(["foo", "bar", "baz"], dtype=object)
+ self.s = Series(
+ [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")]
+ * N
+ * 10
+ )
def time_index_from_array_string(self):
Index(self.arr_str)
@@ -91,9 +105,8 @@ def time_dtindex_from_index_with_series(self):
class MultiIndexConstructor:
-
def setup(self):
- N = 10**4
+ N = 10 ** 4
self.iterables = [tm.makeStringIndex(N), range(20)]
def time_multiindex_from_iterables(self):
diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py
index 9bfaaa8696009..60800b1f9cae7 100644
--- a/asv_bench/benchmarks/dtypes.py
+++ b/asv_bench/benchmarks/dtypes.py
@@ -2,32 +2,36 @@
import numpy as np
from .pandas_vb_common import (
- numeric_dtypes, datetime_dtypes, string_dtypes, extension_dtypes)
+ numeric_dtypes,
+ datetime_dtypes,
+ string_dtypes,
+ extension_dtypes,
+)
-_numpy_dtypes = [np.dtype(dtype)
- for dtype in (numeric_dtypes +
- datetime_dtypes +
- string_dtypes)]
+_numpy_dtypes = [
+ np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes)
+]
_dtypes = _numpy_dtypes + extension_dtypes
class Dtypes:
- params = (_dtypes +
- list(map(lambda dt: dt.name, _dtypes)))
- param_names = ['dtype']
+ params = _dtypes + list(map(lambda dt: dt.name, _dtypes))
+ param_names = ["dtype"]
def time_pandas_dtype(self, dtype):
pandas_dtype(dtype)
class DtypesInvalid:
- param_names = ['dtype']
- params = ['scalar-string', 'scalar-int', 'list-string', 'array-string']
- data_dict = {'scalar-string': 'foo',
- 'scalar-int': 1,
- 'list-string': ['foo'] * 1000,
- 'array-string': np.array(['foo'] * 1000)}
+ param_names = ["dtype"]
+ params = ["scalar-string", "scalar-int", "list-string", "array-string"]
+ data_dict = {
+ "scalar-string": "foo",
+ "scalar-int": 1,
+ "list-string": ["foo"] * 1000,
+ "array-string": np.array(["foo"] * 1000),
+ }
def time_pandas_dtype_invalid(self, dtype):
try:
diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py
index be47d35f2cad1..84e94315cc28b 100644
--- a/asv_bench/benchmarks/eval.py
+++ b/asv_bench/benchmarks/eval.py
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
+
try:
import pandas.core.computation.expressions as expr
except ImportError:
@@ -8,8 +9,8 @@
class Eval:
- params = [['numexpr', 'python'], [1, 'all']]
- param_names = ['engine', 'threads']
+ params = [["numexpr", "python"], [1, "all"]]
+ param_names = ["engine", "threads"]
def setup(self, engine, threads):
self.df = pd.DataFrame(np.random.randn(20000, 100))
@@ -21,44 +22,44 @@ def setup(self, engine, threads):
expr.set_numexpr_threads(1)
def time_add(self, engine, threads):
- pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine)
+ pd.eval("self.df + self.df2 + self.df3 + self.df4", engine=engine)
def time_and(self, engine, threads):
- pd.eval('(self.df > 0) & (self.df2 > 0) & '
- '(self.df3 > 0) & (self.df4 > 0)', engine=engine)
+ pd.eval(
+ "(self.df > 0) & (self.df2 > 0) & " "(self.df3 > 0) & (self.df4 > 0)",
+ engine=engine,
+ )
def time_chained_cmp(self, engine, threads):
- pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine)
+ pd.eval("self.df < self.df2 < self.df3 < self.df4", engine=engine)
def time_mult(self, engine, threads):
- pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine)
+ pd.eval("self.df * self.df2 * self.df3 * self.df4", engine=engine)
def teardown(self, engine, threads):
expr.set_numexpr_threads()
class Query:
-
def setup(self):
- N = 10**6
+ N = 10 ** 6
halfway = (N // 2) - 1
- index = pd.date_range('20010101', periods=N, freq='T')
+ index = pd.date_range("20010101", periods=N, freq="T")
s = pd.Series(index)
self.ts = s.iloc[halfway]
- self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': index},
- index=index)
+ self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index)
data = np.random.randn(N)
self.min_val = data.min()
self.max_val = data.max()
def time_query_datetime_index(self):
- self.df.query('index < @self.ts')
+ self.df.query("index < @self.ts")
def time_query_datetime_column(self):
- self.df.query('dates < @self.ts')
+ self.df.query("dates < @self.ts")
def time_query_with_boolean_selection(self):
- self.df.query('(a >= @self.min_val) & (a <= @self.max_val)')
+ self.df.query("(a >= @self.min_val) & (a <= @self.max_val)")
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
index 19c2a913e8494..acfb26bcf5d7c 100644
--- a/asv_bench/benchmarks/frame_ctor.py
+++ b/asv_bench/benchmarks/frame_ctor.py
@@ -1,25 +1,23 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range
+
try:
from pandas.tseries.offsets import Nano, Hour
except ImportError:
# For compatibility with older versions
- from pandas.core.datetools import * # noqa
+ from pandas.core.datetools import * # noqa
class FromDicts:
-
def setup(self):
N, K = 5000, 50
self.index = tm.makeStringIndex(N)
self.columns = tm.makeStringIndex(K)
- frame = DataFrame(np.random.randn(N, K), index=self.index,
- columns=self.columns)
+ frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
self.data = frame.to_dict()
- self.dict_list = frame.to_dict(orient='records')
- self.data2 = {i: {j: float(j) for j in range(100)}
- for i in range(2000)}
+ self.dict_list = frame.to_dict(orient="records")
+ self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)}
def time_list_of_dict(self):
DataFrame(self.dict_list)
@@ -42,7 +40,6 @@ def time_nested_dict_int64(self):
class FromSeries:
-
def setup(self):
mi = MultiIndex.from_product([range(100), range(100)])
self.s = Series(np.random.randn(10000), index=mi)
@@ -54,12 +51,12 @@ def time_mi_series(self):
class FromDictwithTimestamp:
params = [Nano(1), Hour(1)]
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
- N = 10**3
+ N = 10 ** 3
np.random.seed(1234)
- idx = date_range(Timestamp('1/1/1900'), freq=offset, periods=N)
+ idx = date_range(Timestamp("1/1/1900"), freq=offset, periods=N)
df = DataFrame(np.random.randn(N, 10), index=idx)
self.d = df.to_dict()
@@ -70,7 +67,11 @@ def time_dict_with_timestamp_offsets(self, offset):
class FromRecords:
params = [None, 1000]
- param_names = ['nrows']
+ param_names = ["nrows"]
+
+ # Generators get exhausted on use, so run setup before every call
+ number = 1
+ repeat = (3, 250, 10)
def setup(self, nrows):
N = 100000
@@ -82,7 +83,6 @@ def time_frame_from_records_generator(self, nrows):
class FromNDArray:
-
def setup(self):
N = 100000
self.data = np.random.randn(N)
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 5b76eeba115a4..e2f6764c76eef 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -1,18 +1,17 @@
+import warnings
import string
import numpy as np
-from pandas import (
- DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range)
+from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range
import pandas.util.testing as tm
class GetNumericData:
-
def setup(self):
self.df = DataFrame(np.random.randn(10000, 25))
- self.df['foo'] = 'bar'
- self.df['bar'] = 'baz'
+ self.df["foo"] = "bar"
+ self.df["bar"] = "baz"
self.df = self.df._consolidate()
def time_frame_get_numeric_data(self):
@@ -20,17 +19,17 @@ def time_frame_get_numeric_data(self):
class Lookup:
-
def setup(self):
- self.df = DataFrame(np.random.randn(10000, 8),
- columns=list('abcdefgh'))
- self.df['foo'] = 'bar'
+ self.df = DataFrame(np.random.randn(10000, 8), columns=list("abcdefgh"))
+ self.df["foo"] = "bar"
self.row_labels = list(self.df.index[::10])[:900]
self.col_labels = list(self.df.columns) * 100
self.row_labels_all = np.array(
- list(self.df.index) * len(self.df.columns), dtype='object')
+ list(self.df.index) * len(self.df.columns), dtype="object"
+ )
self.col_labels_all = np.array(
- list(self.df.columns) * len(self.df.index), dtype='object')
+ list(self.df.columns) * len(self.df.index), dtype="object"
+ )
def time_frame_fancy_lookup(self):
self.df.lookup(self.row_labels, self.col_labels)
@@ -40,17 +39,21 @@ def time_frame_fancy_lookup_all(self):
class Reindex:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N * 10, N))
self.idx = np.arange(4 * N, 7 * N)
self.df2 = DataFrame(
- {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
- 1: np.random.randint(0, N, N).astype(np.int16),
- 2: np.random.randint(0, N, N).astype(np.int32),
- 3: np.random.randint(0, N, N).astype(np.int64)}
- [np.random.randint(0, 4)] for c in range(N)})
+ {
+ c: {
+ 0: np.random.randint(0, 2, N).astype(np.bool_),
+ 1: np.random.randint(0, N, N).astype(np.int16),
+ 2: np.random.randint(0, N, N).astype(np.int32),
+ 3: np.random.randint(0, N, N).astype(np.int64),
+ }[np.random.randint(0, 4)]
+ for c in range(N)
+ }
+ )
def time_reindex_axis0(self):
self.df.reindex(self.idx)
@@ -66,18 +69,22 @@ def time_reindex_upcast(self):
class Rename:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N * 10, N))
self.idx = np.arange(4 * N, 7 * N)
self.dict_idx = {k: k for k in self.idx}
self.df2 = DataFrame(
- {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
- 1: np.random.randint(0, N, N).astype(np.int16),
- 2: np.random.randint(0, N, N).astype(np.int32),
- 3: np.random.randint(0, N, N).astype(np.int64)}
- [np.random.randint(0, 4)] for c in range(N)})
+ {
+ c: {
+ 0: np.random.randint(0, 2, N).astype(np.bool_),
+ 1: np.random.randint(0, N, N).astype(np.int16),
+ 2: np.random.randint(0, N, N).astype(np.int32),
+ 3: np.random.randint(0, N, N).astype(np.int64),
+ }[np.random.randint(0, 4)]
+ for c in range(N)
+ }
+ )
def time_rename_single(self):
self.df.rename({0: 0})
@@ -103,19 +110,20 @@ def setup(self):
N = 1000
self.df = DataFrame(np.random.randn(N * 10, N))
self.df2 = DataFrame(np.random.randn(N * 50, 10))
- self.df3 = DataFrame(np.random.randn(N, 5 * N),
- columns=['C' + str(c) for c in range(N * 5)])
+ self.df3 = DataFrame(
+ np.random.randn(N, 5 * N), columns=["C" + str(c) for c in range(N * 5)]
+ )
self.df4 = DataFrame(np.random.randn(N * 1000, 10))
- def time_iteritems(self):
+ def time_items(self):
# (monitor no-copying behaviour)
- if hasattr(self.df, '_item_cache'):
+ if hasattr(self.df, "_item_cache"):
self.df._item_cache.clear()
- for name, col in self.df.iteritems():
+ for name, col in self.df.items():
pass
- def time_iteritems_cached(self):
- for name, col in self.df.iteritems():
+ def time_items_cached(self):
+ for name, col in self.df.items():
pass
def time_iteritems_indexing(self):
@@ -192,7 +200,6 @@ def time_iterrows(self):
class ToString:
-
def setup(self):
self.df = DataFrame(np.random.randn(100, 10))
@@ -201,11 +208,10 @@ def time_to_string_floats(self):
class ToHTML:
-
def setup(self):
nrows = 500
self.df2 = DataFrame(np.random.randn(nrows, 10))
- self.df2[0] = period_range('2000', periods=nrows)
+ self.df2[0] = period_range("2000", periods=nrows)
self.df2[1] = range(nrows)
def time_to_html_mixed(self):
@@ -213,7 +219,6 @@ def time_to_html_mixed(self):
class Repr:
-
def setup(self):
nrows = 10000
data = np.random.randn(nrows, 10)
@@ -238,7 +243,6 @@ def time_frame_repr_wide(self):
class MaskBool:
-
def setup(self):
data = np.random.randn(1000, 500)
df = DataFrame(data)
@@ -254,9 +258,8 @@ def time_frame_mask_floats(self):
class Isnull:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df_no_null = DataFrame(np.random.randn(N, N))
sample = np.array([np.nan, 1.0])
@@ -267,8 +270,20 @@ def setup(self):
data = np.random.choice(sample, (N, N))
self.df_strings = DataFrame(data)
- sample = np.array([NaT, np.nan, None, np.datetime64('NaT'),
- np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd'])
+ sample = np.array(
+ [
+ NaT,
+ np.nan,
+ None,
+ np.datetime64("NaT"),
+ np.timedelta64("NaT"),
+ 0,
+ 1,
+ 2.0,
+ "",
+ "abcd",
+ ]
+ )
data = np.random.choice(sample, (N, N))
self.df_obj = DataFrame(data)
@@ -287,8 +302,8 @@ def time_isnull_obj(self):
class Fillna:
- params = ([True, False], ['pad', 'bfill'])
- param_names = ['inplace', 'method']
+ params = ([True, False], ["pad", "bfill"])
+ param_names = ["inplace", "method"]
def setup(self, inplace, method):
values = np.random.randn(10000, 100)
@@ -301,16 +316,17 @@ def time_frame_fillna(self, inplace, method):
class Dropna:
- params = (['all', 'any'], [0, 1])
- param_names = ['how', 'axis']
+ params = (["all", "any"], [0, 1])
+ param_names = ["how", "axis"]
def setup(self, how, axis):
self.df = DataFrame(np.random.randn(10000, 1000))
- self.df.ix[50:1000, 20:50] = np.nan
- self.df.ix[2000:3000] = np.nan
- self.df.ix[:, 60:70] = np.nan
+ with warnings.catch_warnings(record=True):
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
self.df_mixed = self.df.copy()
- self.df_mixed['foo'] = 'bar'
+ self.df_mixed["foo"] = "bar"
def time_dropna(self, how, axis):
self.df.dropna(how=how, axis=axis)
@@ -322,23 +338,25 @@ def time_dropna_axis_mixed_dtypes(self, how, axis):
class Count:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
self.df = DataFrame(np.random.randn(10000, 1000))
- self.df.ix[50:1000, 20:50] = np.nan
- self.df.ix[2000:3000] = np.nan
- self.df.ix[:, 60:70] = np.nan
+ with warnings.catch_warnings(record=True):
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
self.df_mixed = self.df.copy()
- self.df_mixed['foo'] = 'bar'
+ self.df_mixed["foo"] = "bar"
self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index])
- self.df.columns = MultiIndex.from_arrays([self.df.columns,
- self.df.columns])
- self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index,
- self.df_mixed.index])
- self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns,
- self.df_mixed.columns])
+ self.df.columns = MultiIndex.from_arrays([self.df.columns, self.df.columns])
+ self.df_mixed.index = MultiIndex.from_arrays(
+ [self.df_mixed.index, self.df_mixed.index]
+ )
+ self.df_mixed.columns = MultiIndex.from_arrays(
+ [self.df_mixed.columns, self.df_mixed.columns]
+ )
def time_count_level_multi(self, axis):
self.df.count(axis=axis, level=1)
@@ -348,13 +366,12 @@ def time_count_level_mixed_dtypes_multi(self, axis):
class Apply:
-
def setup(self):
self.df = DataFrame(np.random.randn(1000, 100))
self.s = Series(np.arange(1028.0))
self.df2 = DataFrame({i: self.s for i in range(1028)})
- self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+ self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_apply_user_func(self):
self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)])
@@ -372,11 +389,10 @@ def time_apply_pass_thru(self):
self.df.apply(lambda x: x)
def time_apply_ref_by_name(self):
- self.df3.apply(lambda x: x['A'] + x['B'], axis=1)
+ self.df3.apply(lambda x: x["A"] + x["B"], axis=1)
class Dtypes:
-
def setup(self):
self.df = DataFrame(np.random.randn(1000, 1000))
@@ -385,19 +401,18 @@ def time_frame_dtypes(self):
class Equals:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.float_df = DataFrame(np.random.randn(N, N))
self.float_df_nan = self.float_df.copy()
self.float_df_nan.iloc[-1, -1] = np.nan
- self.object_df = DataFrame('foo', index=range(N), columns=range(N))
+ self.object_df = DataFrame("foo", index=range(N), columns=range(N))
self.object_df_nan = self.object_df.copy()
self.object_df_nan.iloc[-1, -1] = np.nan
self.nonunique_cols = self.object_df.copy()
- self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns)
+ self.nonunique_cols.columns = ["A"] * len(self.nonunique_cols.columns)
self.nonunique_cols_nan = self.nonunique_cols.copy()
self.nonunique_cols_nan.iloc[-1, -1] = np.nan
@@ -422,8 +437,8 @@ def time_frame_object_unequal(self):
class Interpolate:
- params = [None, 'infer']
- param_names = ['downcast']
+ params = [None, "infer"]
+ param_names = ["downcast"]
def setup(self, downcast):
N = 10000
@@ -431,12 +446,16 @@ def setup(self, downcast):
self.df = DataFrame(np.random.randn(N, 100))
self.df.values[::2] = np.nan
- self.df2 = DataFrame({'A': np.arange(0, N),
- 'B': np.random.randint(0, 100, N),
- 'C': np.random.randn(N),
- 'D': np.random.randn(N)})
- self.df2.loc[1::5, 'A'] = np.nan
- self.df2.loc[1::5, 'C'] = np.nan
+ self.df2 = DataFrame(
+ {
+ "A": np.arange(0, N),
+ "B": np.random.randint(0, 100, N),
+ "C": np.random.randn(N),
+ "D": np.random.randn(N),
+ }
+ )
+ self.df2.loc[1::5, "A"] = np.nan
+ self.df2.loc[1::5, "C"] = np.nan
def time_interpolate(self, downcast):
self.df.interpolate(downcast=downcast)
@@ -448,7 +467,7 @@ def time_interpolate_some_good(self, downcast):
class Shift:
# frame shift speedup issue-5609
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
self.df = DataFrame(np.random.rand(10000, 500))
@@ -458,7 +477,6 @@ def time_shift(self, axis):
class Nunique:
-
def setup(self):
self.df = DataFrame(np.random.randn(10000, 1000))
@@ -467,14 +485,17 @@ def time_frame_nunique(self):
class Duplicated:
-
def setup(self):
- n = (1 << 20)
- t = date_range('2015-01-01', freq='S', periods=(n // 64))
+ n = 1 << 20
+ t = date_range("2015-01-01", freq="S", periods=(n // 64))
xs = np.random.randn(n // 64).round(2)
- self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
- 'b': np.random.choice(t, n),
- 'c': np.random.choice(xs, n)})
+ self.df = DataFrame(
+ {
+ "a": np.random.randint(-1 << 8, 1 << 8, n),
+ "b": np.random.choice(t, n),
+ "c": np.random.choice(xs, n),
+ }
+ )
self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
def time_frame_duplicated(self):
@@ -487,10 +508,10 @@ def time_frame_duplicated_wide(self):
class XS:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
- self.N = 10**4
+ self.N = 10 ** 4
self.df = DataFrame(np.random.randn(self.N, self.N))
def time_frame_xs(self, axis):
@@ -500,35 +521,38 @@ def time_frame_xs(self, axis):
class SortValues:
params = [True, False]
- param_names = ['ascending']
+ param_names = ["ascending"]
def setup(self, ascending):
- self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB'))
+ self.df = DataFrame(np.random.randn(1000000, 2), columns=list("AB"))
def time_frame_sort_values(self, ascending):
- self.df.sort_values(by='A', ascending=ascending)
+ self.df.sort_values(by="A", ascending=ascending)
class SortIndexByColumns:
-
def setup(self):
N = 10000
K = 10
- self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K),
- 'key2': tm.makeStringIndex(N).values.repeat(K),
- 'value': np.random.randn(N * K)})
+ self.df = DataFrame(
+ {
+ "key1": tm.makeStringIndex(N).values.repeat(K),
+ "key2": tm.makeStringIndex(N).values.repeat(K),
+ "value": np.random.randn(N * K),
+ }
+ )
def time_frame_sort_values_by_columns(self):
- self.df.sort_values(by=['key1', 'key2'])
+ self.df.sort_values(by=["key1", "key2"])
class Quantile:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
- self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+ self.df = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_frame_quantile(self, axis):
self.df.quantile([0.1, 0.5], axis=axis)
@@ -540,7 +564,8 @@ def setup(self):
self.df = DataFrame(np.random.randn(10, 10000))
def time_frame_get_dtype_counts(self):
- self.df.get_dtype_counts()
+ with warnings.catch_warnings(record=True):
+ self.df.get_dtype_counts()
def time_info(self):
self.df.info()
@@ -548,37 +573,37 @@ def time_info(self):
class NSort:
- params = ['first', 'last', 'all']
- param_names = ['keep']
+ params = ["first", "last", "all"]
+ param_names = ["keep"]
def setup(self, keep):
- self.df = DataFrame(np.random.randn(100000, 3),
- columns=list('ABC'))
+ self.df = DataFrame(np.random.randn(100000, 3), columns=list("ABC"))
def time_nlargest_one_column(self, keep):
- self.df.nlargest(100, 'A', keep=keep)
+ self.df.nlargest(100, "A", keep=keep)
def time_nlargest_two_columns(self, keep):
- self.df.nlargest(100, ['A', 'B'], keep=keep)
+ self.df.nlargest(100, ["A", "B"], keep=keep)
def time_nsmallest_one_column(self, keep):
- self.df.nsmallest(100, 'A', keep=keep)
+ self.df.nsmallest(100, "A", keep=keep)
def time_nsmallest_two_columns(self, keep):
- self.df.nsmallest(100, ['A', 'B'], keep=keep)
+ self.df.nsmallest(100, ["A", "B"], keep=keep)
class Describe:
-
def setup(self):
- self.df = DataFrame({
- 'a': np.random.randint(0, 100, int(1e6)),
- 'b': np.random.randint(0, 100, int(1e6)),
- 'c': np.random.randint(0, 100, int(1e6))
- })
+ self.df = DataFrame(
+ {
+ "a": np.random.randint(0, 100, int(1e6)),
+ "b": np.random.randint(0, 100, int(1e6)),
+ "c": np.random.randint(0, 100, int(1e6)),
+ }
+ )
def time_series_describe(self):
- self.df['a'].describe()
+ self.df["a"].describe()
def time_dataframe_describe(self):
self.df.describe()
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
index 65a03bfda48c5..0d0b75561d057 100644
--- a/asv_bench/benchmarks/gil.py
+++ b/asv_bench/benchmarks/gil.py
@@ -2,9 +2,19 @@
import pandas.util.testing as tm
from pandas import DataFrame, Series, read_csv, factorize, date_range
from pandas.core.algorithms import take_1d
+
try:
- from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max,
- rolling_var, rolling_skew, rolling_kurt, rolling_std)
+ from pandas import (
+ rolling_median,
+ rolling_mean,
+ rolling_min,
+ rolling_max,
+ rolling_var,
+ rolling_skew,
+ rolling_kurt,
+ rolling_std,
+ )
+
have_rolling_methods = True
except ImportError:
have_rolling_methods = False
@@ -14,6 +24,7 @@
from pandas import algos
try:
from pandas.util.testing import test_parallel
+
have_real_test_parallel = True
except ImportError:
have_real_test_parallel = False
@@ -21,32 +32,36 @@
def test_parallel(num_threads=1):
def wrapper(fname):
return fname
+
return wrapper
+
from .pandas_vb_common import BaseIO
class ParallelGroupbyMethods:
- params = ([2, 4, 8], ['count', 'last', 'max', 'mean', 'min', 'prod',
- 'sum', 'var'])
- param_names = ['threads', 'method']
+ params = ([2, 4, 8], ["count", "last", "max", "mean", "min", "prod", "sum", "var"])
+ param_names = ["threads", "method"]
def setup(self, threads, method):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- ngroups = 10**3
- df = DataFrame({'key': np.random.randint(0, ngroups, size=N),
- 'data': np.random.randn(N)})
+ N = 10 ** 6
+ ngroups = 10 ** 3
+ df = DataFrame(
+ {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)}
+ )
@test_parallel(num_threads=threads)
def parallel():
- getattr(df.groupby('key')['data'], method)()
+ getattr(df.groupby("key")["data"], method)()
+
self.parallel = parallel
def loop():
- getattr(df.groupby('key')['data'], method)()
+ getattr(df.groupby("key")["data"], method)()
+
self.loop = loop
def time_parallel(self, threads, method):
@@ -60,18 +75,19 @@ def time_loop(self, threads, method):
class ParallelGroups:
params = [2, 4, 8]
- param_names = ['threads']
+ param_names = ["threads"]
def setup(self, threads):
if not have_real_test_parallel:
raise NotImplementedError
- size = 2**22
- ngroups = 10**3
+ size = 2 ** 22
+ ngroups = 10 ** 3
data = Series(np.random.randint(0, ngroups, size=size))
@test_parallel(num_threads=threads)
def get_groups():
data.groupby(data).groups
+
self.get_groups = get_groups
def time_get_groups(self, threads):
@@ -80,19 +96,20 @@ def time_get_groups(self, threads):
class ParallelTake1D:
- params = ['int64', 'float64']
- param_names = ['dtype']
+ params = ["int64", "float64"]
+ param_names = ["dtype"]
def setup(self, dtype):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- df = DataFrame({'col': np.arange(N, dtype=dtype)})
+ N = 10 ** 6
+ df = DataFrame({"col": np.arange(N, dtype=dtype)})
indexer = np.arange(100, len(df) - 100)
@test_parallel(num_threads=2)
def parallel_take1d():
- take_1d(df['col'].values, indexer)
+ take_1d(df["col"].values, indexer)
+
self.parallel_take1d = parallel_take1d
def time_take1d(self, dtype):
@@ -107,14 +124,14 @@ class ParallelKth:
def setup(self):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**7
- k = 5 * 10**5
- kwargs_list = [{'arr': np.random.randn(N)},
- {'arr': np.random.randn(N)}]
+ N = 10 ** 7
+ k = 5 * 10 ** 5
+ kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}]
@test_parallel(num_threads=2, kwargs_list=kwargs_list)
def parallel_kth_smallest(arr):
algos.kth_smallest(arr, k)
+
self.parallel_kth_smallest = parallel_kth_smallest
def time_kth_smallest(self):
@@ -122,81 +139,90 @@ def time_kth_smallest(self):
class ParallelDatetimeFields:
-
def setup(self):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- self.dti = date_range('1900-01-01', periods=N, freq='T')
- self.period = self.dti.to_period('D')
+ N = 10 ** 6
+ self.dti = date_range("1900-01-01", periods=N, freq="T")
+ self.period = self.dti.to_period("D")
def time_datetime_field_year(self):
@test_parallel(num_threads=2)
def run(dti):
dti.year
+
run(self.dti)
def time_datetime_field_day(self):
@test_parallel(num_threads=2)
def run(dti):
dti.day
+
run(self.dti)
def time_datetime_field_daysinmonth(self):
@test_parallel(num_threads=2)
def run(dti):
dti.days_in_month
+
run(self.dti)
def time_datetime_field_normalize(self):
@test_parallel(num_threads=2)
def run(dti):
dti.normalize()
+
run(self.dti)
def time_datetime_to_period(self):
@test_parallel(num_threads=2)
def run(dti):
- dti.to_period('S')
+ dti.to_period("S")
+
run(self.dti)
def time_period_to_datetime(self):
@test_parallel(num_threads=2)
def run(period):
period.to_timestamp()
+
run(self.period)
class ParallelRolling:
- params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std']
- param_names = ['method']
+ params = ["median", "mean", "min", "max", "var", "skew", "kurt", "std"]
+ param_names = ["method"]
def setup(self, method):
if not have_real_test_parallel:
raise NotImplementedError
win = 100
arr = np.random.rand(100000)
- if hasattr(DataFrame, 'rolling'):
+ if hasattr(DataFrame, "rolling"):
df = DataFrame(arr).rolling(win)
@test_parallel(num_threads=2)
def parallel_rolling():
getattr(df, method)()
+
self.parallel_rolling = parallel_rolling
elif have_rolling_methods:
- rolling = {'median': rolling_median,
- 'mean': rolling_mean,
- 'min': rolling_min,
- 'max': rolling_max,
- 'var': rolling_var,
- 'skew': rolling_skew,
- 'kurt': rolling_kurt,
- 'std': rolling_std}
+ rolling = {
+ "median": rolling_median,
+ "mean": rolling_mean,
+ "min": rolling_min,
+ "max": rolling_max,
+ "var": rolling_var,
+ "skew": rolling_skew,
+ "kurt": rolling_kurt,
+ "std": rolling_std,
+ }
@test_parallel(num_threads=2)
def parallel_rolling():
rolling[method](arr, win)
+
self.parallel_rolling = parallel_rolling
else:
raise NotImplementedError
@@ -209,30 +235,34 @@ class ParallelReadCSV(BaseIO):
number = 1
repeat = 5
- params = ['float', 'object', 'datetime']
- param_names = ['dtype']
+ params = ["float", "object", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
if not have_real_test_parallel:
raise NotImplementedError
rows = 10000
cols = 50
- data = {'float': DataFrame(np.random.randn(rows, cols)),
- 'datetime': DataFrame(np.random.randn(rows, cols),
- index=date_range('1/1/2000',
- periods=rows)),
- 'object': DataFrame('foo',
- index=range(rows),
- columns=['object%03d'.format(i)
- for i in range(5)])}
-
- self.fname = '__test_{}__.csv'.format(dtype)
+ data = {
+ "float": DataFrame(np.random.randn(rows, cols)),
+ "datetime": DataFrame(
+ np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
+ ),
+ "object": DataFrame(
+ "foo",
+ index=range(rows),
+                columns=["object%03d" % i for i in range(5)],
+ ),
+ }
+
+ self.fname = "__test_{}__.csv".format(dtype)
df = data[dtype]
df.to_csv(self.fname)
@test_parallel(num_threads=2)
def parallel_read_csv():
read_csv(self.fname)
+
self.parallel_read_csv = parallel_read_csv
def time_read_csv(self, dtype):
@@ -244,7 +274,7 @@ class ParallelFactorize:
number = 1
repeat = 5
params = [2, 4, 8]
- param_names = ['threads']
+ param_names = ["threads"]
def setup(self, threads):
if not have_real_test_parallel:
@@ -255,10 +285,12 @@ def setup(self, threads):
@test_parallel(num_threads=threads)
def parallel():
factorize(strings)
+
self.parallel = parallel
def loop():
factorize(strings)
+
self.loop = loop
def time_parallel(self, threads):
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 3097ada6d2022..39b07d4734399 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -5,18 +5,55 @@
import numpy as np
from pandas import (
- Categorical, DataFrame, MultiIndex, Series, Timestamp,
- date_range, period_range)
+ Categorical,
+ DataFrame,
+ MultiIndex,
+ Series,
+ Timestamp,
+ date_range,
+ period_range,
+)
import pandas.util.testing as tm
method_blacklist = {
- 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
- 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
- 'var', 'mad', 'describe', 'std', 'quantile'},
- 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
- 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
- 'std'}
+ "object": {
+ "median",
+ "prod",
+ "sem",
+ "cumsum",
+ "sum",
+ "cummin",
+ "mean",
+ "max",
+ "skew",
+ "cumprod",
+ "cummax",
+ "rank",
+ "pct_change",
+ "min",
+ "var",
+ "mad",
+ "describe",
+ "std",
+ "quantile",
+ },
+ "datetime": {
+ "median",
+ "prod",
+ "sem",
+ "cumsum",
+ "sum",
+ "mean",
+ "skew",
+ "cumprod",
+ "cummax",
+ "pct_change",
+ "var",
+ "mad",
+ "describe",
+ "std",
+ },
}
@@ -26,28 +63,31 @@ def setup(self):
self.data = Series(np.random.randn(len(self.labels)))
def time_groupby_apply_dict_return(self):
- self.data.groupby(self.labels).apply(lambda x: {'first': x.values[0],
- 'last': x.values[-1]})
+ self.data.groupby(self.labels).apply(
+ lambda x: {"first": x.values[0], "last": x.values[-1]}
+ )
class Apply:
-
def setup_cache(self):
- N = 10**4
+ N = 10 ** 4
labels = np.random.randint(0, 2000, size=N)
labels2 = np.random.randint(0, 3, size=N)
- df = DataFrame({'key': labels,
- 'key2': labels2,
- 'value1': np.random.randn(N),
- 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)
- })
+ df = DataFrame(
+ {
+ "key": labels,
+ "key2": labels2,
+ "value1": np.random.randn(N),
+ "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
+ }
+ )
return df
def time_scalar_function_multi_col(self, df):
- df.groupby(['key', 'key2']).apply(lambda x: 1)
+ df.groupby(["key", "key2"]).apply(lambda x: 1)
def time_scalar_function_single_col(self, df):
- df.groupby('key').apply(lambda x: 1)
+ df.groupby("key").apply(lambda x: 1)
@staticmethod
def df_copy_function(g):
@@ -56,27 +96,29 @@ def df_copy_function(g):
return g.copy()
def time_copy_function_multi_col(self, df):
- df.groupby(['key', 'key2']).apply(self.df_copy_function)
+ df.groupby(["key", "key2"]).apply(self.df_copy_function)
def time_copy_overhead_single_col(self, df):
- df.groupby('key').apply(self.df_copy_function)
+ df.groupby("key").apply(self.df_copy_function)
class Groups:
- param_names = ['key']
- params = ['int64_small', 'int64_large', 'object_small', 'object_large']
+ param_names = ["key"]
+ params = ["int64_small", "int64_large", "object_small", "object_large"]
def setup_cache(self):
- size = 10**6
- data = {'int64_small': Series(np.random.randint(0, 100, size=size)),
- 'int64_large': Series(np.random.randint(0, 10000, size=size)),
- 'object_small': Series(
- tm.makeStringIndex(100).take(
- np.random.randint(0, 100, size=size))),
- 'object_large': Series(
- tm.makeStringIndex(10000).take(
- np.random.randint(0, 10000, size=size)))}
+ size = 10 ** 6
+ data = {
+ "int64_small": Series(np.random.randint(0, 100, size=size)),
+ "int64_large": Series(np.random.randint(0, 10000, size=size)),
+ "object_small": Series(
+ tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))
+ ),
+ "object_large": Series(
+ tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))
+ ),
+ }
return data
def setup(self, data, key):
@@ -89,7 +131,7 @@ def time_series_groups(self, data, key):
class GroupManyLabels:
params = [1, 1000]
- param_names = ['ncols']
+ param_names = ["ncols"]
def setup(self, ncols):
N = 1000
@@ -103,46 +145,45 @@ def time_sum(self, ncols):
class Nth:
- param_names = ['dtype']
- params = ['float32', 'float64', 'datetime', 'object']
+ param_names = ["dtype"]
+ params = ["float32", "float64", "datetime", "object"]
def setup(self, dtype):
- N = 10**5
+ N = 10 ** 5
# with datetimes (GH7555)
- if dtype == 'datetime':
- values = date_range('1/1/2011', periods=N, freq='s')
- elif dtype == 'object':
- values = ['foo'] * N
+ if dtype == "datetime":
+ values = date_range("1/1/2011", periods=N, freq="s")
+ elif dtype == "object":
+ values = ["foo"] * N
else:
values = np.arange(N).astype(dtype)
key = np.arange(N)
- self.df = DataFrame({'key': key, 'values': values})
+ self.df = DataFrame({"key": key, "values": values})
self.df.iloc[1, 1] = np.nan # insert missing data
def time_frame_nth_any(self, dtype):
- self.df.groupby('key').nth(0, dropna='any')
+ self.df.groupby("key").nth(0, dropna="any")
def time_groupby_nth_all(self, dtype):
- self.df.groupby('key').nth(0, dropna='all')
+ self.df.groupby("key").nth(0, dropna="all")
def time_frame_nth(self, dtype):
- self.df.groupby('key').nth(0)
+ self.df.groupby("key").nth(0)
def time_series_nth_any(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
+ self.df["values"].groupby(self.df["key"]).nth(0, dropna="any")
def time_series_nth_all(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
+ self.df["values"].groupby(self.df["key"]).nth(0, dropna="all")
def time_series_nth(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0)
+ self.df["values"].groupby(self.df["key"]).nth(0)
class DateAttributes:
-
def setup(self):
- rng = date_range('1/1/2000', '12/31/2005', freq='H')
+ rng = date_range("1/1/2000", "12/31/2005", freq="H")
self.year, self.month, self.day = rng.year, rng.month, rng.day
self.ts = Series(np.random.randn(len(rng)), index=rng)
@@ -151,154 +192,167 @@ def time_len_groupby_object(self):
class Int64:
-
def setup(self):
arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5))
i = np.random.choice(len(arr), len(arr) * 5)
arr = np.vstack((arr, arr[i]))
i = np.random.permutation(len(arr))
arr = arr[i]
- self.cols = list('abcde')
+ self.cols = list("abcde")
self.df = DataFrame(arr, columns=self.cols)
- self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10
+ self.df["jim"], self.df["joe"] = np.random.randn(2, len(self.df)) * 10
def time_overflow(self):
self.df.groupby(self.cols).max()
class CountMultiDtype:
-
def setup_cache(self):
n = 10000
- offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
- dates = np.datetime64('now') + offsets
- dates[np.random.rand(n) > 0.5] = np.datetime64('nat')
- offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat')
+ offsets = np.random.randint(n, size=n).astype("timedelta64[ns]")
+ dates = np.datetime64("now") + offsets
+ dates[np.random.rand(n) > 0.5] = np.datetime64("nat")
+ offsets[np.random.rand(n) > 0.5] = np.timedelta64("nat")
value2 = np.random.randn(n)
value2[np.random.rand(n) > 0.5] = np.nan
- obj = np.random.choice(list('ab'), size=n).astype(object)
+ obj = np.random.choice(list("ab"), size=n).astype(object)
obj[np.random.randn(n) > 0.5] = np.nan
- df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'dates': dates,
- 'value2': value2,
- 'value3': np.random.randn(n),
- 'ints': np.random.randint(0, 1000, size=n),
- 'obj': obj,
- 'offsets': offsets})
+ df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "dates": dates,
+ "value2": value2,
+ "value3": np.random.randn(n),
+ "ints": np.random.randint(0, 1000, size=n),
+ "obj": obj,
+ "offsets": offsets,
+ }
+ )
return df
def time_multi_count(self, df):
- df.groupby(['key1', 'key2']).count()
+ df.groupby(["key1", "key2"]).count()
class CountMultiInt:
-
def setup_cache(self):
n = 10000
- df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'ints': np.random.randint(0, 1000, size=n),
- 'ints2': np.random.randint(0, 1000, size=n)})
+ df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "ints": np.random.randint(0, 1000, size=n),
+ "ints2": np.random.randint(0, 1000, size=n),
+ }
+ )
return df
def time_multi_int_count(self, df):
- df.groupby(['key1', 'key2']).count()
+ df.groupby(["key1", "key2"]).count()
def time_multi_int_nunique(self, df):
- df.groupby(['key1', 'key2']).nunique()
+ df.groupby(["key1", "key2"]).nunique()
class AggFunctions:
-
def setup_cache(self):
- N = 10**5
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
- df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)),
- 'key2': fac2.take(np.random.randint(0, 2, size=N)),
- 'value1': np.random.randn(N),
- 'value2': np.random.randn(N),
- 'value3': np.random.randn(N)})
+ N = 10 ** 5
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
+ df = DataFrame(
+ {
+ "key1": fac1.take(np.random.randint(0, 3, size=N)),
+ "key2": fac2.take(np.random.randint(0, 2, size=N)),
+ "value1": np.random.randn(N),
+ "value2": np.random.randn(N),
+ "value3": np.random.randn(N),
+ }
+ )
return df
def time_different_str_functions(self, df):
- df.groupby(['key1', 'key2']).agg({'value1': 'mean',
- 'value2': 'var',
- 'value3': 'sum'})
+ df.groupby(["key1", "key2"]).agg(
+ {"value1": "mean", "value2": "var", "value3": "sum"}
+ )
def time_different_numpy_functions(self, df):
- df.groupby(['key1', 'key2']).agg({'value1': np.mean,
- 'value2': np.var,
- 'value3': np.sum})
+ df.groupby(["key1", "key2"]).agg(
+ {"value1": np.mean, "value2": np.var, "value3": np.sum}
+ )
def time_different_python_functions_multicol(self, df):
- df.groupby(['key1', 'key2']).agg([sum, min, max])
+ df.groupby(["key1", "key2"]).agg([sum, min, max])
def time_different_python_functions_singlecol(self, df):
- df.groupby('key1').agg([sum, min, max])
+ df.groupby("key1").agg([sum, min, max])
class GroupStrings:
-
def setup(self):
- n = 2 * 10**5
- alpha = list(map(''.join, product(ascii_letters, repeat=4)))
+ n = 2 * 10 ** 5
+ alpha = list(map("".join, product(ascii_letters, repeat=4)))
data = np.random.choice(alpha, (n // 5, 4), replace=False)
data = np.repeat(data, 5, axis=0)
- self.df = DataFrame(data, columns=list('abcd'))
- self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3)
+ self.df = DataFrame(data, columns=list("abcd"))
+ self.df["joe"] = (np.random.randn(len(self.df)) * 10).round(3)
self.df = self.df.sample(frac=1).reset_index(drop=True)
def time_multi_columns(self):
- self.df.groupby(list('abcd')).max()
+ self.df.groupby(list("abcd")).max()
class MultiColumn:
-
def setup_cache(self):
- N = 10**5
+ N = 10 ** 5
key1 = np.tile(np.arange(100, dtype=object), 1000)
key2 = key1.copy()
np.random.shuffle(key1)
np.random.shuffle(key2)
- df = DataFrame({'key1': key1,
- 'key2': key2,
- 'data1': np.random.randn(N),
- 'data2': np.random.randn(N)})
+ df = DataFrame(
+ {
+ "key1": key1,
+ "key2": key2,
+ "data1": np.random.randn(N),
+ "data2": np.random.randn(N),
+ }
+ )
return df
def time_lambda_sum(self, df):
- df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())
+ df.groupby(["key1", "key2"]).agg(lambda x: x.values.sum())
def time_cython_sum(self, df):
- df.groupby(['key1', 'key2']).sum()
+ df.groupby(["key1", "key2"]).sum()
def time_col_select_lambda_sum(self, df):
- df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())
+ df.groupby(["key1", "key2"])["data1"].agg(lambda x: x.values.sum())
def time_col_select_numpy_sum(self, df):
- df.groupby(['key1', 'key2'])['data1'].agg(np.sum)
+ df.groupby(["key1", "key2"])["data1"].agg(np.sum)
class Size:
-
def setup(self):
- n = 10**5
- offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
- dates = np.datetime64('now') + offsets
- self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'value1': np.random.randn(n),
- 'value2': np.random.randn(n),
- 'value3': np.random.randn(n),
- 'dates': dates})
+ n = 10 ** 5
+ offsets = np.random.randint(n, size=n).astype("timedelta64[ns]")
+ dates = np.datetime64("now") + offsets
+ self.df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "value1": np.random.randn(n),
+ "value2": np.random.randn(n),
+ "value3": np.random.randn(n),
+ "dates": dates,
+ }
+ )
self.draws = Series(np.random.randn(n))
- labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
- self.cats = labels.astype('category')
+ labels = Series(["foo", "bar", "baz", "qux"] * (n // 4))
+ self.cats = labels.astype("category")
def time_multi_size(self):
- self.df.groupby(['key1', 'key2']).size()
+ self.df.groupby(["key1", "key2"]).size()
def time_category_size(self):
self.draws.groupby(self.cats).size()
@@ -306,15 +360,47 @@ def time_category_size(self):
class GroupByMethods:
- param_names = ['dtype', 'method', 'application']
- params = [['int', 'float', 'object', 'datetime'],
- ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
- 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
- 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
- 'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift',
- 'size', 'skew', 'std', 'sum', 'tail', 'unique', 'value_counts',
- 'var'],
- ['direct', 'transformation']]
+ param_names = ["dtype", "method", "application"]
+ params = [
+ ["int", "float", "object", "datetime"],
+ [
+ "all",
+ "any",
+ "bfill",
+ "count",
+ "cumcount",
+ "cummax",
+ "cummin",
+ "cumprod",
+ "cumsum",
+ "describe",
+ "ffill",
+ "first",
+ "head",
+ "last",
+ "mad",
+ "max",
+ "min",
+ "median",
+ "mean",
+ "nunique",
+ "pct_change",
+ "prod",
+ "quantile",
+ "rank",
+ "sem",
+ "shift",
+ "size",
+ "skew",
+ "std",
+ "sum",
+ "tail",
+ "unique",
+ "value_counts",
+ "var",
+ ],
+ ["direct", "transformation"],
+ ]
def setup(self, dtype, method, application):
if method in method_blacklist.get(dtype, {}):
@@ -323,29 +409,28 @@ def setup(self, dtype, method, application):
size = ngroups * 2
rng = np.arange(ngroups)
values = rng.take(np.random.randint(0, ngroups, size=size))
- if dtype == 'int':
+ if dtype == "int":
key = np.random.randint(0, size, size=size)
- elif dtype == 'float':
- key = np.concatenate([np.random.random(ngroups) * 0.1,
- np.random.random(ngroups) * 10.0])
- elif dtype == 'object':
- key = ['foo'] * size
- elif dtype == 'datetime':
- key = date_range('1/1/2011', periods=size, freq='s')
-
- df = DataFrame({'values': values, 'key': key})
-
- if application == 'transform':
- if method == 'describe':
+ elif dtype == "float":
+ key = np.concatenate(
+ [np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0]
+ )
+ elif dtype == "object":
+ key = ["foo"] * size
+ elif dtype == "datetime":
+ key = date_range("1/1/2011", periods=size, freq="s")
+
+ df = DataFrame({"values": values, "key": key})
+
+        if application == "transformation":
+ if method == "describe":
raise NotImplementedError
- self.as_group_method = lambda: df.groupby(
- 'key')['values'].transform(method)
- self.as_field_method = lambda: df.groupby(
- 'values')['key'].transform(method)
+ self.as_group_method = lambda: df.groupby("key")["values"].transform(method)
+ self.as_field_method = lambda: df.groupby("values")["key"].transform(method)
else:
- self.as_group_method = getattr(df.groupby('key')['values'], method)
- self.as_field_method = getattr(df.groupby('values')['key'], method)
+ self.as_group_method = getattr(df.groupby("key")["values"], method)
+ self.as_field_method = getattr(df.groupby("values")["key"], method)
def time_dtype_as_group(self, dtype, method, application):
self.as_group_method()
@@ -356,20 +441,22 @@ def time_dtype_as_field(self, dtype, method, application):
class RankWithTies:
# GH 21237
- param_names = ['dtype', 'tie_method']
- params = [['float64', 'float32', 'int64', 'datetime64'],
- ['first', 'average', 'dense', 'min', 'max']]
+ param_names = ["dtype", "tie_method"]
+ params = [
+ ["float64", "float32", "int64", "datetime64"],
+ ["first", "average", "dense", "min", "max"],
+ ]
def setup(self, dtype, tie_method):
- N = 10**4
- if dtype == 'datetime64':
+ N = 10 ** 4
+ if dtype == "datetime64":
data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
else:
data = np.array([1] * N, dtype=dtype)
- self.df = DataFrame({'values': data, 'key': ['foo'] * N})
+ self.df = DataFrame({"values": data, "key": ["foo"] * N})
def time_rank_ties(self, dtype, tie_method):
- self.df.groupby('key').rank(method=tie_method)
+ self.df.groupby("key").rank(method=tie_method)
class Float32:
@@ -382,57 +469,61 @@ def setup(self):
self.df = DataFrame(dict(a=arr, b=arr))
def time_sum(self):
- self.df.groupby(['a'])['b'].sum()
+ self.df.groupby(["a"])["b"].sum()
class Categories:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
arr = np.random.random(N)
- data = {'a': Categorical(np.random.randint(10000, size=N)),
- 'b': arr}
+ data = {"a": Categorical(np.random.randint(10000, size=N)), "b": arr}
self.df = DataFrame(data)
- data = {'a': Categorical(np.random.randint(10000, size=N),
- ordered=True),
- 'b': arr}
+ data = {
+ "a": Categorical(np.random.randint(10000, size=N), ordered=True),
+ "b": arr,
+ }
self.df_ordered = DataFrame(data)
- data = {'a': Categorical(np.random.randint(100, size=N),
- categories=np.arange(10000)),
- 'b': arr}
+ data = {
+ "a": Categorical(
+ np.random.randint(100, size=N), categories=np.arange(10000)
+ ),
+ "b": arr,
+ }
self.df_extra_cat = DataFrame(data)
def time_groupby_sort(self):
- self.df.groupby('a')['b'].count()
+ self.df.groupby("a")["b"].count()
def time_groupby_nosort(self):
- self.df.groupby('a', sort=False)['b'].count()
+ self.df.groupby("a", sort=False)["b"].count()
def time_groupby_ordered_sort(self):
- self.df_ordered.groupby('a')['b'].count()
+ self.df_ordered.groupby("a")["b"].count()
def time_groupby_ordered_nosort(self):
- self.df_ordered.groupby('a', sort=False)['b'].count()
+ self.df_ordered.groupby("a", sort=False)["b"].count()
def time_groupby_extra_cat_sort(self):
- self.df_extra_cat.groupby('a')['b'].count()
+ self.df_extra_cat.groupby("a")["b"].count()
def time_groupby_extra_cat_nosort(self):
- self.df_extra_cat.groupby('a', sort=False)['b'].count()
+ self.df_extra_cat.groupby("a", sort=False)["b"].count()
class Datelike:
# GH 14338
- params = ['period_range', 'date_range', 'date_range_tz']
- param_names = ['grouper']
+ params = ["period_range", "date_range", "date_range_tz"]
+ param_names = ["grouper"]
def setup(self, grouper):
- N = 10**4
- rng_map = {'period_range': period_range,
- 'date_range': date_range,
- 'date_range_tz': partial(date_range, tz='US/Central')}
- self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N)
- self.df = DataFrame(np.random.randn(10**4, 2))
+ N = 10 ** 4
+ rng_map = {
+ "period_range": period_range,
+ "date_range": date_range,
+ "date_range_tz": partial(date_range, tz="US/Central"),
+ }
+ self.grouper = rng_map[grouper]("1900-01-01", freq="D", periods=N)
+ self.df = DataFrame(np.random.randn(10 ** 4, 2))
def time_sum(self, grouper):
self.df.groupby(self.grouper).sum()
@@ -442,11 +533,10 @@ class SumBools:
# GH 2692
def setup(self):
N = 500
- self.df = DataFrame({'ii': range(N),
- 'bb': [True] * N})
+ self.df = DataFrame({"ii": range(N), "bb": [True] * N})
def time_groupby_sum_booleans(self):
- self.df.groupby('ii').sum()
+ self.df.groupby("ii").sum()
class SumMultiLevel:
@@ -455,84 +545,85 @@ class SumMultiLevel:
def setup(self):
N = 50
- self.df = DataFrame({'A': list(range(N)) * 2,
- 'B': range(N * 2),
- 'C': 1}).set_index(['A', 'B'])
+ self.df = DataFrame(
+ {"A": list(range(N)) * 2, "B": range(N * 2), "C": 1}
+ ).set_index(["A", "B"])
def time_groupby_sum_multiindex(self):
self.df.groupby(level=[0, 1]).sum()
class Transform:
-
def setup(self):
n1 = 400
n2 = 250
- index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)],
- codes=[np.repeat(range(n1), n2).tolist(),
- list(range(n2)) * n1],
- names=['lev1', 'lev2'])
+ index = MultiIndex(
+ levels=[np.arange(n1), tm.makeStringIndex(n2)],
+ codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1],
+ names=["lev1", "lev2"],
+ )
arr = np.random.randn(n1 * n2, 3)
arr[::10000, 0] = np.nan
arr[1::10000, 1] = np.nan
arr[2::10000, 2] = np.nan
- data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3'])
+ data = DataFrame(arr, index=index, columns=["col1", "col20", "col3"])
self.df = data
n = 20000
- self.df1 = DataFrame(np.random.randint(1, n, (n, 3)),
- columns=['jim', 'joe', 'jolie'])
+ self.df1 = DataFrame(
+ np.random.randint(1, n, (n, 3)), columns=["jim", "joe", "jolie"]
+ )
self.df2 = self.df1.copy()
- self.df2['jim'] = self.df2['joe']
+ self.df2["jim"] = self.df2["joe"]
- self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)),
- columns=['jim', 'joe', 'jolie'])
+ self.df3 = DataFrame(
+ np.random.randint(1, (n / 10), (n, 3)), columns=["jim", "joe", "jolie"]
+ )
self.df4 = self.df3.copy()
- self.df4['jim'] = self.df4['joe']
+ self.df4["jim"] = self.df4["joe"]
def time_transform_lambda_max(self):
- self.df.groupby(level='lev1').transform(lambda x: max(x))
+ self.df.groupby(level="lev1").transform(lambda x: max(x))
def time_transform_ufunc_max(self):
- self.df.groupby(level='lev1').transform(np.max)
+ self.df.groupby(level="lev1").transform(np.max)
def time_transform_multi_key1(self):
- self.df1.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df1.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key2(self):
- self.df2.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df2.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key3(self):
- self.df3.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df3.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key4(self):
- self.df4.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df4.groupby(["jim", "joe"])["jolie"].transform("max")
class TransformBools:
-
def setup(self):
N = 120000
transition_points = np.sort(np.random.choice(np.arange(N), 1400))
transitions = np.zeros(N, dtype=np.bool)
transitions[transition_points] = True
self.g = transitions.cumsum()
- self.df = DataFrame({'signal': np.random.rand(N)})
+ self.df = DataFrame({"signal": np.random.rand(N)})
def time_transform_mean(self):
- self.df['signal'].groupby(self.g).transform(np.mean)
+ self.df["signal"].groupby(self.g).transform(np.mean)
class TransformNaN:
# GH 12737
def setup(self):
- self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10),
- 'B': np.nan,
- 'C': np.nan})
- self.df_nans.loc[4::10, 'B':'C'] = 5
+ self.df_nans = DataFrame(
+ {"key": np.repeat(np.arange(1000), 10), "B": np.nan, "C": np.nan}
+ )
+ self.df_nans.loc[4::10, "B":"C"] = 5
def time_first(self):
- self.df_nans.groupby('key').transform('first')
+ self.df_nans.groupby("key").transform("first")
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/index_cached_properties.py b/asv_bench/benchmarks/index_cached_properties.py
new file mode 100644
index 0000000000000..13b33855569c9
--- /dev/null
+++ b/asv_bench/benchmarks/index_cached_properties.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+
+class IndexCache:
+ number = 1
+ repeat = (3, 100, 20)
+
+ params = [
+ [
+ "DatetimeIndex",
+ "Float64Index",
+ "IntervalIndex",
+ "Int64Index",
+ "MultiIndex",
+ "PeriodIndex",
+ "RangeIndex",
+ "TimedeltaIndex",
+ "UInt64Index",
+ ]
+ ]
+ param_names = ["index_type"]
+
+ def setup(self, index_type):
+ N = 10 ** 5
+ if index_type == "MultiIndex":
+ self.idx = pd.MultiIndex.from_product(
+ [pd.date_range("1/1/2000", freq="T", periods=N // 2), ["a", "b"]]
+ )
+ elif index_type == "DatetimeIndex":
+ self.idx = pd.date_range("1/1/2000", freq="T", periods=N)
+ elif index_type == "Int64Index":
+ self.idx = pd.Index(range(N))
+ elif index_type == "PeriodIndex":
+ self.idx = pd.period_range("1/1/2000", freq="T", periods=N)
+ elif index_type == "RangeIndex":
+ self.idx = pd.RangeIndex(start=0, stop=N)
+ elif index_type == "IntervalIndex":
+ self.idx = pd.IntervalIndex.from_arrays(range(N), range(1, N + 1))
+ elif index_type == "TimedeltaIndex":
+ self.idx = pd.TimedeltaIndex(range(N))
+ elif index_type == "Float64Index":
+ self.idx = pd.Float64Index(range(N))
+ elif index_type == "UInt64Index":
+ self.idx = pd.UInt64Index(range(N))
+ else:
+ raise ValueError
+ assert len(self.idx) == N
+ self.idx._cache = {}
+
+ def time_values(self, index_type):
+ self.idx._values
+
+ def time_shape(self, index_type):
+ self.idx.shape
+
+ def time_is_monotonic(self, index_type):
+ self.idx.is_monotonic
+
+ def time_is_monotonic_decreasing(self, index_type):
+ self.idx.is_monotonic_decreasing
+
+ def time_is_monotonic_increasing(self, index_type):
+ self.idx.is_monotonic_increasing
+
+ def time_is_unique(self, index_type):
+ self.idx.is_unique
+
+ def time_engine(self, index_type):
+ self.idx._engine
+
+ def time_inferred_type(self, index_type):
+ self.idx.inferred_type
+
+ def time_is_all_dates(self, index_type):
+ self.idx.is_all_dates
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
index d0aced87c54c6..49834ae94cc38 100644
--- a/asv_bench/benchmarks/index_object.py
+++ b/asv_bench/benchmarks/index_object.py
@@ -1,38 +1,48 @@
+import gc
import numpy as np
import pandas.util.testing as tm
-from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex,
- Float64Index, IntervalIndex)
+from pandas import (
+ Series,
+ date_range,
+ DatetimeIndex,
+ Index,
+ RangeIndex,
+ Float64Index,
+ IntervalIndex,
+)
class SetOperations:
- params = (['datetime', 'date_string', 'int', 'strings'],
- ['intersection', 'union', 'symmetric_difference'])
- param_names = ['dtype', 'method']
+ params = (
+ ["datetime", "date_string", "int", "strings"],
+ ["intersection", "union", "symmetric_difference"],
+ )
+ param_names = ["dtype", "method"]
def setup(self, dtype, method):
- N = 10**5
- dates_left = date_range('1/1/2000', periods=N, freq='T')
- fmt = '%Y-%m-%d %H:%M:%S'
+ N = 10 ** 5
+ dates_left = date_range("1/1/2000", periods=N, freq="T")
+ fmt = "%Y-%m-%d %H:%M:%S"
date_str_left = Index(dates_left.strftime(fmt))
int_left = Index(np.arange(N))
str_left = tm.makeStringIndex(N)
- data = {'datetime': {'left': dates_left, 'right': dates_left[:-1]},
- 'date_string': {'left': date_str_left,
- 'right': date_str_left[:-1]},
- 'int': {'left': int_left, 'right': int_left[:-1]},
- 'strings': {'left': str_left, 'right': str_left[:-1]}}
- self.left = data[dtype]['left']
- self.right = data[dtype]['right']
+ data = {
+ "datetime": {"left": dates_left, "right": dates_left[:-1]},
+ "date_string": {"left": date_str_left, "right": date_str_left[:-1]},
+ "int": {"left": int_left, "right": int_left[:-1]},
+ "strings": {"left": str_left, "right": str_left[:-1]},
+ }
+ self.left = data[dtype]["left"]
+ self.right = data[dtype]["right"]
def time_operation(self, dtype, method):
getattr(self.left, method)(self.right)
class SetDisjoint:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
B = N + 20000
self.datetime_left = DatetimeIndex(range(N))
self.datetime_right = DatetimeIndex(range(N, B))
@@ -42,9 +52,8 @@ def time_datetime_difference_disjoint(self):
class Datetime:
-
def setup(self):
- self.dr = date_range('20000101', freq='D', periods=10000)
+ self.dr = date_range("20000101", freq="D", periods=10000)
def time_is_dates_only(self):
self.dr._is_dates_only
@@ -52,12 +61,12 @@ def time_is_dates_only(self):
class Ops:
- params = ['float', 'int']
- param_names = ['dtype']
+ params = ["float", "int"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- indexes = {'int': 'makeIntIndex', 'float': 'makeFloatIndex'}
+ N = 10 ** 6
+ indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"}
self.index = getattr(tm, indexes[dtype])(N)
def time_add(self, dtype):
@@ -77,10 +86,9 @@ def time_modulo(self, dtype):
class Range:
-
def setup(self):
- self.idx_inc = RangeIndex(start=0, stop=10**7, step=3)
- self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3)
+ self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3)
+ self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3)
def time_max(self):
self.idx_inc.max()
@@ -102,7 +110,6 @@ def time_get_loc_dec(self):
class IndexAppend:
-
def setup(self):
N = 10000
@@ -132,19 +139,20 @@ def time_append_obj_list(self):
class Indexing:
- params = ['String', 'Float', 'Int']
- param_names = ['dtype']
+ params = ["String", "Float", "Int"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- self.idx = getattr(tm, 'make{}Index'.format(dtype))(N)
+ N = 10 ** 6
+ self.idx = getattr(tm, "make{}Index".format(dtype))(N)
self.array_mask = (np.arange(N) % 3) == 0
self.series_mask = Series(self.array_mask)
self.sorted = self.idx.sort_values()
half = N // 2
self.non_unique = self.idx[:half].append(self.idx[:half])
- self.non_unique_sorted = (self.sorted[:half].append(self.sorted[:half])
- .sort_values())
+ self.non_unique_sorted = (
+ self.sorted[:half].append(self.sorted[:half]).sort_values()
+ )
self.key = self.sorted[N // 4]
def time_boolean_array(self, dtype):
@@ -188,7 +196,7 @@ def time_get_loc(self):
class IntervalIndexMethod:
# GH 24813
- params = [10**3, 10**5]
+ params = [10 ** 3, 10 ** 5]
def setup(self, N):
left = np.append(np.arange(N), np.array(0))
@@ -196,6 +204,9 @@ def setup(self, N):
self.intv = IntervalIndex.from_arrays(left, right)
self.intv._engine
+ self.intv2 = IntervalIndex.from_arrays(left + 1, right + 1)
+ self.intv2._engine
+
self.left = IntervalIndex.from_breaks(np.arange(N))
self.right = IntervalIndex.from_breaks(np.arange(N - 3, 2 * N - 3))
@@ -208,8 +219,28 @@ def time_is_unique(self, N):
def time_intersection(self, N):
self.left.intersection(self.right)
- def time_intersection_duplicate(self, N):
+ def time_intersection_one_duplicate(self, N):
self.intv.intersection(self.right)
+ def time_intersection_both_duplicate(self, N):
+ self.intv.intersection(self.intv2)
+
+
+class GC:
+ params = [1, 2, 5]
+
+ def create_use_drop(self):
+ idx = Index(list(range(1000 * 1000)))
+ idx._engine
+
+ def peakmem_gc_instances(self, N):
+ try:
+ gc.disable()
+
+ for _ in range(N):
+ self.create_use_drop()
+ finally:
+ gc.enable()
+
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 4c932cf3600e8..84604b8196536 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -2,26 +2,38 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (Series, DataFrame, MultiIndex,
- Int64Index, UInt64Index, Float64Index,
- IntervalIndex, CategoricalIndex,
- IndexSlice, concat, date_range)
+from pandas import (
+ Series,
+ DataFrame,
+ MultiIndex,
+ Int64Index,
+ UInt64Index,
+ Float64Index,
+ IntervalIndex,
+ CategoricalIndex,
+ IndexSlice,
+ concat,
+ date_range,
+ option_context,
+ period_range,
+)
class NumericSeriesIndexing:
params = [
(Int64Index, UInt64Index, Float64Index),
- ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+ ("unique_monotonic_inc", "nonunique_monotonic_inc"),
]
- param_names = ['index_dtype', 'index_structure']
+ param_names = ["index_dtype", "index_structure"]
def setup(self, index, index_structure):
- N = 10**6
+ N = 10 ** 6
indices = {
- 'unique_monotonic_inc': index(range(N)),
- 'nonunique_monotonic_inc': index(
- list(range(55)) + [54] + list(range(55, N - 1))),
+ "unique_monotonic_inc": index(range(N)),
+ "nonunique_monotonic_inc": index(
+ list(range(55)) + [54] + list(range(55, N - 1))
+ ),
}
self.data = Series(np.random.rand(N), index=indices[index_structure])
self.array = np.arange(10000)
@@ -55,16 +67,20 @@ def time_iloc_slice(self, index, index_structure):
self.data.iloc[:800000]
def time_ix_array(self, index, index_structure):
- self.data.ix[self.array]
+ with warnings.catch_warnings(record=True):
+ self.data.ix[self.array]
def time_ix_list_like(self, index, index_structure):
- self.data.ix[[800000]]
+ with warnings.catch_warnings(record=True):
+ self.data.ix[[800000]]
def time_ix_scalar(self, index, index_structure):
- self.data.ix[800000]
+ with warnings.catch_warnings(record=True):
+ self.data.ix[800000]
def time_ix_slice(self, index, index_structure):
- self.data.ix[:800000]
+ with warnings.catch_warnings(record=True):
+ self.data.ix[:800000]
def time_loc_array(self, index, index_structure):
self.data.loc[self.array]
@@ -82,31 +98,37 @@ def time_loc_slice(self, index, index_structure):
class NonNumericSeriesIndexing:
params = [
- ('string', 'datetime'),
- ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+ ("string", "datetime", "period"),
+ ("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"),
]
- param_names = ['index_dtype', 'index_structure']
+ param_names = ["index_dtype", "index_structure"]
def setup(self, index, index_structure):
- N = 10**6
- indexes = {'string': tm.makeStringIndex(N),
- 'datetime': date_range('1900', periods=N, freq='s')}
- index = indexes[index]
- if index_structure == 'nonunique_monotonic_inc':
+ N = 10 ** 6
+ if index == "string":
+ index = tm.makeStringIndex(N)
+ elif index == "datetime":
+ index = date_range("1900", periods=N, freq="s")
+ elif index == "period":
+ index = period_range("1900", periods=N, freq="s")
+ index = index.sort_values()
+ assert index.is_unique and index.is_monotonic_increasing
+ if index_structure == "nonunique_monotonic_inc":
index = index.insert(item=index[2], loc=2)[:-1]
+ elif index_structure == "non_monotonic":
+ index = index[::2].append(index[1::2])
+ assert len(index) == N
self.s = Series(np.random.rand(N), index=index)
self.lbl = index[80000]
+ # warm up index mapping
+ self.s[self.lbl]
def time_getitem_label_slice(self, index, index_structure):
- self.s[:self.lbl]
+ self.s[: self.lbl]
def time_getitem_pos_slice(self, index, index_structure):
self.s[:80000]
- def time_get_value(self, index, index_structure):
- with warnings.catch_warnings(record=True):
- self.s.get_value(self.lbl)
-
def time_getitem_scalar(self, index, index_structure):
self.s[self.lbl]
@@ -115,23 +137,19 @@ def time_getitem_list_like(self, index, index_structure):
class DataFrameStringIndexing:
-
def setup(self):
index = tm.makeStringIndex(1000)
columns = tm.makeStringIndex(30)
- self.df = DataFrame(np.random.randn(1000, 30), index=index,
- columns=columns)
+ with warnings.catch_warnings(record=True):
+ self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns)
self.idx_scalar = index[100]
self.col_scalar = columns[10]
self.bool_indexer = self.df[self.col_scalar] > 0
self.bool_obj_indexer = self.bool_indexer.astype(object)
- def time_get_value(self):
- with warnings.catch_warnings(record=True):
- self.df.get_value(self.idx_scalar, self.col_scalar)
-
def time_ix(self):
- self.df.ix[self.idx_scalar, self.col_scalar]
+ with warnings.catch_warnings(record=True):
+ self.df.ix[self.idx_scalar, self.col_scalar]
def time_loc(self):
self.df.loc[self.idx_scalar, self.col_scalar]
@@ -147,7 +165,6 @@ def time_boolean_rows_object(self):
class DataFrameNumericIndexing:
-
def setup(self):
self.idx_dupe = np.array(range(30)) * 99
self.df = DataFrame(np.random.randn(10000, 5))
@@ -172,13 +189,15 @@ def time_bool_indexer(self):
class Take:
- params = ['int', 'datetime']
- param_names = ['index']
+ params = ["int", "datetime"]
+ param_names = ["index"]
def setup(self, index):
N = 100000
- indexes = {'int': Int64Index(np.arange(N)),
- 'datetime': date_range('2011-01-01', freq='S', periods=N)}
+ indexes = {
+ "int": Int64Index(np.arange(N)),
+ "datetime": date_range("2011-01-01", freq="S", periods=N),
+ }
index = indexes[index]
self.s = Series(np.random.rand(N), index=index)
self.indexer = [True, False, True, True, False] * 20000
@@ -188,35 +207,39 @@ def time_take(self, index):
class MultiIndexing:
-
def setup(self):
mi = MultiIndex.from_product([range(1000), range(1000)])
self.s = Series(np.random.randn(1000000), index=mi)
self.df = DataFrame(self.s)
n = 100000
- self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000),
- n),
- 'B': np.random.choice(range(10, 400), n),
- 'C': np.random.choice(range(1, 150), n),
- 'D': np.random.choice(range(10000, 45000), n),
- 'x': np.random.choice(range(400), n),
- 'y': np.random.choice(range(25), n)})
+ with warnings.catch_warnings(record=True):
+ self.mdt = DataFrame(
+ {
+ "A": np.random.choice(range(10000, 45000, 1000), n),
+ "B": np.random.choice(range(10, 400), n),
+ "C": np.random.choice(range(1, 150), n),
+ "D": np.random.choice(range(10000, 45000), n),
+ "x": np.random.choice(range(400), n),
+ "y": np.random.choice(range(25), n),
+ }
+ )
self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
- self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index()
+ self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
def time_series_ix(self):
- self.s.ix[999]
+ with warnings.catch_warnings(record=True):
+ self.s.ix[999]
def time_frame_ix(self):
- self.df.ix[999]
+ with warnings.catch_warnings(record=True):
+ self.df.ix[999]
def time_index_slice(self):
self.mdt.loc[self.idx, :]
class IntervalIndexing:
-
def setup_cache(self):
idx = IntervalIndex.from_breaks(np.arange(1000001))
monotonic = Series(np.arange(1000000), index=idx)
@@ -237,29 +260,30 @@ def time_loc_list(self, monotonic):
class CategoricalIndexIndexing:
- params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
- param_names = ['index']
+ params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]
+ param_names = ["index"]
def setup(self, index):
- N = 10**5
- values = list('a' * N + 'b' * N + 'c' * N)
+ N = 10 ** 5
+ values = list("a" * N + "b" * N + "c" * N)
indices = {
- 'monotonic_incr': CategoricalIndex(values),
- 'monotonic_decr': CategoricalIndex(reversed(values)),
- 'non_monotonic': CategoricalIndex(list('abc' * N))}
+ "monotonic_incr": CategoricalIndex(values),
+ "monotonic_decr": CategoricalIndex(reversed(values)),
+ "non_monotonic": CategoricalIndex(list("abc" * N)),
+ }
self.data = indices[index]
self.int_scalar = 10000
self.int_list = list(range(10000))
- self.cat_scalar = 'b'
- self.cat_list = ['a', 'c']
+ self.cat_scalar = "b"
+ self.cat_list = ["a", "c"]
def time_getitem_scalar(self, index):
self.data[self.int_scalar]
def time_getitem_slice(self, index):
- self.data[:self.int_scalar]
+ self.data[: self.int_scalar]
def time_getitem_list_like(self, index):
self.data[[self.int_scalar]]
@@ -278,7 +302,6 @@ def time_get_indexer_list(self, index):
class MethodLookup:
-
def setup_cache(self):
s = Series()
return s
@@ -287,47 +310,44 @@ def time_lookup_iloc(self, s):
s.iloc
def time_lookup_ix(self, s):
- s.ix
+ with warnings.catch_warnings(record=True):
+ s.ix
def time_lookup_loc(self, s):
s.loc
class GetItemSingleColumn:
-
def setup(self):
- self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A'])
+ self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=["A"])
self.df_int_col = DataFrame(np.random.randn(3000, 1))
def time_frame_getitem_single_column_label(self):
- self.df_string_col['A']
+ self.df_string_col["A"]
def time_frame_getitem_single_column_int(self):
self.df_int_col[0]
class AssignTimeseriesIndex:
-
def setup(self):
N = 100000
- idx = date_range('1/1/2000', periods=N, freq='H')
- self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx)
+ idx = date_range("1/1/2000", periods=N, freq="H")
+ self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx)
def time_frame_assign_timeseries_index(self):
- self.df['date'] = self.df.index
+ self.df["date"] = self.df.index
class InsertColumns:
-
def setup(self):
- self.N = 10**3
+ self.N = 10 ** 3
self.df = DataFrame(index=range(self.N))
def time_insert(self):
np.random.seed(1234)
for i in range(100):
- self.df.insert(0, i, np.random.randn(self.N),
- allow_duplicates=True)
+ self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True)
def time_assign_with_setitem(self):
np.random.seed(1234)
@@ -335,4 +355,20 @@ def time_assign_with_setitem(self):
self.df[i] = np.random.randn(self.N)
+class ChainIndexing:
+
+ params = [None, "warn"]
+ param_names = ["mode"]
+
+ def setup(self, mode):
+ self.N = 1000000
+
+ def time_chained_indexing(self, mode):
+ with warnings.catch_warnings(record=True):
+ with option_context("mode.chained_assignment", mode):
+ df = DataFrame({"A": np.arange(self.N), "B": "foo"})
+ df2 = df[df.A > self.N // 2]
+ df2["C"] = 1.0
+
+
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index 5655701781846..44a22dfa77791 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -5,33 +5,40 @@
def _get_numeric_engines():
engine_names = [
- ('Int64Engine', np.int64), ('Int32Engine', np.int32),
- ('Int16Engine', np.int16), ('Int8Engine', np.int8),
- ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32),
- ('UInt16engine', np.uint16), ('UInt8Engine', np.uint8),
- ('Float64Engine', np.float64), ('Float32Engine', np.float32),
+ ("Int64Engine", np.int64),
+ ("Int32Engine", np.int32),
+ ("Int16Engine", np.int16),
+ ("Int8Engine", np.int8),
+ ("UInt64Engine", np.uint64),
+ ("UInt32Engine", np.uint32),
+        ("UInt16Engine", np.uint16),
+ ("UInt8Engine", np.uint8),
+ ("Float64Engine", np.float64),
+ ("Float32Engine", np.float32),
+ ]
+ return [
+ (getattr(libindex, engine_name), dtype)
+ for engine_name, dtype in engine_names
+ if hasattr(libindex, engine_name)
]
- return [(getattr(libindex, engine_name), dtype)
- for engine_name, dtype in engine_names
- if hasattr(libindex, engine_name)]
class NumericEngineIndexing:
- params = [_get_numeric_engines(),
- ['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
- ]
- param_names = ['engine_and_dtype', 'index_type']
+ params = [
+ _get_numeric_engines(),
+ ["monotonic_incr", "monotonic_decr", "non_monotonic"],
+ ]
+ param_names = ["engine_and_dtype", "index_type"]
def setup(self, engine_and_dtype, index_type):
engine, dtype = engine_and_dtype
- N = 10**5
+ N = 10 ** 5
values = list([1] * N + [2] * N + [3] * N)
arr = {
- 'monotonic_incr': np.array(values, dtype=dtype),
- 'monotonic_decr': np.array(list(reversed(values)),
- dtype=dtype),
- 'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype),
+ "monotonic_incr": np.array(values, dtype=dtype),
+ "monotonic_decr": np.array(list(reversed(values)), dtype=dtype),
+ "non_monotonic": np.array([1, 2, 3] * N, dtype=dtype),
}[index_type]
self.data = engine(lambda: arr, len(arr))
@@ -44,21 +51,21 @@ def time_get_loc(self, engine_and_dtype, index_type):
class ObjectEngineIndexing:
- params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')]
- param_names = ['index_type']
+ params = [("monotonic_incr", "monotonic_decr", "non_monotonic")]
+ param_names = ["index_type"]
def setup(self, index_type):
- N = 10**5
- values = list('a' * N + 'b' * N + 'c' * N)
+ N = 10 ** 5
+ values = list("a" * N + "b" * N + "c" * N)
arr = {
- 'monotonic_incr': np.array(values, dtype=object),
- 'monotonic_decr': np.array(list(reversed(values)), dtype=object),
- 'non_monotonic': np.array(list('abc') * N, dtype=object),
+ "monotonic_incr": np.array(values, dtype=object),
+ "monotonic_decr": np.array(list(reversed(values)), dtype=object),
+ "non_monotonic": np.array(list("abc") * N, dtype=object),
}[index_type]
self.data = libindex.ObjectEngine(lambda: arr, len(arr))
        # code below avoids populating the mapping etc. while timing.
- self.data.get_loc('b')
+ self.data.get_loc("b")
def time_get_loc(self, index_type):
- self.data.get_loc('b')
+ self.data.get_loc("b")
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
index 065c82207d251..66ef4f2aec380 100644
--- a/asv_bench/benchmarks/inference.py
+++ b/asv_bench/benchmarks/inference.py
@@ -8,56 +8,57 @@
class NumericInferOps:
# from GH 7332
params = numeric_dtypes
- param_names = ['dtype']
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 5 * 10**5
- self.df = DataFrame({'A': np.arange(N).astype(dtype),
- 'B': np.arange(N).astype(dtype)})
+ N = 5 * 10 ** 5
+ self.df = DataFrame(
+ {"A": np.arange(N).astype(dtype), "B": np.arange(N).astype(dtype)}
+ )
def time_add(self, dtype):
- self.df['A'] + self.df['B']
+ self.df["A"] + self.df["B"]
def time_subtract(self, dtype):
- self.df['A'] - self.df['B']
+ self.df["A"] - self.df["B"]
def time_multiply(self, dtype):
- self.df['A'] * self.df['B']
+ self.df["A"] * self.df["B"]
def time_divide(self, dtype):
- self.df['A'] / self.df['B']
+ self.df["A"] / self.df["B"]
def time_modulo(self, dtype):
- self.df['A'] % self.df['B']
+ self.df["A"] % self.df["B"]
class DateInferOps:
# from GH 7332
def setup_cache(self):
- N = 5 * 10**5
- df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')})
- df['timedelta'] = df['datetime64'] - df['datetime64']
+ N = 5 * 10 ** 5
+ df = DataFrame({"datetime64": np.arange(N).astype("datetime64[ms]")})
+ df["timedelta"] = df["datetime64"] - df["datetime64"]
return df
def time_subtract_datetimes(self, df):
- df['datetime64'] - df['datetime64']
+ df["datetime64"] - df["datetime64"]
def time_timedelta_plus_datetime(self, df):
- df['timedelta'] + df['datetime64']
+ df["timedelta"] + df["datetime64"]
def time_add_timedeltas(self, df):
- df['timedelta'] + df['timedelta']
+ df["timedelta"] + df["timedelta"]
class ToNumeric:
- params = ['ignore', 'coerce']
- param_names = ['errors']
+ params = ["ignore", "coerce"]
+ param_names = ["errors"]
def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
- self.numstr = self.float.astype('str')
+ self.numstr = self.float.astype("str")
self.str = Series(tm.makeStringIndex(N))
def time_from_float(self, errors):
@@ -72,21 +73,32 @@ def time_from_str(self, errors):
class ToNumericDowncast:
- param_names = ['dtype', 'downcast']
- params = [['string-float', 'string-int', 'string-nint', 'datetime64',
- 'int-list', 'int32'],
- [None, 'integer', 'signed', 'unsigned', 'float']]
+ param_names = ["dtype", "downcast"]
+ params = [
+ [
+ "string-float",
+ "string-int",
+ "string-nint",
+ "datetime64",
+ "int-list",
+ "int32",
+ ],
+ [None, "integer", "signed", "unsigned", "float"],
+ ]
N = 500000
N2 = int(N / 2)
- data_dict = {'string-int': ['1'] * N2 + [2] * N2,
- 'string-nint': ['-1'] * N2 + [2] * N2,
- 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
- dtype='datetime64[D]'), N),
- 'string-float': ['1.1'] * N2 + [2] * N2,
- 'int-list': [1] * N2 + [2] * N2,
- 'int32': np.repeat(np.int32(1), N)}
+ data_dict = {
+ "string-int": ["1"] * N2 + [2] * N2,
+ "string-nint": ["-1"] * N2 + [2] * N2,
+ "datetime64": np.repeat(
+ np.array(["1970-01-01", "1970-01-02"], dtype="datetime64[D]"), N
+ ),
+ "string-float": ["1.1"] * N2 + [2] * N2,
+ "int-list": [1] * N2 + [2] * N2,
+ "int32": np.repeat(np.int32(1), N),
+ }
def setup(self, dtype, downcast):
self.data = self.data_dict[dtype]
@@ -96,10 +108,9 @@ def time_downcast(self, dtype, downcast):
class MaybeConvertNumeric:
-
def setup_cache(self):
- N = 10**6
- arr = np.repeat([2**63], N) + np.arange(N).astype('uint64')
+ N = 10 ** 6
+ arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64")
data = arr.astype(object)
data[1::2] = arr[1::2].astype(str)
data[-1] = -1
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 6beb21883b5ab..4525e504fc4dd 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -4,7 +4,6 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime
-from pandas.io.parsers import _parser_defaults
from io import StringIO
from ..pandas_vb_common import BaseIO
@@ -12,27 +11,31 @@
class ToCSV(BaseIO):
- fname = '__test__.csv'
- params = ['wide', 'long', 'mixed']
- param_names = ['kind']
+ fname = "__test__.csv"
+ params = ["wide", "long", "mixed"]
+ param_names = ["kind"]
def setup(self, kind):
wide_frame = DataFrame(np.random.randn(3000, 30))
- long_frame = DataFrame({'A': np.arange(50000),
- 'B': np.arange(50000) + 1.,
- 'C': np.arange(50000) + 2.,
- 'D': np.arange(50000) + 3.})
- mixed_frame = DataFrame({'float': np.random.randn(5000),
- 'int': np.random.randn(5000).astype(int),
- 'bool': (np.arange(5000) % 2) == 0,
- 'datetime': date_range('2001',
- freq='s',
- periods=5000),
- 'object': ['foo'] * 5000})
- mixed_frame.loc[30:500, 'float'] = np.nan
- data = {'wide': wide_frame,
- 'long': long_frame,
- 'mixed': mixed_frame}
+ long_frame = DataFrame(
+ {
+ "A": np.arange(50000),
+ "B": np.arange(50000) + 1.0,
+ "C": np.arange(50000) + 2.0,
+ "D": np.arange(50000) + 3.0,
+ }
+ )
+ mixed_frame = DataFrame(
+ {
+ "float": np.random.randn(5000),
+ "int": np.random.randn(5000).astype(int),
+ "bool": (np.arange(5000) % 2) == 0,
+ "datetime": date_range("2001", freq="s", periods=5000),
+ "object": ["foo"] * 5000,
+ }
+ )
+ mixed_frame.loc[30:500, "float"] = np.nan
+ data = {"wide": wide_frame, "long": long_frame, "mixed": mixed_frame}
self.df = data[kind]
def time_frame(self, kind):
@@ -41,36 +44,39 @@ def time_frame(self, kind):
class ToCSVDatetime(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
def setup(self):
- rng = date_range('1/1/2000', periods=1000)
+ rng = date_range("1/1/2000", periods=1000)
self.data = DataFrame(rng, index=rng)
def time_frame_date_formatting(self):
- self.data.to_csv(self.fname, date_format='%Y%m%d')
+ self.data.to_csv(self.fname, date_format="%Y%m%d")
class ToCSVDatetimeBig(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
timeout = 1500
params = [1000, 10000, 100000]
- param_names = ['obs']
+ param_names = ["obs"]
def setup(self, obs):
- d = '2018-11-29'
- dt = '2018-11-26 11:18:27.0'
- self.data = DataFrame({'dt': [np.datetime64(dt)] * obs,
- 'd': [np.datetime64(d)] * obs,
- 'r': [np.random.uniform()] * obs})
+ d = "2018-11-29"
+ dt = "2018-11-26 11:18:27.0"
+ self.data = DataFrame(
+ {
+ "dt": [np.datetime64(dt)] * obs,
+ "d": [np.datetime64(d)] * obs,
+ "r": [np.random.uniform()] * obs,
+ }
+ )
def time_frame(self, obs):
self.data.to_csv(self.fname)
class StringIORewind:
-
def data(self, stringio_object):
stringio_object.seek(0)
return stringio_object
@@ -78,68 +84,84 @@ def data(self, stringio_object):
class ReadCSVDInferDatetimeFormat(StringIORewind):
- params = ([True, False], ['custom', 'iso8601', 'ymd'])
- param_names = ['infer_datetime_format', 'format']
+ params = ([True, False], ["custom", "iso8601", "ymd"])
+ param_names = ["infer_datetime_format", "format"]
def setup(self, infer_datetime_format, format):
- rng = date_range('1/1/2000', periods=1000)
- formats = {'custom': '%m/%d/%Y %H:%M:%S.%f',
- 'iso8601': '%Y-%m-%d %H:%M:%S',
- 'ymd': '%Y%m%d'}
+ rng = date_range("1/1/2000", periods=1000)
+ formats = {
+ "custom": "%m/%d/%Y %H:%M:%S.%f",
+ "iso8601": "%Y-%m-%d %H:%M:%S",
+ "ymd": "%Y%m%d",
+ }
dt_format = formats[format]
- self.StringIO_input = StringIO('\n'.join(
- rng.strftime(dt_format).tolist()))
+ self.StringIO_input = StringIO("\n".join(rng.strftime(dt_format).tolist()))
def time_read_csv(self, infer_datetime_format, format):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo'], parse_dates=['foo'],
- infer_datetime_format=infer_datetime_format)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo"],
+ parse_dates=["foo"],
+ infer_datetime_format=infer_datetime_format,
+ )
class ReadCSVConcatDatetime(StringIORewind):
- iso8601 = '%Y-%m-%d %H:%M:%S'
+ iso8601 = "%Y-%m-%d %H:%M:%S"
def setup(self):
- rng = date_range('1/1/2000', periods=50000, freq='S')
- self.StringIO_input = StringIO('\n'.join(
- rng.strftime(self.iso8601).tolist()))
+ rng = date_range("1/1/2000", periods=50000, freq="S")
+ self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist()))
def time_read_csv(self):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo'], parse_dates=['foo'],
- infer_datetime_format=False)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo"],
+ parse_dates=["foo"],
+ infer_datetime_format=False,
+ )
class ReadCSVConcatDatetimeBadDateValue(StringIORewind):
- params = (['nan', '0', ''],)
- param_names = ['bad_date_value']
+ params = (["nan", "0", ""],)
+ param_names = ["bad_date_value"]
def setup(self, bad_date_value):
- self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000)
+ self.StringIO_input = StringIO(("%s,\n" % bad_date_value) * 50000)
def time_read_csv(self, bad_date_value):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo', 'bar'], parse_dates=['foo'],
- infer_datetime_format=False)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo", "bar"],
+ parse_dates=["foo"],
+ infer_datetime_format=False,
+ )
class ReadCSVSkipRows(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
params = [None, 10000]
- param_names = ['skiprows']
+ param_names = ["skiprows"]
def setup(self, skiprows):
N = 20000
index = tm.makeStringIndex(N)
- df = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N),
- 'string1': ['foo'] * N,
- 'bool1': [True] * N,
- 'int1': np.random.randint(0, N, size=N)},
- index=index)
+ df = DataFrame(
+ {
+ "float1": np.random.randn(N),
+ "float2": np.random.randn(N),
+ "string1": ["foo"] * N,
+ "bool1": [True] * N,
+ "int1": np.random.randint(0, N, size=N),
+ },
+ index=index,
+ )
df.to_csv(self.fname)
def time_skipprows(self, skiprows):
@@ -147,31 +169,31 @@ def time_skipprows(self, skiprows):
class ReadUint64Integers(StringIORewind):
-
def setup(self):
- self.na_values = [2**63 + 500]
- arr = np.arange(10000).astype('uint64') + 2**63
- self.data1 = StringIO('\n'.join(arr.astype(str).tolist()))
+ self.na_values = [2 ** 63 + 500]
+ arr = np.arange(10000).astype("uint64") + 2 ** 63
+ self.data1 = StringIO("\n".join(arr.astype(str).tolist()))
arr = arr.astype(object)
arr[500] = -1
- self.data2 = StringIO('\n'.join(arr.astype(str).tolist()))
+ self.data2 = StringIO("\n".join(arr.astype(str).tolist()))
def time_read_uint64(self):
- read_csv(self.data(self.data1), header=None, names=['foo'])
+ read_csv(self.data(self.data1), header=None, names=["foo"])
def time_read_uint64_neg_values(self):
- read_csv(self.data(self.data2), header=None, names=['foo'])
+ read_csv(self.data(self.data2), header=None, names=["foo"])
def time_read_uint64_na_values(self):
- read_csv(self.data(self.data1), header=None, names=['foo'],
- na_values=self.na_values)
+ read_csv(
+ self.data(self.data1), header=None, names=["foo"], na_values=self.na_values
+ )
class ReadCSVThousands(BaseIO):
- fname = '__test__.csv'
- params = ([',', '|'], [None, ','])
- param_names = ['sep', 'thousands']
+ fname = "__test__.csv"
+ params = ([",", "|"], [None, ","])
+ param_names = ["sep", "thousands"]
def setup(self, sep, thousands):
N = 10000
@@ -179,8 +201,8 @@ def setup(self, sep, thousands):
data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
df = DataFrame(data)
if thousands is not None:
- fmt = ':{}'.format(thousands)
- fmt = '{' + fmt + '}'
+ fmt = ":{}".format(thousands)
+ fmt = "{" + fmt + "}"
df = df.applymap(lambda x: fmt.format(x))
df.to_csv(self.fname, sep=sep)
@@ -189,57 +211,68 @@ def time_thousands(self, sep, thousands):
class ReadCSVComment(StringIORewind):
-
def setup(self):
- data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
- self.StringIO_input = StringIO('\n'.join(data))
+ data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
+ self.StringIO_input = StringIO("\n".join(data))
def time_comment(self):
- read_csv(self.data(self.StringIO_input), comment='#',
- header=None, names=list('abc'))
+ read_csv(
+ self.data(self.StringIO_input), comment="#", header=None, names=list("abc")
+ )
class ReadCSVFloatPrecision(StringIORewind):
- params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
- param_names = ['sep', 'decimal', 'float_precision']
+ params = ([",", ";"], [".", "_"], [None, "high", "round_trip"])
+ param_names = ["sep", "decimal", "float_precision"]
def setup(self, sep, decimal, float_precision):
- floats = [''.join(random.choice(string.digits) for _ in range(28))
- for _ in range(15)]
- rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
+ floats = [
+ "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15)
+ ]
+ rows = sep.join(["0{}".format(decimal) + "{}"] * 3) + "\n"
data = rows * 5
data = data.format(*floats) * 200 # 1000 x 3 strings csv
self.StringIO_input = StringIO(data)
def time_read_csv(self, sep, decimal, float_precision):
- read_csv(self.data(self.StringIO_input), sep=sep, header=None,
- names=list('abc'), float_precision=float_precision)
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=sep,
+ header=None,
+ names=list("abc"),
+ float_precision=float_precision,
+ )
def time_read_csv_python_engine(self, sep, decimal, float_precision):
- read_csv(self.data(self.StringIO_input), sep=sep, header=None,
- engine='python', float_precision=None, names=list('abc'))
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=sep,
+ header=None,
+ engine="python",
+ float_precision=None,
+ names=list("abc"),
+ )
class ReadCSVCategorical(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
def setup(self):
N = 100000
- group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
- df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc'))
+ group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
+ df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc"))
df.to_csv(self.fname, index=False)
def time_convert_post(self):
read_csv(self.fname).apply(Categorical)
def time_convert_direct(self):
- read_csv(self.fname, dtype='category')
+ read_csv(self.fname, dtype="category")
class ReadCSVParseDates(StringIORewind):
-
def setup(self):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
@@ -247,38 +280,50 @@ def setup(self):
{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n
{},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n
"""
- two_cols = ['KORD,19990127'] * 5
+ two_cols = ["KORD,19990127"] * 5
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)
def time_multiple_date(self):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=list(string.digits[:9]),
- parse_dates=[[1, 2], [1, 3]])
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=list(string.digits[:9]),
+ parse_dates=[[1, 2], [1, 3]],
+ )
def time_baseline(self):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- parse_dates=[1],
- names=list(string.digits[:9]))
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ parse_dates=[1],
+ names=list(string.digits[:9]),
+ )
class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
- param_names = ['do_cache']
+ param_names = ["do_cache"]
def setup(self, do_cache):
- data = ('\n'.join('10/{}'.format(year)
- for year in range(2000, 2100)) + '\n') * 10
+ data = (
+ "\n".join("10/{}".format(year) for year in range(2000, 2100)) + "\n"
+ ) * 10
self.StringIO_input = StringIO(data)
def time_read_csv_cached(self, do_cache):
- # kwds setting here is used to avoid breaking tests in
- # previous version of pandas, because this is api changes
- kwds = {}
- if 'cache_dates' in _parser_defaults:
- kwds['cache_dates'] = do_cache
- read_csv(self.data(self.StringIO_input), header=None,
- parse_dates=[0], **kwds)
+ try:
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ parse_dates=[0],
+ cache_dates=do_cache,
+ )
+ except TypeError:
+ # cache_dates is a new keyword in 0.25
+ pass
class ReadCSVMemoryGrowth(BaseIO):
@@ -301,12 +346,12 @@ def mem_parser_chunks(self):
class ReadCSVParseSpecialDate(StringIORewind):
- params = (['mY', 'mdY', 'hm'],)
- param_names = ['value']
+ params = (["mY", "mdY", "hm"],)
+ param_names = ["value"]
objects = {
- 'mY': '01-2019\n10-2019\n02/2000\n',
- 'mdY': '12/02/2010\n',
- 'hm': '21:34\n'
+ "mY": "01-2019\n10-2019\n02/2000\n",
+ "mdY": "12/02/2010\n",
+ "hm": "21:34\n",
}
def setup(self, value):
@@ -315,33 +360,50 @@ def setup(self, value):
self.StringIO_input = StringIO(data)
def time_read_special_date(self, value):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=['Date'], parse_dates=['Date'])
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=["Date"],
+ parse_dates=["Date"],
+ )
class ParseDateComparison(StringIORewind):
params = ([False, True],)
- param_names = ['cache_dates']
+ param_names = ["cache_dates"]
def setup(self, cache_dates):
count_elem = 10000
- data = '12-02-2010\n' * count_elem
+ data = "12-02-2010\n" * count_elem
self.StringIO_input = StringIO(data)
def time_read_csv_dayfirst(self, cache_dates):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
- dayfirst=True)
+ try:
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=["Date"],
+ parse_dates=["Date"],
+ cache_dates=cache_dates,
+ dayfirst=True,
+ )
+ except TypeError:
+ # cache_dates is a new keyword in 0.25
+ pass
def time_to_datetime_dayfirst(self, cache_dates):
- df = read_csv(self.data(self.StringIO_input),
- dtype={'date': str}, names=['date'])
- to_datetime(df['date'], cache=cache_dates, dayfirst=True)
+ df = read_csv(
+ self.data(self.StringIO_input), dtype={"date": str}, names=["date"]
+ )
+ to_datetime(df["date"], cache=cache_dates, dayfirst=True)
def time_to_datetime_format_DD_MM_YYYY(self, cache_dates):
- df = read_csv(self.data(self.StringIO_input),
- dtype={'date': str}, names=['date'])
- to_datetime(df['date'], cache=cache_dates, format='%d-%m-%Y')
+ df = read_csv(
+ self.data(self.StringIO_input), dtype={"date": str}, names=["date"]
+ )
+ to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y")
from ..pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
index 1decb83f2f723..12e70f84e5203 100644
--- a/asv_bench/benchmarks/io/excel.py
+++ b/asv_bench/benchmarks/io/excel.py
@@ -6,19 +6,21 @@
class Excel:
- params = ['openpyxl', 'xlsxwriter', 'xlwt']
- param_names = ['engine']
+ params = ["openpyxl", "xlsxwriter", "xlwt"]
+ param_names = ["engine"]
def setup(self, engine):
N = 2000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
self.bio_read = BytesIO()
self.writer_read = ExcelWriter(self.bio_read, engine=engine)
- self.df.to_excel(self.writer_read, sheet_name='Sheet1')
+ self.df.to_excel(self.writer_read, sheet_name="Sheet1")
self.writer_read.save()
self.bio_read.seek(0)
@@ -29,7 +31,7 @@ def time_write_excel(self, engine):
bio_write = BytesIO()
bio_write.seek(0)
writer_write = ExcelWriter(bio_write, engine=engine)
- self.df.to_excel(writer_write, sheet_name='Sheet1')
+ self.df.to_excel(writer_write, sheet_name="Sheet1")
writer_write.save()
diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py
index a5dc28eb9508c..2874a7889156b 100644
--- a/asv_bench/benchmarks/io/hdf.py
+++ b/asv_bench/benchmarks/io/hdf.py
@@ -6,86 +6,92 @@
class HDFStoreDataFrame(BaseIO):
-
def setup(self):
N = 25000
index = tm.makeStringIndex(N)
- self.df = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N)},
- index=index)
- self.df_mixed = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N),
- 'string1': ['foo'] * N,
- 'bool1': [True] * N,
- 'int1': np.random.randint(0, N, size=N)},
- index=index)
+ self.df = DataFrame(
+ {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index
+ )
+ self.df_mixed = DataFrame(
+ {
+ "float1": np.random.randn(N),
+ "float2": np.random.randn(N),
+ "string1": ["foo"] * N,
+ "bool1": [True] * N,
+ "int1": np.random.randint(0, N, size=N),
+ },
+ index=index,
+ )
self.df_wide = DataFrame(np.random.randn(N, 100))
self.start_wide = self.df_wide.index[10000]
self.stop_wide = self.df_wide.index[15000]
- self.df2 = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N)},
- index=date_range('1/1/2000', periods=N))
+ self.df2 = DataFrame(
+ {"float1": np.random.randn(N), "float2": np.random.randn(N)},
+ index=date_range("1/1/2000", periods=N),
+ )
self.start = self.df2.index[10000]
self.stop = self.df2.index[15000]
- self.df_wide2 = DataFrame(np.random.randn(N, 100),
- index=date_range('1/1/2000', periods=N))
- self.df_dc = DataFrame(np.random.randn(N, 10),
- columns=['C%03d' % i for i in range(10)])
+ self.df_wide2 = DataFrame(
+ np.random.randn(N, 100), index=date_range("1/1/2000", periods=N)
+ )
+ self.df_dc = DataFrame(
+ np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)]
+ )
- self.fname = '__test__.h5'
+ self.fname = "__test__.h5"
self.store = HDFStore(self.fname)
- self.store.put('fixed', self.df)
- self.store.put('fixed_mixed', self.df_mixed)
- self.store.append('table', self.df2)
- self.store.append('table_mixed', self.df_mixed)
- self.store.append('table_wide', self.df_wide)
- self.store.append('table_wide2', self.df_wide2)
+ self.store.put("fixed", self.df)
+ self.store.put("fixed_mixed", self.df_mixed)
+ self.store.append("table", self.df2)
+ self.store.append("table_mixed", self.df_mixed)
+ self.store.append("table_wide", self.df_wide)
+ self.store.append("table_wide2", self.df_wide2)
def teardown(self):
self.store.close()
self.remove(self.fname)
def time_read_store(self):
- self.store.get('fixed')
+ self.store.get("fixed")
def time_read_store_mixed(self):
- self.store.get('fixed_mixed')
+ self.store.get("fixed_mixed")
def time_write_store(self):
- self.store.put('fixed_write', self.df)
+ self.store.put("fixed_write", self.df)
def time_write_store_mixed(self):
- self.store.put('fixed_mixed_write', self.df_mixed)
+ self.store.put("fixed_mixed_write", self.df_mixed)
def time_read_store_table_mixed(self):
- self.store.select('table_mixed')
+ self.store.select("table_mixed")
def time_write_store_table_mixed(self):
- self.store.append('table_mixed_write', self.df_mixed)
+ self.store.append("table_mixed_write", self.df_mixed)
def time_read_store_table(self):
- self.store.select('table')
+ self.store.select("table")
def time_write_store_table(self):
- self.store.append('table_write', self.df)
+ self.store.append("table_write", self.df)
def time_read_store_table_wide(self):
- self.store.select('table_wide')
+ self.store.select("table_wide")
def time_write_store_table_wide(self):
- self.store.append('table_wide_write', self.df_wide)
+ self.store.append("table_wide_write", self.df_wide)
def time_write_store_table_dc(self):
- self.store.append('table_dc_write', self.df_dc, data_columns=True)
+ self.store.append("table_dc_write", self.df_dc, data_columns=True)
def time_query_store_table_wide(self):
- self.store.select('table_wide', where="index > self.start_wide and "
- "index < self.stop_wide")
+ self.store.select(
+ "table_wide", where="index > self.start_wide and " "index < self.stop_wide"
+ )
def time_query_store_table(self):
- self.store.select('table', where="index > self.start and "
- "index < self.stop")
+ self.store.select("table", where="index > self.start and " "index < self.stop")
def time_store_repr(self):
repr(self.store)
@@ -99,24 +105,26 @@ def time_store_info(self):
class HDF(BaseIO):
- params = ['table', 'fixed']
- param_names = ['format']
+ params = ["table", "fixed"]
+ param_names = ["format"]
def setup(self, format):
- self.fname = '__test__.h5'
+ self.fname = "__test__.h5"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
- self.df.to_hdf(self.fname, 'df', format=format)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
+ self.df.to_hdf(self.fname, "df", format=format)
def time_read_hdf(self, format):
- read_hdf(self.fname, 'df')
+ read_hdf(self.fname, "df")
def time_write_hdf(self, format):
- self.df.to_hdf(self.fname, 'df', format=format)
+ self.df.to_hdf(self.fname, "df", format=format)
from ..pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index 19d11e6610198..fc07f2a484102 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -8,16 +8,20 @@
class ReadJSON(BaseIO):
fname = "__test__.json"
- params = (['split', 'index', 'records'], ['int', 'datetime'])
- param_names = ['orient', 'index']
+ params = (["split", "index", "records"], ["int", "datetime"])
+ param_names = ["orient", "index"]
def setup(self, orient, index):
N = 100000
- indexes = {'int': np.arange(N),
- 'datetime': date_range('20000101', periods=N, freq='H')}
- df = DataFrame(np.random.randn(N, 5),
- columns=['float_{}'.format(i) for i in range(5)],
- index=indexes[index])
+ indexes = {
+ "int": np.arange(N),
+ "datetime": date_range("20000101", periods=N, freq="H"),
+ }
+ df = DataFrame(
+ np.random.randn(N, 5),
+ columns=["float_{}".format(i) for i in range(5)],
+ index=indexes[index],
+ )
df.to_json(self.fname, orient=orient)
def time_read_json(self, orient, index):
@@ -27,121 +31,185 @@ def time_read_json(self, orient, index):
class ReadJSONLines(BaseIO):
fname = "__test_lines__.json"
- params = ['int', 'datetime']
- param_names = ['index']
+ params = ["int", "datetime"]
+ param_names = ["index"]
def setup(self, index):
N = 100000
- indexes = {'int': np.arange(N),
- 'datetime': date_range('20000101', periods=N, freq='H')}
- df = DataFrame(np.random.randn(N, 5),
- columns=['float_{}'.format(i) for i in range(5)],
- index=indexes[index])
- df.to_json(self.fname, orient='records', lines=True)
+ indexes = {
+ "int": np.arange(N),
+ "datetime": date_range("20000101", periods=N, freq="H"),
+ }
+ df = DataFrame(
+ np.random.randn(N, 5),
+ columns=["float_{}".format(i) for i in range(5)],
+ index=indexes[index],
+ )
+ df.to_json(self.fname, orient="records", lines=True)
def time_read_json_lines(self, index):
- read_json(self.fname, orient='records', lines=True)
+ read_json(self.fname, orient="records", lines=True)
def time_read_json_lines_concat(self, index):
- concat(read_json(self.fname, orient='records', lines=True,
- chunksize=25000))
+ concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
def peakmem_read_json_lines(self, index):
- read_json(self.fname, orient='records', lines=True)
+ read_json(self.fname, orient="records", lines=True)
def peakmem_read_json_lines_concat(self, index):
- concat(read_json(self.fname, orient='records', lines=True,
- chunksize=25000))
+ concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
class ToJSON(BaseIO):
fname = "__test__.json"
- params = ['split', 'columns', 'index']
- param_names = ['orient']
+ params = [
+ ["split", "columns", "index", "values", "records"],
+ ["df", "df_date_idx", "df_td_int_ts", "df_int_floats", "df_int_float_str"],
+ ]
+ param_names = ["orient", "frame"]
+
+ def setup(self, orient, frame):
+ N = 10 ** 5
+ ncols = 5
+ index = date_range("20000101", periods=N, freq="H")
+ timedeltas = timedelta_range(start=1, periods=N, freq="s")
+ datetimes = date_range(start=1, periods=N, freq="s")
+ ints = np.random.randint(100000000, size=N)
+ floats = np.random.randn(N)
+ strings = tm.makeStringIndex(N)
+ self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
+ self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
+ self.df_td_int_ts = DataFrame(
+ {
+ "td_1": timedeltas,
+ "td_2": timedeltas,
+ "int_1": ints,
+ "int_2": ints,
+ "ts_1": datetimes,
+ "ts_2": datetimes,
+ },
+ index=index,
+ )
+ self.df_int_floats = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "int_3": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "float_3": floats,
+ },
+ index=index,
+ )
+ self.df_int_float_str = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "str_1": strings,
+ "str_2": strings,
+ },
+ index=index,
+ )
+
+ def time_to_json(self, orient, frame):
+ getattr(self, frame).to_json(self.fname, orient=orient)
+
+ def mem_to_json(self, orient, frame):
+ getattr(self, frame).to_json(self.fname, orient=orient)
+
+ def time_to_json_wide(self, orient, frame):
+ base_df = getattr(self, frame).copy()
+ df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
+ df.to_json(self.fname, orient=orient)
- def setup(self, lines_orient):
- N = 10**5
+ def mem_to_json_wide(self, orient, frame):
+ base_df = getattr(self, frame).copy()
+ df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
+ df.to_json(self.fname, orient=orient)
+
+
+class ToJSONLines(BaseIO):
+
+ fname = "__test__.json"
+
+ def setup(self):
+ N = 10 ** 5
ncols = 5
- index = date_range('20000101', periods=N, freq='H')
- timedeltas = timedelta_range(start=1, periods=N, freq='s')
- datetimes = date_range(start=1, periods=N, freq='s')
+ index = date_range("20000101", periods=N, freq="H")
+ timedeltas = timedelta_range(start=1, periods=N, freq="s")
+ datetimes = date_range(start=1, periods=N, freq="s")
ints = np.random.randint(100000000, size=N)
floats = np.random.randn(N)
strings = tm.makeStringIndex(N)
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
- self.df_td_int_ts = DataFrame({'td_1': timedeltas,
- 'td_2': timedeltas,
- 'int_1': ints,
- 'int_2': ints,
- 'ts_1': datetimes,
- 'ts_2': datetimes},
- index=index)
- self.df_int_floats = DataFrame({'int_1': ints,
- 'int_2': ints,
- 'int_3': ints,
- 'float_1': floats,
- 'float_2': floats,
- 'float_3': floats},
- index=index)
- self.df_int_float_str = DataFrame({'int_1': ints,
- 'int_2': ints,
- 'float_1': floats,
- 'float_2': floats,
- 'str_1': strings,
- 'str_2': strings},
- index=index)
-
- def time_floats_with_int_index(self, orient):
- self.df.to_json(self.fname, orient=orient)
-
- def time_floats_with_dt_index(self, orient):
- self.df_date_idx.to_json(self.fname, orient=orient)
-
- def time_delta_int_tstamp(self, orient):
- self.df_td_int_ts.to_json(self.fname, orient=orient)
-
- def time_float_int(self, orient):
- self.df_int_floats.to_json(self.fname, orient=orient)
-
- def time_float_int_str(self, orient):
- self.df_int_float_str.to_json(self.fname, orient=orient)
-
- def time_floats_with_int_idex_lines(self, orient):
- self.df.to_json(self.fname, orient='records', lines=True)
-
- def time_floats_with_dt_index_lines(self, orient):
- self.df_date_idx.to_json(self.fname, orient='records', lines=True)
-
- def time_delta_int_tstamp_lines(self, orient):
- self.df_td_int_ts.to_json(self.fname, orient='records', lines=True)
-
- def time_float_int_lines(self, orient):
- self.df_int_floats.to_json(self.fname, orient='records', lines=True)
-
- def time_float_int_str_lines(self, orient):
- self.df_int_float_str.to_json(self.fname, orient='records', lines=True)
+ self.df_td_int_ts = DataFrame(
+ {
+ "td_1": timedeltas,
+ "td_2": timedeltas,
+ "int_1": ints,
+ "int_2": ints,
+ "ts_1": datetimes,
+ "ts_2": datetimes,
+ },
+ index=index,
+ )
+ self.df_int_floats = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "int_3": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "float_3": floats,
+ },
+ index=index,
+ )
+ self.df_int_float_str = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "str_1": strings,
+ "str_2": strings,
+ },
+ index=index,
+ )
+
+ def time_floats_with_int_idex_lines(self):
+ self.df.to_json(self.fname, orient="records", lines=True)
+
+ def time_floats_with_dt_index_lines(self):
+ self.df_date_idx.to_json(self.fname, orient="records", lines=True)
+
+ def time_delta_int_tstamp_lines(self):
+ self.df_td_int_ts.to_json(self.fname, orient="records", lines=True)
+
+ def time_float_int_lines(self):
+ self.df_int_floats.to_json(self.fname, orient="records", lines=True)
+
+ def time_float_int_str_lines(self):
+ self.df_int_float_str.to_json(self.fname, orient="records", lines=True)
class ToJSONMem:
-
def setup_cache(self):
df = DataFrame([[1]])
- frames = {
- 'int': df,
- 'float': df.astype(float),
- }
+ frames = {"int": df, "float": df.astype(float)}
return frames
def peakmem_int(self, frames):
- df = frames['int']
+ df = frames["int"]
for _ in range(100_000):
df.to_json()
def peakmem_float(self, frames):
- df = frames['float']
+ df = frames["float"]
for _ in range(100_000):
df.to_json()
diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py
index dc2642d920fd0..d97b4ae13f0bd 100644
--- a/asv_bench/benchmarks/io/msgpack.py
+++ b/asv_bench/benchmarks/io/msgpack.py
@@ -1,3 +1,4 @@
+import warnings
import numpy as np
from pandas import DataFrame, date_range, read_msgpack
import pandas.util.testing as tm
@@ -6,16 +7,18 @@
class MSGPack(BaseIO):
-
def setup(self):
- self.fname = '__test__.msg'
+ self.fname = "__test__.msg"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
- self.df.to_msgpack(self.fname)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
+ with warnings.catch_warnings(record=True):
+ self.df.to_msgpack(self.fname)
def time_read_msgpack(self):
read_msgpack(self.fname)
diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py
index edba0358c821a..c5e099bd44eac 100644
--- a/asv_bench/benchmarks/io/parsers.py
+++ b/asv_bench/benchmarks/io/parsers.py
@@ -2,16 +2,18 @@
try:
from pandas._libs.tslibs.parsing import (
- _concat_date_cols, _does_string_look_like_datetime)
+ _concat_date_cols,
+ _does_string_look_like_datetime,
+ )
except ImportError:
# Avoid whole benchmark suite import failure on asv (currently 0.4)
pass
-class DoesStringLookLikeDatetime(object):
+class DoesStringLookLikeDatetime:
- params = (['2Q2005', '0.0', '10000'],)
- param_names = ['value']
+ params = (["2Q2005", "0.0", "10000"],)
+ param_names = ["value"]
def setup(self, value):
self.objects = [value] * 1000000
@@ -21,18 +23,20 @@ def time_check_datetimes(self, value):
_does_string_look_like_datetime(obj)
-class ConcatDateCols(object):
+class ConcatDateCols:
- params = ([1234567890, 'AAAA'], [1, 2])
- param_names = ['value', 'dim']
+ params = ([1234567890, "AAAA"], [1, 2])
+ param_names = ["value", "dim"]
def setup(self, value, dim):
count_elem = 10000
if dim == 1:
self.object = (np.array([value] * count_elem),)
if dim == 2:
- self.object = (np.array([value] * count_elem),
- np.array([value] * count_elem))
+ self.object = (
+ np.array([value] * count_elem),
+ np.array([value] * count_elem),
+ )
def time_check_concat(self, value, dim):
_concat_date_cols(self.object)
diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py
index 74a58bbb946aa..286ac767c02e7 100644
--- a/asv_bench/benchmarks/io/pickle.py
+++ b/asv_bench/benchmarks/io/pickle.py
@@ -6,15 +6,16 @@
class Pickle(BaseIO):
-
def setup(self):
- self.fname = '__test__.pkl'
+ self.fname = "__test__.pkl"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
self.df.to_pickle(self.fname)
def time_read_pickle(self):
diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py
index 8181f1d41ac70..7ce8ef8c12639 100644
--- a/asv_bench/benchmarks/io/sas.py
+++ b/asv_bench/benchmarks/io/sas.py
@@ -5,15 +5,25 @@
class SAS:
- params = ['sas7bdat', 'xport']
- param_names = ['format']
+ params = ["sas7bdat", "xport"]
+ param_names = ["format"]
def setup(self, format):
        # Read files that are located in 'pandas/tests/io/sas/data'
- files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'}
+ files = {"sas7bdat": "test1.sas7bdat", "xport": "paxraw_d_short.xpt"}
file = files[format]
- paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas',
- 'tests', 'io', 'sas', 'data', file]
+ paths = [
+ os.path.dirname(__file__),
+ "..",
+ "..",
+ "..",
+ "pandas",
+ "tests",
+ "io",
+ "sas",
+ "data",
+ file,
+ ]
self.f = os.path.join(*paths)
def time_read_msgpack(self, format):
diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py
index ee48f3bd0a3ab..b80872b17a9e4 100644
--- a/asv_bench/benchmarks/io/sql.py
+++ b/asv_bench/benchmarks/io/sql.py
@@ -8,31 +8,35 @@
class SQL:
- params = ['sqlalchemy', 'sqlite']
- param_names = ['connection']
+ params = ["sqlalchemy", "sqlite"]
+ param_names = ["connection"]
def setup(self, connection):
N = 10000
- con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
- 'sqlite': sqlite3.connect(':memory:')}
- self.table_name = 'test_type'
- self.query_all = 'SELECT * FROM {}'.format(self.table_name)
+ con = {
+ "sqlalchemy": create_engine("sqlite:///:memory:"),
+ "sqlite": sqlite3.connect(":memory:"),
+ }
+ self.table_name = "test_type"
+ self.query_all = "SELECT * FROM {}".format(self.table_name)
self.con = con[connection]
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_to_sql_dataframe(self, connection):
- self.df.to_sql('test1', self.con, if_exists='replace')
+ self.df.to_sql("test1", self.con, if_exists="replace")
def time_read_sql_query(self, connection):
read_sql_query(self.query_all, self.con)
@@ -40,85 +44,98 @@ def time_read_sql_query(self, connection):
class WriteSQLDtypes:
- params = (['sqlalchemy', 'sqlite'],
- ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'])
- param_names = ['connection', 'dtype']
+ params = (
+ ["sqlalchemy", "sqlite"],
+ ["float", "float_with_nan", "string", "bool", "int", "datetime"],
+ )
+ param_names = ["connection", "dtype"]
def setup(self, connection, dtype):
N = 10000
- con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
- 'sqlite': sqlite3.connect(':memory:')}
- self.table_name = 'test_type'
- self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name)
+ con = {
+ "sqlalchemy": create_engine("sqlite:///:memory:"),
+ "sqlite": sqlite3.connect(":memory:"),
+ }
+ self.table_name = "test_type"
+ self.query_col = "SELECT {} FROM {}".format(dtype, self.table_name)
self.con = con[connection]
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_to_sql_dataframe_column(self, connection, dtype):
- self.df[[dtype]].to_sql('test1', self.con, if_exists='replace')
+ self.df[[dtype]].to_sql("test1", self.con, if_exists="replace")
def time_read_sql_query_select_column(self, connection, dtype):
read_sql_query(self.query_col, self.con)
class ReadSQLTable:
-
def setup(self):
N = 10000
- self.table_name = 'test'
- self.con = create_engine('sqlite:///:memory:')
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.table_name = "test"
+ self.con = create_engine("sqlite:///:memory:")
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_read_sql_table_all(self):
read_sql_table(self.table_name, self.con)
def time_read_sql_table_parse_dates(self):
- read_sql_table(self.table_name, self.con, columns=['datetime_string'],
- parse_dates=['datetime_string'])
+ read_sql_table(
+ self.table_name,
+ self.con,
+ columns=["datetime_string"],
+ parse_dates=["datetime_string"],
+ )
class ReadSQLTableDtypes:
- params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']
- param_names = ['dtype']
+ params = ["float", "float_with_nan", "string", "bool", "int", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
N = 10000
- self.table_name = 'test'
- self.con = create_engine('sqlite:///:memory:')
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.table_name = "test"
+ self.con = create_engine("sqlite:///:memory:")
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_read_sql_table_column(self, dtype):
read_sql_table(self.table_name, self.con, columns=[dtype])
diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py
index fff10cf10a4d3..b3ed71af47dc8 100644
--- a/asv_bench/benchmarks/io/stata.py
+++ b/asv_bench/benchmarks/io/stata.py
@@ -7,26 +7,30 @@
class Stata(BaseIO):
- params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty']
- param_names = ['convert_dates']
+ params = ["tc", "td", "tm", "tw", "th", "tq", "ty"]
+ param_names = ["convert_dates"]
def setup(self, convert_dates):
- self.fname = '__test__.dta'
+ self.fname = "__test__.dta"
N = self.N = 100000
C = self.C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(self.N)
- self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min,
- np.iinfo(np.int8).max - 27, N)
- self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max - 27, N)
- self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min,
- np.iinfo(np.int32).max - 27, N)
- self.df['float32_'] = np.array(np.random.randn(N),
- dtype=np.float32)
- self.convert_dates = {'index': convert_dates}
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(self.N)
+ self.df["int8_"] = np.random.randint(
+ np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N
+ )
+ self.df["int16_"] = np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max - 27, N
+ )
+ self.df["int32_"] = np.random.randint(
+ np.iinfo(np.int32).min, np.iinfo(np.int32).max - 27, N
+ )
+ self.df["float32_"] = np.array(np.random.randn(N), dtype=np.float32)
+ self.convert_dates = {"index": convert_dates}
self.df.to_stata(self.fname, self.convert_dates)
def time_read_stata(self, convert_dates):
@@ -42,7 +46,7 @@ def setup(self, convert_dates):
for i in range(10):
missing_data = np.random.randn(self.N)
missing_data[missing_data < 0] = np.nan
- self.df['missing_{0}'.format(i)] = missing_data
+ self.df["missing_{0}".format(i)] = missing_data
self.df.to_stata(self.fname, self.convert_dates)
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index bbaba9909966e..7c899e3dc6ac8 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -2,8 +2,7 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (DataFrame, Series, MultiIndex,
- date_range, concat, merge, merge_asof)
+from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof
try:
from pandas import merge_ordered
@@ -12,16 +11,14 @@
class Append:
-
def setup(self):
- self.df1 = DataFrame(np.random.randn(10000, 4),
- columns=['A', 'B', 'C', 'D'])
+ self.df1 = DataFrame(np.random.randn(10000, 4), columns=["A", "B", "C", "D"])
self.df2 = self.df1.copy()
self.df2.index = np.arange(10000, 20000)
self.mdf1 = self.df1.copy()
- self.mdf1['obj1'] = 'bar'
- self.mdf1['obj2'] = 'bar'
- self.mdf1['int1'] = 5
+ self.mdf1["obj1"] = "bar"
+ self.mdf1["obj2"] = "bar"
+ self.mdf1["int1"] = 5
self.mdf1 = self.mdf1._consolidate()
self.mdf2 = self.mdf1.copy()
self.mdf2.index = self.df2.index
@@ -36,15 +33,16 @@ def time_append_mixed(self):
class Concat:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
N = 1000
s = Series(N, index=tm.makeStringIndex(N))
- self.series = [s[i:- i] for i in range(1, 10)] * 50
+ self.series = [s[i:-i] for i in range(1, 10)] * 50
self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000
- df = DataFrame({'A': range(N)},
- index=date_range('20130101', periods=N, freq='s'))
+ df = DataFrame(
+ {"A": range(N)}, index=date_range("20130101", periods=N, freq="s")
+ )
self.empty_left = [DataFrame(), df]
self.empty_right = [df, DataFrame()]
self.mixed_ndims = [df, df.head(N // 2)]
@@ -68,14 +66,12 @@ def time_concat_mixed_ndims(self, axis):
class ConcatDataFrames:
params = ([0, 1], [True, False])
- param_names = ['axis', 'ignore_index']
+ param_names = ["axis", "ignore_index"]
def setup(self, axis, ignore_index):
- frame_c = DataFrame(np.zeros((10000, 200),
- dtype=np.float32, order='C'))
+ frame_c = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="C"))
self.frame_c = [frame_c] * 20
- frame_f = DataFrame(np.zeros((10000, 200),
- dtype=np.float32, order='F'))
+ frame_f = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="F"))
self.frame_f = [frame_f] * 20
def time_c_ordered(self, axis, ignore_index):
@@ -88,74 +84,78 @@ def time_f_ordered(self, axis, ignore_index):
class Join:
params = [True, False]
- param_names = ['sort']
+ param_names = ["sort"]
def setup(self, sort):
level1 = tm.makeStringIndex(10).values
level2 = tm.makeStringIndex(1000).values
codes1 = np.arange(10).repeat(1000)
codes2 = np.tile(np.arange(1000), 10)
- index2 = MultiIndex(levels=[level1, level2],
- codes=[codes1, codes2])
- self.df_multi = DataFrame(np.random.randn(len(index2), 4),
- index=index2,
- columns=['A', 'B', 'C', 'D'])
+ index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
+ self.df_multi = DataFrame(
+ np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"]
+ )
self.key1 = np.tile(level1.take(codes1), 10)
self.key2 = np.tile(level2.take(codes2), 10)
- self.df = DataFrame({'data1': np.random.randn(100000),
- 'data2': np.random.randn(100000),
- 'key1': self.key1,
- 'key2': self.key2})
-
- self.df_key1 = DataFrame(np.random.randn(len(level1), 4),
- index=level1,
- columns=['A', 'B', 'C', 'D'])
- self.df_key2 = DataFrame(np.random.randn(len(level2), 4),
- index=level2,
- columns=['A', 'B', 'C', 'D'])
+ self.df = DataFrame(
+ {
+ "data1": np.random.randn(100000),
+ "data2": np.random.randn(100000),
+ "key1": self.key1,
+ "key2": self.key2,
+ }
+ )
+
+ self.df_key1 = DataFrame(
+ np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"]
+ )
+ self.df_key2 = DataFrame(
+ np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"]
+ )
shuf = np.arange(100000)
np.random.shuffle(shuf)
self.df_shuf = self.df.reindex(self.df.index[shuf])
def time_join_dataframe_index_multi(self, sort):
- self.df.join(self.df_multi, on=['key1', 'key2'], sort=sort)
+ self.df.join(self.df_multi, on=["key1", "key2"], sort=sort)
def time_join_dataframe_index_single_key_bigger(self, sort):
- self.df.join(self.df_key2, on='key2', sort=sort)
+ self.df.join(self.df_key2, on="key2", sort=sort)
def time_join_dataframe_index_single_key_small(self, sort):
- self.df.join(self.df_key1, on='key1', sort=sort)
+ self.df.join(self.df_key1, on="key1", sort=sort)
def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
- self.df_shuf.join(self.df_key2, on='key2', sort=sort)
+ self.df_shuf.join(self.df_key2, on="key2", sort=sort)
class JoinIndex:
-
def setup(self):
N = 50000
- self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)),
- columns=['jim', 'joe'])
- self.right = DataFrame(np.random.randint(1, N / 500, (N, 2)),
- columns=['jolie', 'jolia']).set_index('jolie')
+ self.left = DataFrame(
+ np.random.randint(1, N / 500, (N, 2)), columns=["jim", "joe"]
+ )
+ self.right = DataFrame(
+ np.random.randint(1, N / 500, (N, 2)), columns=["jolie", "jolia"]
+ ).set_index("jolie")
def time_left_outer_join_index(self):
- self.left.join(self.right, on='jim')
+ self.left.join(self.right, on="jim")
class JoinNonUnique:
# outer join of non-unique
# GH 6329
def setup(self):
- date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T')
- daily_dates = date_index.to_period('D').to_timestamp('S', 'S')
+ date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T")
+ daily_dates = date_index.to_period("D").to_timestamp("S", "S")
self.fracofday = date_index.values - daily_dates.values
- self.fracofday = self.fracofday.astype('timedelta64[ns]')
+ self.fracofday = self.fracofday.astype("timedelta64[ns]")
self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0
self.fracofday = Series(self.fracofday, daily_dates)
- index = date_range(date_index.min(), date_index.max(), freq='D')
+ index = date_range(date_index.min(), date_index.max(), freq="D")
self.temp = Series(1.0, index)[self.fracofday.index]
def time_join_non_unique_equal(self):
@@ -165,7 +165,7 @@ def time_join_non_unique_equal(self):
class Merge:
params = [True, False]
- param_names = ['sort']
+ param_names = ["sort"]
def setup(self, sort):
N = 10000
@@ -173,17 +173,25 @@ def setup(self, sort):
indices2 = tm.makeStringIndex(N).values
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)
- self.left = DataFrame({'key': key, 'key2': key2,
- 'value': np.random.randn(80000)})
- self.right = DataFrame({'key': indices[2000:],
- 'key2': indices2[2000:],
- 'value2': np.random.randn(8000)})
-
- self.df = DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2),
- 'key2': np.tile(np.arange(250).repeat(10), 4),
- 'value': np.random.randn(10000)})
- self.df2 = DataFrame({'key1': np.arange(500),
- 'value2': np.random.randn(500)})
+ self.left = DataFrame(
+ {"key": key, "key2": key2, "value": np.random.randn(80000)}
+ )
+ self.right = DataFrame(
+ {
+ "key": indices[2000:],
+ "key2": indices2[2000:],
+ "value2": np.random.randn(8000),
+ }
+ )
+
+ self.df = DataFrame(
+ {
+ "key1": np.tile(np.arange(500).repeat(10), 2),
+ "key2": np.tile(np.arange(250).repeat(10), 4),
+ "value": np.random.randn(10000),
+ }
+ )
+ self.df2 = DataFrame({"key1": np.arange(500), "value2": np.random.randn(500)})
self.df3 = self.df[:5000]
def time_merge_2intkey(self, sort):
@@ -193,125 +201,141 @@ def time_merge_dataframe_integer_2key(self, sort):
merge(self.df, self.df3, sort=sort)
def time_merge_dataframe_integer_key(self, sort):
- merge(self.df, self.df2, on='key1', sort=sort)
+ merge(self.df, self.df2, on="key1", sort=sort)
class I8Merge:
- params = ['inner', 'outer', 'left', 'right']
- param_names = ['how']
+ params = ["inner", "outer", "left", "right"]
+ param_names = ["how"]
def setup(self, how):
- low, high, n = -1000, 1000, 10**6
- self.left = DataFrame(np.random.randint(low, high, (n, 7)),
- columns=list('ABCDEFG'))
- self.left['left'] = self.left.sum(axis=1)
- self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1)
+ low, high, n = -1000, 1000, 10 ** 6
+ self.left = DataFrame(
+ np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")
+ )
+ self.left["left"] = self.left.sum(axis=1)
+ self.right = self.left.sample(frac=1).rename({"left": "right"}, axis=1)
self.right = self.right.reset_index(drop=True)
- self.right['right'] *= -1
+ self.right["right"] *= -1
def time_i8merge(self, how):
merge(self.left, self.right, how=how)
class MergeCategoricals:
-
def setup(self):
self.left_object = DataFrame(
- {'X': np.random.choice(range(0, 10), size=(10000,)),
- 'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
+ {
+ "X": np.random.choice(range(0, 10), size=(10000,)),
+ "Y": np.random.choice(["one", "two", "three"], size=(10000,)),
+ }
+ )
self.right_object = DataFrame(
- {'X': np.random.choice(range(0, 10), size=(10000,)),
- 'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
+ {
+ "X": np.random.choice(range(0, 10), size=(10000,)),
+ "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)),
+ }
+ )
self.left_cat = self.left_object.assign(
- Y=self.left_object['Y'].astype('category'))
+ Y=self.left_object["Y"].astype("category")
+ )
self.right_cat = self.right_object.assign(
- Z=self.right_object['Z'].astype('category'))
+ Z=self.right_object["Z"].astype("category")
+ )
def time_merge_object(self):
- merge(self.left_object, self.right_object, on='X')
+ merge(self.left_object, self.right_object, on="X")
def time_merge_cat(self):
- merge(self.left_cat, self.right_cat, on='X')
+ merge(self.left_cat, self.right_cat, on="X")
class MergeOrdered:
-
def setup(self):
groups = tm.makeStringIndex(10).values
- self.left = DataFrame({'group': groups.repeat(5000),
- 'key': np.tile(np.arange(0, 10000, 2), 10),
- 'lvalue': np.random.randn(50000)})
- self.right = DataFrame({'key': np.arange(10000),
- 'rvalue': np.random.randn(10000)})
+ self.left = DataFrame(
+ {
+ "group": groups.repeat(5000),
+ "key": np.tile(np.arange(0, 10000, 2), 10),
+ "lvalue": np.random.randn(50000),
+ }
+ )
+ self.right = DataFrame(
+ {"key": np.arange(10000), "rvalue": np.random.randn(10000)}
+ )
def time_merge_ordered(self):
- merge_ordered(self.left, self.right, on='key', left_by='group')
+ merge_ordered(self.left, self.right, on="key", left_by="group")
class MergeAsof:
- params = [['backward', 'forward', 'nearest']]
- param_names = ['direction']
+ params = [["backward", "forward", "nearest"]]
+ param_names = ["direction"]
def setup(self, direction):
one_count = 200000
two_count = 1000000
df1 = DataFrame(
- {'time': np.random.randint(0, one_count / 20, one_count),
- 'key': np.random.choice(list(string.ascii_uppercase), one_count),
- 'key2': np.random.randint(0, 25, one_count),
- 'value1': np.random.randn(one_count)})
+ {
+ "time": np.random.randint(0, one_count / 20, one_count),
+ "key": np.random.choice(list(string.ascii_uppercase), one_count),
+ "key2": np.random.randint(0, 25, one_count),
+ "value1": np.random.randn(one_count),
+ }
+ )
df2 = DataFrame(
- {'time': np.random.randint(0, two_count / 20, two_count),
- 'key': np.random.choice(list(string.ascii_uppercase), two_count),
- 'key2': np.random.randint(0, 25, two_count),
- 'value2': np.random.randn(two_count)})
-
- df1 = df1.sort_values('time')
- df2 = df2.sort_values('time')
-
- df1['time32'] = np.int32(df1.time)
- df2['time32'] = np.int32(df2.time)
-
- self.df1a = df1[['time', 'value1']]
- self.df2a = df2[['time', 'value2']]
- self.df1b = df1[['time', 'key', 'value1']]
- self.df2b = df2[['time', 'key', 'value2']]
- self.df1c = df1[['time', 'key2', 'value1']]
- self.df2c = df2[['time', 'key2', 'value2']]
- self.df1d = df1[['time32', 'value1']]
- self.df2d = df2[['time32', 'value2']]
- self.df1e = df1[['time', 'key', 'key2', 'value1']]
- self.df2e = df2[['time', 'key', 'key2', 'value2']]
+ {
+ "time": np.random.randint(0, two_count / 20, two_count),
+ "key": np.random.choice(list(string.ascii_uppercase), two_count),
+ "key2": np.random.randint(0, 25, two_count),
+ "value2": np.random.randn(two_count),
+ }
+ )
+
+ df1 = df1.sort_values("time")
+ df2 = df2.sort_values("time")
+
+ df1["time32"] = np.int32(df1.time)
+ df2["time32"] = np.int32(df2.time)
+
+ self.df1a = df1[["time", "value1"]]
+ self.df2a = df2[["time", "value2"]]
+ self.df1b = df1[["time", "key", "value1"]]
+ self.df2b = df2[["time", "key", "value2"]]
+ self.df1c = df1[["time", "key2", "value1"]]
+ self.df2c = df2[["time", "key2", "value2"]]
+ self.df1d = df1[["time32", "value1"]]
+ self.df2d = df2[["time32", "value2"]]
+ self.df1e = df1[["time", "key", "key2", "value1"]]
+ self.df2e = df2[["time", "key", "key2", "value2"]]
def time_on_int(self, direction):
- merge_asof(self.df1a, self.df2a, on='time', direction=direction)
+ merge_asof(self.df1a, self.df2a, on="time", direction=direction)
def time_on_int32(self, direction):
- merge_asof(self.df1d, self.df2d, on='time32', direction=direction)
+ merge_asof(self.df1d, self.df2d, on="time32", direction=direction)
def time_by_object(self, direction):
- merge_asof(self.df1b, self.df2b, on='time', by='key',
- direction=direction)
+ merge_asof(self.df1b, self.df2b, on="time", by="key", direction=direction)
def time_by_int(self, direction):
- merge_asof(self.df1c, self.df2c, on='time', by='key2',
- direction=direction)
+ merge_asof(self.df1c, self.df2c, on="time", by="key2", direction=direction)
def time_multiby(self, direction):
- merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'],
- direction=direction)
+ merge_asof(
+ self.df1e, self.df2e, on="time", by=["key", "key2"], direction=direction
+ )
class Align:
-
def setup(self):
- size = 5 * 10**5
- rng = np.arange(0, 10**13, 10**7)
- stamps = np.datetime64('now').view('i8') + rng
+ size = 5 * 10 ** 5
+ rng = np.arange(0, 10 ** 13, 10 ** 7)
+ stamps = np.datetime64("now").view("i8") + rng
idx1 = np.sort(np.random.choice(stamps, size, replace=False))
idx2 = np.sort(np.random.choice(stamps, size, replace=False))
self.ts1 = Series(np.random.randn(size), idx1)
@@ -321,7 +345,7 @@ def time_series_align_int64_index(self):
self.ts1 + self.ts2
def time_series_align_left_monotonic(self):
- self.ts1.align(self.ts2, join='left')
+ self.ts1.align(self.ts2, join="left")
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index ca2bdc45dc2cb..eda059a68e8a5 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -2,50 +2,48 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import date_range, MultiIndex
+from pandas import date_range, MultiIndex, DataFrame
class GetLoc:
-
def setup(self):
self.mi_large = MultiIndex.from_product(
[np.arange(1000), np.arange(20), list(string.ascii_letters)],
- names=['one', 'two', 'three'])
+ names=["one", "two", "three"],
+ )
self.mi_med = MultiIndex.from_product(
- [np.arange(1000), np.arange(10), list('A')],
- names=['one', 'two', 'three'])
+ [np.arange(1000), np.arange(10), list("A")], names=["one", "two", "three"]
+ )
self.mi_small = MultiIndex.from_product(
- [np.arange(100), list('A'), list('A')],
- names=['one', 'two', 'three'])
+ [np.arange(100), list("A"), list("A")], names=["one", "two", "three"]
+ )
def time_large_get_loc(self):
- self.mi_large.get_loc((999, 19, 'Z'))
+ self.mi_large.get_loc((999, 19, "Z"))
def time_large_get_loc_warm(self):
for _ in range(1000):
- self.mi_large.get_loc((999, 19, 'Z'))
+ self.mi_large.get_loc((999, 19, "Z"))
def time_med_get_loc(self):
- self.mi_med.get_loc((999, 9, 'A'))
+ self.mi_med.get_loc((999, 9, "A"))
def time_med_get_loc_warm(self):
for _ in range(1000):
- self.mi_med.get_loc((999, 9, 'A'))
+ self.mi_med.get_loc((999, 9, "A"))
def time_string_get_loc(self):
- self.mi_small.get_loc((99, 'A', 'A'))
+ self.mi_small.get_loc((99, "A", "A"))
def time_small_get_loc_warm(self):
for _ in range(1000):
- self.mi_small.get_loc((99, 'A', 'A'))
+ self.mi_small.get_loc((99, "A", "A"))
class Duplicates:
-
def setup(self):
size = 65536
- arrays = [np.random.randint(0, 8192, size),
- np.random.randint(0, 1024, size)]
+ arrays = [np.random.randint(0, 8192, size), np.random.randint(0, 1024, size)]
mask = np.random.rand(size) < 0.1
self.mi_unused_levels = MultiIndex.from_arrays(arrays)
self.mi_unused_levels = self.mi_unused_levels[mask]
@@ -55,15 +53,25 @@ def time_remove_unused_levels(self):
class Integer:
-
def setup(self):
- self.mi_int = MultiIndex.from_product([np.arange(1000),
- np.arange(1000)],
- names=['one', 'two'])
- self.obj_index = np.array([(0, 10), (0, 11), (0, 12),
- (0, 13), (0, 14), (0, 15),
- (0, 16), (0, 17), (0, 18),
- (0, 19)], dtype=object)
+ self.mi_int = MultiIndex.from_product(
+ [np.arange(1000), np.arange(1000)], names=["one", "two"]
+ )
+ self.obj_index = np.array(
+ [
+ (0, 10),
+ (0, 11),
+ (0, 12),
+ (0, 13),
+ (0, 14),
+ (0, 15),
+ (0, 16),
+ (0, 17),
+ (0, 18),
+ (0, 19),
+ ],
+ dtype=object,
+ )
def time_get_indexer(self):
self.mi_int.get_indexer(self.obj_index)
@@ -73,12 +81,9 @@ def time_is_monotonic(self):
class Duplicated:
-
def setup(self):
n, k = 200, 5000
- levels = [np.arange(n),
- tm.makeStringIndex(n).values,
- 1000 + np.arange(n)]
+ levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
codes = [np.random.choice(n, (k * n)) for lev in levels]
self.mi = MultiIndex(levels=levels, codes=codes)
@@ -87,12 +92,13 @@ def time_duplicated(self):
class Sortlevel:
-
def setup(self):
n = 1182720
low, high = -4096, 4096
- arrs = [np.repeat(np.random.randint(low, high, (n // k)), k)
- for k in [11, 7, 5, 3, 1]]
+ arrs = [
+ np.repeat(np.random.randint(low, high, (n // k)), k)
+ for k in [11, 7, 5, 3, 1]
+ ]
self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)]
a = np.repeat(np.arange(100), 1000)
@@ -111,11 +117,10 @@ def time_sortlevel_one(self):
class Values:
-
def setup_cache(self):
level1 = range(1000)
- level2 = date_range(start='1/1/2012', periods=100)
+ level2 = date_range(start="1/1/2012", periods=100)
mi = MultiIndex.from_product([level1, level2])
return mi
@@ -126,4 +131,19 @@ def time_datetime_level_values_sliced(self, mi):
mi[:10].values
+class CategoricalLevel:
+ def setup(self):
+
+ self.df = DataFrame(
+ {
+ "a": np.arange(1_000_000, dtype=np.int32),
+ "b": np.arange(1_000_000, dtype=np.int64),
+ "c": np.arange(1_000_000, dtype=float),
+ }
+ ).astype({"a": "category", "b": "category"})
+
+ def time_categorical_level(self):
+ self.df.set_index(["a", "b"])
+
+
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py
index 26e344758596f..31c3b6fb6cb60 100644
--- a/asv_bench/benchmarks/offset.py
+++ b/asv_bench/benchmarks/offset.py
@@ -3,42 +3,51 @@
import numpy as np
import pandas as pd
+
try:
import pandas.tseries.holiday # noqa
except ImportError:
pass
hcal = pd.tseries.holiday.USFederalHolidayCalendar()
-# These offests currently raise a NotImplimentedError with .apply_index()
-non_apply = [pd.offsets.Day(),
- pd.offsets.BYearEnd(),
- pd.offsets.BYearBegin(),
- pd.offsets.BQuarterEnd(),
- pd.offsets.BQuarterBegin(),
- pd.offsets.BMonthEnd(),
- pd.offsets.BMonthBegin(),
- pd.offsets.CustomBusinessDay(),
- pd.offsets.CustomBusinessDay(calendar=hcal),
- pd.offsets.CustomBusinessMonthBegin(calendar=hcal),
- pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
- pd.offsets.CustomBusinessMonthEnd(calendar=hcal)]
-other_offsets = [pd.offsets.YearEnd(), pd.offsets.YearBegin(),
- pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(),
- pd.offsets.MonthEnd(), pd.offsets.MonthBegin(),
- pd.offsets.DateOffset(months=2, days=2),
- pd.offsets.BusinessDay(), pd.offsets.SemiMonthEnd(),
- pd.offsets.SemiMonthBegin()]
+# These offsets currently raise a NotImplementedError with .apply_index()
+non_apply = [
+ pd.offsets.Day(),
+ pd.offsets.BYearEnd(),
+ pd.offsets.BYearBegin(),
+ pd.offsets.BQuarterEnd(),
+ pd.offsets.BQuarterBegin(),
+ pd.offsets.BMonthEnd(),
+ pd.offsets.BMonthBegin(),
+ pd.offsets.CustomBusinessDay(),
+ pd.offsets.CustomBusinessDay(calendar=hcal),
+ pd.offsets.CustomBusinessMonthBegin(calendar=hcal),
+ pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
+ pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
+]
+other_offsets = [
+ pd.offsets.YearEnd(),
+ pd.offsets.YearBegin(),
+ pd.offsets.QuarterEnd(),
+ pd.offsets.QuarterBegin(),
+ pd.offsets.MonthEnd(),
+ pd.offsets.MonthBegin(),
+ pd.offsets.DateOffset(months=2, days=2),
+ pd.offsets.BusinessDay(),
+ pd.offsets.SemiMonthEnd(),
+ pd.offsets.SemiMonthBegin(),
+]
offsets = non_apply + other_offsets
class ApplyIndex:
params = other_offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 10000
- self.rng = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ self.rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
def time_apply_index(self, offset):
offset.apply_index(self.rng)
@@ -47,13 +56,15 @@ def time_apply_index(self, offset):
class OnOffset:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
- self.dates = [datetime(2016, m, d)
- for m in [10, 11, 12]
- for d in [1, 2, 3, 28, 29, 30, 31]
- if not (m == 11 and d == 31)]
+ self.dates = [
+ datetime(2016, m, d)
+ for m in [10, 11, 12]
+ for d in [1, 2, 3, 28, 29, 30, 31]
+ if not (m == 11 and d == 31)
+ ]
def time_on_offset(self, offset):
for date in self.dates:
@@ -63,11 +74,11 @@ def time_on_offset(self, offset):
class OffsetSeriesArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 1000
- rng = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
self.data = pd.Series(rng)
def time_add_offset(self, offset):
@@ -78,11 +89,11 @@ def time_add_offset(self, offset):
class OffsetDatetimeIndexArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 1000
- self.data = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ self.data = pd.date_range(start="1/1/2000", periods=N, freq="T")
def time_add_offset(self, offset):
with warnings.catch_warnings(record=True):
@@ -92,11 +103,11 @@ def time_add_offset(self, offset):
class OffestDatetimeArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
self.date = datetime(2011, 1, 1)
- self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.dt64 = np.datetime64("2011-01-01 09:00Z")
def time_apply(self, offset):
offset.apply(self.date)
diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
index 59b1638920666..fdc8207021c0f 100644
--- a/asv_bench/benchmarks/pandas_vb_common.py
+++ b/asv_bench/benchmarks/pandas_vb_common.py
@@ -5,26 +5,42 @@
import pandas as pd
# Compatibility import for lib
-for imp in ['pandas._libs.lib', 'pandas.lib']:
+for imp in ["pandas._libs.lib", "pandas.lib"]:
try:
lib = import_module(imp)
break
except (ImportError, TypeError, ValueError):
pass
-numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
- np.float64, np.int16, np.int8, np.uint16, np.uint8]
+numeric_dtypes = [
+ np.int64,
+ np.int32,
+ np.uint32,
+ np.uint64,
+ np.float32,
+ np.float64,
+ np.int16,
+ np.int8,
+ np.uint16,
+ np.uint8,
+]
datetime_dtypes = [np.datetime64, np.timedelta64]
string_dtypes = [np.object]
try:
- extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype,
- pd.Int32Dtype, pd.Int64Dtype,
- pd.UInt8Dtype, pd.UInt16Dtype,
- pd.UInt32Dtype, pd.UInt64Dtype,
- pd.CategoricalDtype,
- pd.IntervalDtype,
- pd.DatetimeTZDtype('ns', 'UTC'),
- pd.PeriodDtype('D')]
+ extension_dtypes = [
+ pd.Int8Dtype,
+ pd.Int16Dtype,
+ pd.Int32Dtype,
+ pd.Int64Dtype,
+ pd.UInt8Dtype,
+ pd.UInt16Dtype,
+ pd.UInt32Dtype,
+ pd.UInt64Dtype,
+ pd.CategoricalDtype,
+ pd.IntervalDtype,
+ pd.DatetimeTZDtype("ns", "UTC"),
+ pd.PeriodDtype("D"),
+ ]
except AttributeError:
extension_dtypes = []
@@ -40,6 +56,7 @@ class BaseIO:
"""
Base class for IO benchmarks
"""
+
fname = None
def remove(self, f):
diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py
index c8ba6c382cb64..2f8ae0650ab75 100644
--- a/asv_bench/benchmarks/period.py
+++ b/asv_bench/benchmarks/period.py
@@ -1,18 +1,33 @@
-from pandas import (
- DataFrame, Period, PeriodIndex, Series, date_range, period_range)
+from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range
from pandas.tseries.frequencies import to_offset
class PeriodProperties:
- params = (['M', 'min'],
- ['year', 'month', 'day', 'hour', 'minute', 'second',
- 'is_leap_year', 'quarter', 'qyear', 'week', 'daysinmonth',
- 'dayofweek', 'dayofyear', 'start_time', 'end_time'])
- param_names = ['freq', 'attr']
+ params = (
+ ["M", "min"],
+ [
+ "year",
+ "month",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "is_leap_year",
+ "quarter",
+ "qyear",
+ "week",
+ "daysinmonth",
+ "dayofweek",
+ "dayofyear",
+ "start_time",
+ "end_time",
+ ],
+ )
+ param_names = ["freq", "attr"]
def setup(self, freq, attr):
- self.per = Period('2012-06-01', freq=freq)
+ self.per = Period("2012-06-01", freq=freq)
def time_property(self, freq, attr):
getattr(self.per, attr)
@@ -20,11 +35,11 @@ def time_property(self, freq, attr):
class PeriodUnaryMethods:
- params = ['M', 'min']
- param_names = ['freq']
+ params = ["M", "min"]
+ param_names = ["freq"]
def setup(self, freq):
- self.per = Period('2012-06-01', freq=freq)
+ self.per = Period("2012-06-01", freq=freq)
def time_to_timestamp(self, freq):
self.per.to_timestamp()
@@ -33,12 +48,12 @@ def time_now(self, freq):
self.per.now(freq)
def time_asfreq(self, freq):
- self.per.asfreq('A')
+ self.per.asfreq("A")
class PeriodConstructor:
- params = [['D'], [True, False]]
- param_names = ['freq', 'is_offset']
+ params = [["D"], [True, False]]
+ param_names = ["freq", "is_offset"]
def setup(self, freq, is_offset):
if is_offset:
@@ -47,20 +62,21 @@ def setup(self, freq, is_offset):
self.freq = freq
def time_period_constructor(self, freq, is_offset):
- Period('2012-06-01', freq=freq)
+ Period("2012-06-01", freq=freq)
class PeriodIndexConstructor:
- params = [['D'], [True, False]]
- param_names = ['freq', 'is_offset']
+ params = [["D"], [True, False]]
+ param_names = ["freq", "is_offset"]
def setup(self, freq, is_offset):
- self.rng = date_range('1985', periods=1000)
- self.rng2 = date_range('1985', periods=1000).to_pydatetime()
+ self.rng = date_range("1985", periods=1000)
+ self.rng2 = date_range("1985", periods=1000).to_pydatetime()
self.ints = list(range(2000, 3000))
- self.daily_ints = date_range('1/1/2000', periods=1000,
- freq=freq).strftime('%Y%m%d').map(int)
+ self.daily_ints = (
+ date_range("1/1/2000", periods=1000, freq=freq).strftime("%Y%m%d").map(int)
+ )
if is_offset:
self.freq = to_offset(freq)
else:
@@ -80,32 +96,35 @@ def time_from_ints_daily(self, freq, is_offset):
class DataFramePeriodColumn:
-
def setup(self):
- self.rng = period_range(start='1/1/1990', freq='S', periods=20000)
+ self.rng = period_range(start="1/1/1990", freq="S", periods=20000)
self.df = DataFrame(index=range(len(self.rng)))
def time_setitem_period_column(self):
- self.df['col'] = self.rng
+ self.df["col"] = self.rng
def time_set_index(self):
# GH#21582 limited by comparisons of Period objects
- self.df['col2'] = self.rng
- self.df.set_index('col2', append=True)
+ self.df["col2"] = self.rng
+ self.df.set_index("col2", append=True)
class Algorithms:
- params = ['index', 'series']
- param_names = ['typ']
+ params = ["index", "series"]
+ param_names = ["typ"]
def setup(self, typ):
- data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
- Period('2011-03', freq='M'), Period('2011-04', freq='M')]
-
- if typ == 'index':
- self.vector = PeriodIndex(data * 1000, freq='M')
- elif typ == 'series':
+ data = [
+ Period("2011-01", freq="M"),
+ Period("2011-02", freq="M"),
+ Period("2011-03", freq="M"),
+ Period("2011-04", freq="M"),
+ ]
+
+ if typ == "index":
+ self.vector = PeriodIndex(data * 1000, freq="M")
+ elif typ == "series":
self.vector = Series(data * 1000)
def time_drop_duplicates(self, typ):
@@ -116,9 +135,8 @@ def time_value_counts(self, typ):
class Indexing:
-
def setup(self):
- self.index = period_range(start='1985', periods=1000, freq='D')
+ self.index = period_range(start="1985", periods=1000, freq="D")
self.series = Series(range(1000), index=self.index)
self.period = self.index[500]
@@ -135,7 +153,7 @@ def time_series_loc(self):
self.series.loc[self.period]
def time_align(self):
- DataFrame({'a': self.series, 'b': self.series[:500]})
+ DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index[:750].intersection(self.index[250:])
diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py
index 9e3bc87c32987..4fb0876f05a0a 100644
--- a/asv_bench/benchmarks/plotting.py
+++ b/asv_bench/benchmarks/plotting.py
@@ -1,27 +1,29 @@
import numpy as np
from pandas import DataFrame, Series, DatetimeIndex, date_range
+
try:
from pandas.plotting import andrews_curves
except ImportError:
from pandas.tools.plotting import andrews_curves
import matplotlib
-matplotlib.use('Agg')
+
+matplotlib.use("Agg")
class SeriesPlotting:
- params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']]
- param_names = ['kind']
+ params = [["line", "bar", "area", "barh", "hist", "kde", "pie"]]
+ param_names = ["kind"]
def setup(self, kind):
- if kind in ['bar', 'barh', 'pie']:
+ if kind in ["bar", "barh", "pie"]:
n = 100
- elif kind in ['kde']:
+ elif kind in ["kde"]:
n = 10000
else:
n = 1000000
self.s = Series(np.random.randn(n))
- if kind in ['area', 'pie']:
+ if kind in ["area", "pie"]:
self.s = self.s.abs()
def time_series_plot(self, kind):
@@ -29,41 +31,43 @@ def time_series_plot(self, kind):
class FramePlotting:
- params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter',
- 'hexbin']]
- param_names = ['kind']
+ params = [
+ ["line", "bar", "area", "barh", "hist", "kde", "pie", "scatter", "hexbin"]
+ ]
+ param_names = ["kind"]
def setup(self, kind):
- if kind in ['bar', 'barh', 'pie']:
+ if kind in ["bar", "barh", "pie"]:
n = 100
- elif kind in ['kde', 'scatter', 'hexbin']:
+ elif kind in ["kde", "scatter", "hexbin"]:
n = 10000
else:
n = 1000000
self.x = Series(np.random.randn(n))
self.y = Series(np.random.randn(n))
- if kind in ['area', 'pie']:
+ if kind in ["area", "pie"]:
self.x = self.x.abs()
self.y = self.y.abs()
- self.df = DataFrame({'x': self.x, 'y': self.y})
+ self.df = DataFrame({"x": self.x, "y": self.y})
def time_frame_plot(self, kind):
- self.df.plot(x='x', y='y', kind=kind)
+ self.df.plot(x="x", y="y", kind=kind)
class TimeseriesPlotting:
-
def setup(self):
N = 2000
M = 5
- idx = date_range('1/1/1975', periods=N)
+ idx = date_range("1/1/1975", periods=N)
self.df = DataFrame(np.random.randn(N, M), index=idx)
- idx_irregular = DatetimeIndex(np.concatenate((idx.values[0:10],
- idx.values[12:])))
- self.df2 = DataFrame(np.random.randn(len(idx_irregular), M),
- index=idx_irregular)
+ idx_irregular = DatetimeIndex(
+ np.concatenate((idx.values[0:10], idx.values[12:]))
+ )
+ self.df2 = DataFrame(
+ np.random.randn(len(idx_irregular), M), index=idx_irregular
+ )
def time_plot_regular(self):
self.df.plot()
@@ -79,12 +83,11 @@ def time_plot_table(self):
class Misc:
-
def setup(self):
N = 500
M = 10
self.df = DataFrame(np.random.randn(N, M))
- self.df['Name'] = ["A"] * N
+ self.df["Name"] = ["A"] * N
def time_plot_andrews_curves(self):
andrews_curves(self.df, "Name")
diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
index a6ceb0e93a089..8d4c9ebaf3e89 100644
--- a/asv_bench/benchmarks/reindex.py
+++ b/asv_bench/benchmarks/reindex.py
@@ -1,20 +1,18 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (DataFrame, Series, MultiIndex, Index, date_range,
- period_range)
+from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range
from .pandas_vb_common import lib
class Reindex:
-
def setup(self):
- rng = date_range(start='1/1/1970', periods=10000, freq='1min')
- self.df = DataFrame(np.random.rand(10000, 10), index=rng,
- columns=range(10))
- self.df['foo'] = 'bar'
+ rng = date_range(start="1/1/1970", periods=10000, freq="1min")
+ self.df = DataFrame(np.random.rand(10000, 10), index=rng, columns=range(10))
+ self.df["foo"] = "bar"
self.rng_subset = Index(rng[::2])
- self.df2 = DataFrame(index=range(10000),
- data=np.random.rand(10000, 30), columns=range(30))
+ self.df2 = DataFrame(
+ index=range(10000), data=np.random.rand(10000, 30), columns=range(30)
+ )
N = 5000
K = 200
level1 = tm.makeStringIndex(N).values.repeat(K)
@@ -35,12 +33,12 @@ def time_reindex_multiindex(self):
class ReindexMethod:
- params = [['pad', 'backfill'], [date_range, period_range]]
- param_names = ['method', 'constructor']
+ params = [["pad", "backfill"], [date_range, period_range]]
+ param_names = ["method", "constructor"]
def setup(self, method, constructor):
N = 100000
- self.idx = constructor('1/1/2000', periods=N, freq='1min')
+ self.idx = constructor("1/1/2000", periods=N, freq="1min")
self.ts = Series(np.random.randn(N), index=self.idx)[::2]
def time_reindex_method(self, method, constructor):
@@ -49,15 +47,15 @@ def time_reindex_method(self, method, constructor):
class Fillna:
- params = ['pad', 'backfill']
- param_names = ['method']
+ params = ["pad", "backfill"]
+ param_names = ["method"]
def setup(self, method):
N = 100000
- self.idx = date_range('1/1/2000', periods=N, freq='1min')
+ self.idx = date_range("1/1/2000", periods=N, freq="1min")
ts = Series(np.random.randn(N), index=self.idx)[::2]
self.ts_reindexed = ts.reindex(self.idx)
- self.ts_float32 = self.ts_reindexed.astype('float32')
+ self.ts_float32 = self.ts_reindexed.astype("float32")
def time_reindexed(self, method):
self.ts_reindexed.fillna(method=method)
@@ -67,17 +65,17 @@ def time_float_32(self, method):
class LevelAlign:
-
def setup(self):
self.index = MultiIndex(
levels=[np.arange(10), np.arange(100), np.arange(100)],
- codes=[np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)])
- self.df = DataFrame(np.random.randn(len(self.index), 4),
- index=self.index)
- self.df_level = DataFrame(np.random.randn(100, 4),
- index=self.index.levels[1])
+ codes=[
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ],
+ )
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
def time_align_level(self):
self.df.align(self.df_level, level=1, copy=False)
@@ -89,15 +87,16 @@ def time_reindex_level(self):
class DropDuplicates:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
N = 10000
K = 10
key1 = tm.makeStringIndex(N).values.repeat(K)
key2 = tm.makeStringIndex(N).values.repeat(K)
- self.df = DataFrame({'key1': key1, 'key2': key2,
- 'value': np.random.randn(N * K)})
+ self.df = DataFrame(
+ {"key1": key1, "key2": key2, "value": np.random.randn(N * K)}
+ )
self.df_nan = self.df.copy()
self.df_nan.iloc[:10000, :] = np.nan
@@ -107,15 +106,14 @@ def setup(self, inplace):
N = 1000000
K = 10000
key1 = np.random.randint(0, K, size=N)
- self.df_int = DataFrame({'key1': key1})
- self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
- dtype=bool))
+ self.df_int = DataFrame({"key1": key1})
+ self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), dtype=bool))
def time_frame_drop_dups(self, inplace):
- self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)
+ self.df.drop_duplicates(["key1", "key2"], inplace=inplace)
def time_frame_drop_dups_na(self, inplace):
- self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)
+ self.df_nan.drop_duplicates(["key1", "key2"], inplace=inplace)
def time_series_drop_dups_int(self, inplace):
self.s.drop_duplicates(inplace=inplace)
@@ -137,16 +135,16 @@ def setup(self):
indices = tm.makeStringIndex(n)
subsample_size = 40000
self.x = Series(np.random.randn(n), indices)
- self.y = Series(np.random.randn(subsample_size),
- index=np.random.choice(indices, subsample_size,
- replace=False))
+ self.y = Series(
+ np.random.randn(subsample_size),
+ index=np.random.choice(indices, subsample_size, replace=False),
+ )
def time_align_series_irregular_string(self):
self.x + self.y
class LibFastZip:
-
def setup(self):
N = 10000
K = 10
diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
index 9dff1778f8e56..f69ae15028525 100644
--- a/asv_bench/benchmarks/replace.py
+++ b/asv_bench/benchmarks/replace.py
@@ -5,11 +5,11 @@
class FillNa:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
- N = 10**6
- rng = pd.date_range('1/1/2000', periods=N, freq='min')
+ N = 10 ** 6
+ rng = pd.date_range("1/1/2000", periods=N, freq="min")
data = np.random.randn(N)
data[::2] = np.nan
self.ts = pd.Series(data, index=rng)
@@ -24,28 +24,48 @@ def time_replace(self, inplace):
class ReplaceDict:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
- N = 10**5
- start_value = 10**5
+ N = 10 ** 5
+ start_value = 10 ** 5
self.to_rep = dict(enumerate(np.arange(N) + start_value))
- self.s = pd.Series(np.random.randint(N, size=10**3))
+ self.s = pd.Series(np.random.randint(N, size=10 ** 3))
def time_replace_series(self, inplace):
self.s.replace(self.to_rep, inplace=inplace)
+class ReplaceList:
+ # GH#28099
+
+ params = [(True, False)]
+ param_names = ["inplace"]
+
+ def setup(self, inplace):
+ self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10 ** 7))
+
+ def time_replace_list(self, inplace):
+ self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace)
+
+ def time_replace_list_one_match(self, inplace):
+ # the 1 can be held in self._df.blocks[0], while the inf and -inf can't
+ self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace)
+
+
class Convert:
- params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta'])
- param_names = ['constructor', 'replace_data']
+ params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"])
+ param_names = ["constructor", "replace_data"]
def setup(self, constructor, replace_data):
- N = 10**3
- data = {'Series': pd.Series(np.random.randint(N, size=N)),
- 'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N),
- 'B': np.random.randint(N, size=N)})}
+ N = 10 ** 3
+ data = {
+ "Series": pd.Series(np.random.randint(N, size=N)),
+ "DataFrame": pd.DataFrame(
+ {"A": np.random.randint(N, size=N), "B": np.random.randint(N, size=N)}
+ ),
+ }
self.to_replace = {i: getattr(pd, replace_data) for i in range(N)}
self.data = data[constructor]
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 678403d837805..cc373f413fb88 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -7,35 +7,33 @@
class Melt:
-
def setup(self):
- self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C'])
- self.df['id1'] = np.random.randint(0, 10, 10000)
- self.df['id2'] = np.random.randint(100, 1000, 10000)
+ self.df = DataFrame(np.random.randn(10000, 3), columns=["A", "B", "C"])
+ self.df["id1"] = np.random.randint(0, 10, 10000)
+ self.df["id2"] = np.random.randint(100, 1000, 10000)
def time_melt_dataframe(self):
- melt(self.df, id_vars=['id1', 'id2'])
+ melt(self.df, id_vars=["id1", "id2"])
class Pivot:
-
def setup(self):
N = 10000
- index = date_range('1/1/2000', periods=N, freq='h')
- data = {'value': np.random.randn(N * 50),
- 'variable': np.arange(50).repeat(N),
- 'date': np.tile(index.values, 50)}
+ index = date_range("1/1/2000", periods=N, freq="h")
+ data = {
+ "value": np.random.randn(N * 50),
+ "variable": np.arange(50).repeat(N),
+ "date": np.tile(index.values, 50),
+ }
self.df = DataFrame(data)
def time_reshape_pivot_time_series(self):
- self.df.pivot('date', 'variable', 'value')
+ self.df.pivot("date", "variable", "value")
class SimpleReshape:
-
def setup(self):
- arrays = [np.arange(100).repeat(100),
- np.roll(np.tile(np.arange(100), 100), 25)]
+ arrays = [np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]
index = MultiIndex.from_arrays(arrays)
self.df = DataFrame(np.random.randn(10000, 4), index=index)
self.udf = self.df.unstack(1)
@@ -49,7 +47,7 @@ def time_unstack(self):
class Unstack:
- params = ['int', 'category']
+ params = ["int", "category"]
def setup(self, dtype):
m = 100
@@ -58,7 +56,7 @@ def setup(self, dtype):
levels = np.arange(m)
index = MultiIndex.from_product([levels] * 2)
columns = np.arange(n)
- if dtype == 'int':
+ if dtype == "int":
values = np.arange(m * m * n).reshape(m * m, n)
else:
# the category branch is ~20x slower than int. So we
@@ -80,84 +78,94 @@ def time_without_last_row(self, dtype):
class SparseIndex:
-
def setup(self):
NUM_ROWS = 1000
- self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS),
- 'B': np.random.randint(50, size=NUM_ROWS),
- 'C': np.random.randint(-10, 10, size=NUM_ROWS),
- 'D': np.random.randint(-10, 10, size=NUM_ROWS),
- 'E': np.random.randint(10, size=NUM_ROWS),
- 'F': np.random.randn(NUM_ROWS)})
- self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E'])
+ self.df = DataFrame(
+ {
+ "A": np.random.randint(50, size=NUM_ROWS),
+ "B": np.random.randint(50, size=NUM_ROWS),
+ "C": np.random.randint(-10, 10, size=NUM_ROWS),
+ "D": np.random.randint(-10, 10, size=NUM_ROWS),
+ "E": np.random.randint(10, size=NUM_ROWS),
+ "F": np.random.randn(NUM_ROWS),
+ }
+ )
+ self.df = self.df.set_index(["A", "B", "C", "D", "E"])
def time_unstack(self):
self.df.unstack()
class WideToLong:
-
def setup(self):
nyrs = 20
nidvars = 20
N = 5000
- self.letters = list('ABCD')
- yrvars = [l + str(num)
- for l, num in product(self.letters, range(1, nyrs + 1))]
+ self.letters = list("ABCD")
+ yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))]
columns = [str(i) for i in range(nidvars)] + yrvars
- self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)),
- columns=columns)
- self.df['id'] = self.df.index
+ self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns)
+ self.df["id"] = self.df.index
def time_wide_to_long_big(self):
- wide_to_long(self.df, self.letters, i='id', j='year')
+ wide_to_long(self.df, self.letters, i="id", j="year")
class PivotTable:
-
def setup(self):
N = 100000
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
ind1 = np.random.randint(0, 3, size=N)
ind2 = np.random.randint(0, 2, size=N)
- self.df = DataFrame({'key1': fac1.take(ind1),
- 'key2': fac2.take(ind2),
- 'key3': fac2.take(ind2),
- 'value1': np.random.randn(N),
- 'value2': np.random.randn(N),
- 'value3': np.random.randn(N)})
- self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'),
- 'col3': [1, 2, 3, 4, 5]})
- self.df2.col1 = self.df2.col1.astype('category')
- self.df2.col2 = self.df2.col2.astype('category')
+ self.df = DataFrame(
+ {
+ "key1": fac1.take(ind1),
+ "key2": fac2.take(ind2),
+ "key3": fac2.take(ind2),
+ "value1": np.random.randn(N),
+ "value2": np.random.randn(N),
+ "value3": np.random.randn(N),
+ }
+ )
+ self.df2 = DataFrame(
+ {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]}
+ )
+ self.df2.col1 = self.df2.col1.astype("category")
+ self.df2.col2 = self.df2.col2.astype("category")
def time_pivot_table(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'])
+ self.df.pivot_table(index="key1", columns=["key2", "key3"])
def time_pivot_table_agg(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'],
- aggfunc=['sum', 'mean'])
+ self.df.pivot_table(
+ index="key1", columns=["key2", "key3"], aggfunc=["sum", "mean"]
+ )
def time_pivot_table_margins(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'],
- margins=True)
+ self.df.pivot_table(index="key1", columns=["key2", "key3"], margins=True)
def time_pivot_table_categorical(self):
- self.df2.pivot_table(index='col1', values='col3', columns='col2',
- aggfunc=np.sum, fill_value=0)
+ self.df2.pivot_table(
+ index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0
+ )
def time_pivot_table_categorical_observed(self):
- self.df2.pivot_table(index='col1', values='col3', columns='col2',
- aggfunc=np.sum, fill_value=0, observed=True)
+ self.df2.pivot_table(
+ index="col1",
+ values="col3",
+ columns="col2",
+ aggfunc=np.sum,
+ fill_value=0,
+ observed=True,
+ )
class Crosstab:
-
def setup(self):
N = 100000
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
self.ind1 = np.random.randint(0, 3, size=N)
self.ind2 = np.random.randint(0, 2, size=N)
self.vec1 = fac1.take(self.ind1)
@@ -167,7 +175,7 @@ def time_crosstab(self):
pd.crosstab(self.vec1, self.vec2)
def time_crosstab_values(self):
- pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum')
+ pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc="sum")
def time_crosstab_normalize(self):
pd.crosstab(self.vec1, self.vec2, normalize=True)
@@ -179,8 +187,10 @@ def time_crosstab_normalize_margins(self):
class GetDummies:
def setup(self):
categories = list(string.ascii_letters[:12])
- s = pd.Series(np.random.choice(categories, size=1000000),
- dtype=pd.api.types.CategoricalDtype(categories))
+ s = pd.Series(
+ np.random.choice(categories, size=1000000),
+ dtype=pd.api.types.CategoricalDtype(categories),
+ )
self.s = s
def time_get_dummies_1d(self):
@@ -192,16 +202,19 @@ def time_get_dummies_1d_sparse(self):
class Cut:
params = [[4, 10, 1000]]
- param_names = ['bins']
+ param_names = ["bins"]
def setup(self, bins):
- N = 10**5
+ N = 10 ** 5
self.int_series = pd.Series(np.arange(N).repeat(5))
self.float_series = pd.Series(np.random.randn(N).repeat(5))
- self.timedelta_series = pd.Series(np.random.randint(N, size=N),
- dtype='timedelta64[ns]')
- self.datetime_series = pd.Series(np.random.randint(N, size=N),
- dtype='datetime64[ns]')
+ self.timedelta_series = pd.Series(
+ np.random.randint(N, size=N), dtype="timedelta64[ns]"
+ )
+ self.datetime_series = pd.Series(
+ np.random.randint(N, size=N), dtype="datetime64[ns]"
+ )
+ self.interval_bins = pd.IntervalIndex.from_breaks(np.linspace(0, N, bins))
def time_cut_int(self, bins):
pd.cut(self.int_series, bins)
@@ -227,5 +240,26 @@ def time_qcut_timedelta(self, bins):
def time_qcut_datetime(self, bins):
pd.qcut(self.datetime_series, bins)
+ def time_cut_interval(self, bins):
+ # GH 27668
+ pd.cut(self.int_series, self.interval_bins)
+
+ def peakmem_cut_interval(self, bins):
+ # GH 27668
+ pd.cut(self.int_series, self.interval_bins)
+
+
+class Explode:
+ param_names = ["n_rows", "max_list_length"]
+ params = [[100, 1000, 10000], [3, 5, 10]]
+
+ def setup(self, n_rows, max_list_length):
+
+ data = [np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)]
+ self.series = pd.Series(data)
+
+ def time_explode(self, n_rows, max_list_length):
+ self.series.explode()
+
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 033b466c8b9be..a70977fcf539f 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -4,15 +4,16 @@
class Methods:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ [10, 1000],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.roll = getattr(pd, constructor)(arr).rolling(window)
@@ -22,14 +23,15 @@ def time_rolling(self, constructor, window, dtype, method):
class ExpandingMethods:
- params = (['DataFrame', 'Series'],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.expanding = getattr(pd, constructor)(arr).expanding()
@@ -39,14 +41,11 @@ def time_expanding(self, constructor, dtype, method):
class EWMMethods:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- ['mean', 'std'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"])
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window)
@@ -55,29 +54,28 @@ def time_ewm(self, constructor, window, dtype, method):
class VariableWindowMethods(Methods):
- params = (['DataFrame', 'Series'],
- ['50s', '1h', '1d'],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ ["50s", "1h", "1d"],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
- index = pd.date_range('2017-01-01', periods=N, freq='5s')
+ index = pd.date_range("2017-01-01", periods=N, freq="5s")
self.roll = getattr(pd, constructor)(arr, index=index).rolling(window)
class Pairwise:
- params = ([10, 1000, None],
- ['corr', 'cov'],
- [True, False])
- param_names = ['window', 'method', 'pairwise']
+ params = ([10, 1000, None], ["corr", "cov"], [True, False])
+ param_names = ["window", "method", "pairwise"]
def setup(self, window, method, pairwise):
- N = 10**4
+ N = 10 ** 4
arr = np.random.random(N)
self.df = pd.DataFrame(arr)
@@ -90,25 +88,25 @@ def time_pairwise(self, window, method, pairwise):
class Quantile:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- [0, 0.5, 1],
- ['linear', 'nearest', 'lower', 'higher', 'midpoint'])
- param_names = ['constructor', 'window', 'dtype', 'percentile']
+ params = (
+ ["DataFrame", "Series"],
+ [10, 1000],
+ ["int", "float"],
+ [0, 0.5, 1],
+ ["linear", "nearest", "lower", "higher", "midpoint"],
+ )
+ param_names = ["constructor", "window", "dtype", "percentile"]
def setup(self, constructor, window, dtype, percentile, interpolation):
N = 10 ** 5
arr = np.random.random(N).astype(dtype)
self.roll = getattr(pd, constructor)(arr).rolling(window)
- def time_quantile(self, constructor, window, dtype, percentile,
- interpolation):
+ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
self.roll.quantile(percentile, interpolation=interpolation)
class PeakMemFixed:
-
def setup(self):
N = 10
arr = 100 * np.random.random(N)
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 4b1af2dc8c932..6038a2ab4bd9f 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -7,13 +7,13 @@
class SeriesConstructor:
- params = [None, 'dict']
- param_names = ['data']
+ params = [None, "dict"]
+ param_names = ["data"]
def setup(self, data):
- self.idx = date_range(start=datetime(2015, 10, 26),
- end=datetime(2016, 1, 1),
- freq='50s')
+ self.idx = date_range(
+ start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s"
+ )
dict_data = dict(zip(self.idx, range(len(self.idx))))
self.data = None if data is None else dict_data
@@ -23,8 +23,8 @@ def time_constructor(self, data):
class IsIn:
- params = ['int64', 'uint64', 'object']
- param_names = ['dtype']
+ params = ["int64", "uint64", "object"]
+ param_names = ["dtype"]
def setup(self, dtype):
self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype)
@@ -35,12 +35,11 @@ def time_isin(self, dtypes):
class IsInFloat64:
-
def setup(self):
self.small = Series([1, 2], dtype=np.float64)
- self.many_different_values = np.arange(10**6, dtype=np.float64)
- self.few_different_values = np.zeros(10**7, dtype=np.float64)
- self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64)
+ self.many_different_values = np.arange(10 ** 6, dtype=np.float64)
+ self.few_different_values = np.zeros(10 ** 7, dtype=np.float64)
+ self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64)
def time_isin_many_different(self):
# runtime is dominated by creation of the lookup-table
@@ -56,19 +55,18 @@ def time_isin_nan_values(self):
class IsInForObjects:
-
def setup(self):
- self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object)
- self.vals_nans = np.full(10**4, np.nan).astype(np.object)
+ self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(np.object)
+ self.vals_nans = np.full(10 ** 4, np.nan).astype(np.object)
self.s_short = Series(np.arange(2)).astype(np.object)
- self.s_long = Series(np.arange(10**5)).astype(np.object)
+ self.s_long = Series(np.arange(10 ** 5)).astype(np.object)
self.vals_short = np.arange(2).astype(np.object)
- self.vals_long = np.arange(10**5).astype(np.object)
+ self.vals_long = np.arange(10 ** 5).astype(np.object)
# because of nans floats are special:
- self.s_long_floats = Series(np.arange(10**5,
- dtype=np.float)).astype(np.object)
- self.vals_long_floats = np.arange(10**5,
- dtype=np.float).astype(np.object)
+ self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype(
+ np.object
+ )
+ self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(np.object)
def time_isin_nans(self):
# if nan-objects are different objects,
@@ -94,8 +92,8 @@ def time_isin_long_series_long_values_floats(self):
class NSort:
- params = ['first', 'last', 'all']
- param_names = ['keep']
+ params = ["first", "last", "all"]
+ param_names = ["keep"]
def setup(self, keep):
self.s = Series(np.random.randint(1, 10, 100000))
@@ -109,15 +107,17 @@ def time_nsmallest(self, keep):
class Dropna:
- params = ['int', 'datetime']
- param_names = ['dtype']
+ params = ["int", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- data = {'int': np.random.randint(1, 10, N),
- 'datetime': date_range('2000-01-01', freq='S', periods=N)}
+ N = 10 ** 6
+ data = {
+ "int": np.random.randint(1, 10, N),
+ "datetime": date_range("2000-01-01", freq="S", periods=N),
+ }
self.s = Series(data[dtype])
- if dtype == 'datetime':
+ if dtype == "datetime":
self.s[np.random.randint(1, N, 100)] = NaT
def time_dropna(self, dtype):
@@ -127,37 +127,47 @@ def time_dropna(self, dtype):
class SearchSorted:
goal_time = 0.2
- params = ['int8', 'int16', 'int32', 'int64',
- 'uint8', 'uint16', 'uint32', 'uint64',
- 'float16', 'float32', 'float64',
- 'str']
- param_names = ['dtype']
+ params = [
+ "int8",
+ "int16",
+ "int32",
+ "int64",
+ "uint8",
+ "uint16",
+ "uint32",
+ "uint64",
+ "float16",
+ "float32",
+ "float64",
+ "str",
+ ]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**5
+ N = 10 ** 5
data = np.array([1] * N + [2] * N + [3] * N).astype(dtype)
self.s = Series(data)
def time_searchsorted(self, dtype):
- key = '2' if dtype == 'str' else 2
+ key = "2" if dtype == "str" else 2
self.s.searchsorted(key)
class Map:
- params = (['dict', 'Series', 'lambda'], ['object', 'category', 'int'])
- param_names = 'mapper'
+ params = (["dict", "Series", "lambda"], ["object", "category", "int"])
+ param_names = "mapper"
def setup(self, mapper, dtype):
map_size = 1000
map_data = Series(map_size - np.arange(map_size), dtype=dtype)
# construct mapper
- if mapper == 'Series':
+ if mapper == "Series":
self.map_data = map_data
- elif mapper == 'dict':
+ elif mapper == "dict":
self.map_data = map_data.to_dict()
- elif mapper == 'lambda':
+ elif mapper == "lambda":
map_dict = map_data.to_dict()
self.map_data = lambda x: map_dict[x]
else:
@@ -170,8 +180,8 @@ def time_map(self, mapper, *args, **kwargs):
class Clip:
- params = [50, 1000, 10**5]
- param_names = ['n']
+ params = [50, 1000, 10 ** 5]
+ param_names = ["n"]
def setup(self, n):
self.s = Series(np.random.randn(n))
@@ -182,8 +192,8 @@ def time_clip(self, n):
class ValueCounts:
- params = ['int', 'uint', 'float', 'object']
- param_names = ['dtype']
+ params = ["int", "uint", "float", "object"]
+ param_names = ["dtype"]
def setup(self, dtype):
self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype)
@@ -193,7 +203,6 @@ def time_value_counts(self, dtype):
class Dir:
-
def setup(self):
self.s = Series(index=tm.makeStringIndex(10000))
@@ -204,47 +213,59 @@ def time_dir_strings(self):
class SeriesGetattr:
# https://github.com/pandas-dev/pandas/issues/19764
def setup(self):
- self.s = Series(1,
- index=date_range("2012-01-01", freq='s',
- periods=int(1e6)))
+ self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=int(1e6)))
def time_series_datetimeindex_repr(self):
- getattr(self.s, 'a', None)
+ getattr(self.s, "a", None)
-class All(object):
+class All:
- params = [[10**3, 10**6], ['fast', 'slow']]
- param_names = ['N', 'case']
+ params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
+ param_names = ["N", "case"]
def setup(self, N, case):
- val = case != 'fast'
+ val = case != "fast"
self.s = Series([val] * N)
def time_all(self, N, case):
self.s.all()
-class Any(object):
+class Any:
- params = [[10**3, 10**6], ['fast', 'slow']]
- param_names = ['N', 'case']
+ params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
+ param_names = ["N", "case"]
def setup(self, N, case):
- val = case == 'fast'
+ val = case == "fast"
self.s = Series([val] * N)
def time_any(self, N, case):
self.s.any()
-class NanOps(object):
-
- params = [['var', 'mean', 'median', 'max', 'min', 'sum', 'std', 'sem',
- 'argmax', 'skew', 'kurt', 'prod'],
- [10**3, 10**6],
- ['int8', 'int32', 'int64', 'float64']]
- param_names = ['func', 'N', 'dtype']
+class NanOps:
+
+ params = [
+ [
+ "var",
+ "mean",
+ "median",
+ "max",
+ "min",
+ "sum",
+ "std",
+ "sem",
+ "argmax",
+ "skew",
+ "kurt",
+ "prod",
+ ],
+ [10 ** 3, 10 ** 6],
+ ["int8", "int32", "int64", "float64"],
+ ]
+ param_names = ["func", "N", "dtype"]
def setup(self, func, N, dtype):
self.s = Series([1] * N, dtype=dtype)
diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
index ca4469e64c335..19d08c086a508 100644
--- a/asv_bench/benchmarks/sparse.py
+++ b/asv_bench/benchmarks/sparse.py
@@ -1,9 +1,8 @@
-import itertools
-
import numpy as np
import scipy.sparse
-from pandas import (SparseSeries, SparseDataFrame, SparseArray, Series,
- date_range, MultiIndex)
+
+import pandas as pd
+from pandas import MultiIndex, Series, SparseArray, date_range
def make_array(size, dense_proportion, fill_value, dtype):
@@ -15,30 +14,28 @@ def make_array(size, dense_proportion, fill_value, dtype):
class SparseSeriesToFrame:
-
def setup(self):
K = 50
N = 50001
- rng = date_range('1/1/2000', periods=N, freq='T')
+ rng = date_range("1/1/2000", periods=N, freq="T")
self.series = {}
for i in range(1, K):
data = np.random.randn(N)[:-i]
idx = rng[:-i]
data[100:] = np.nan
- self.series[i] = SparseSeries(data, index=idx)
+ self.series[i] = pd.Series(pd.SparseArray(data), index=idx)
def time_series_to_frame(self):
- SparseDataFrame(self.series)
+ pd.DataFrame(self.series)
class SparseArrayConstructor:
- params = ([0.1, 0.01], [0, np.nan],
- [np.int64, np.float64, np.object])
- param_names = ['dense_proportion', 'fill_value', 'dtype']
+ params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, np.object])
+ param_names = ["dense_proportion", "fill_value", "dtype"]
def setup(self, dense_proportion, fill_value, dtype):
- N = 10**6
+ N = 10 ** 6
self.array = make_array(N, dense_proportion, fill_value, dtype)
def time_sparse_array(self, dense_proportion, fill_value, dtype):
@@ -46,57 +43,45 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
class SparseDataFrameConstructor:
-
def setup(self):
N = 1000
self.arr = np.arange(N)
self.sparse = scipy.sparse.rand(N, N, 0.005)
- self.dict = dict(zip(range(N), itertools.repeat([0])))
-
- def time_constructor(self):
- SparseDataFrame(columns=self.arr, index=self.arr)
def time_from_scipy(self):
- SparseDataFrame(self.sparse)
-
- def time_from_dict(self):
- SparseDataFrame(self.dict)
+ pd.DataFrame.sparse.from_spmatrix(self.sparse)
class FromCoo:
-
def setup(self):
- self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0],
- ([1, 0, 0], [0, 2, 3])),
- shape=(100, 100))
+ self.matrix = scipy.sparse.coo_matrix(
+ ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)
+ )
def time_sparse_series_from_coo(self):
- SparseSeries.from_coo(self.matrix)
+ pd.Series.sparse.from_coo(self.matrix)
class ToCoo:
-
def setup(self):
s = Series([np.nan] * 10000)
s[0] = 3.0
s[100] = -1.0
s[999] = 12.1
s.index = MultiIndex.from_product([range(10)] * 4)
- self.ss = s.to_sparse()
+ self.ss = s.astype("Sparse")
def time_sparse_series_to_coo(self):
- self.ss.to_coo(row_levels=[0, 1],
- column_levels=[2, 3],
- sort_labels=True)
+ self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)
class Arithmetic:
params = ([0.1, 0.01], [0, np.nan])
- param_names = ['dense_proportion', 'fill_value']
+ param_names = ["dense_proportion", "fill_value"]
def setup(self, dense_proportion, fill_value):
- N = 10**6
+ N = 10 ** 6
arr1 = make_array(N, dense_proportion, fill_value, np.int64)
self.array1 = SparseArray(arr1, fill_value=fill_value)
arr2 = make_array(N, dense_proportion, fill_value, np.int64)
@@ -118,22 +103,24 @@ def time_divide(self, dense_proportion, fill_value):
class ArithmeticBlock:
params = [np.nan, 0]
- param_names = ['fill_value']
+ param_names = ["fill_value"]
def setup(self, fill_value):
- N = 10**6
- self.arr1 = self.make_block_array(length=N, num_blocks=1000,
- block_size=10, fill_value=fill_value)
- self.arr2 = self.make_block_array(length=N, num_blocks=1000,
- block_size=10, fill_value=fill_value)
+ N = 10 ** 6
+ self.arr1 = self.make_block_array(
+ length=N, num_blocks=1000, block_size=10, fill_value=fill_value
+ )
+ self.arr2 = self.make_block_array(
+ length=N, num_blocks=1000, block_size=10, fill_value=fill_value
+ )
def make_block_array(self, length, num_blocks, block_size, fill_value):
arr = np.full(length, fill_value)
- indicies = np.random.choice(np.arange(0, length, block_size),
- num_blocks,
- replace=False)
+ indicies = np.random.choice(
+ np.arange(0, length, block_size), num_blocks, replace=False
+ )
for ind in indicies:
- arr[ind:ind + block_size] = np.random.randint(0, 100, block_size)
+ arr[ind : ind + block_size] = np.random.randint(0, 100, block_size)
return SparseArray(arr, fill_value=fill_value)
def time_make_union(self, fill_value):
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 3514335f92e77..620a6de0f5f34 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -2,14 +2,13 @@
import pandas as pd
-ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem',
- 'var']
+ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"]
class FrameOps:
- params = [ops, ['float', 'int'], [0, 1], [True, False]]
- param_names = ['op', 'dtype', 'axis', 'use_bottleneck']
+ params = [ops, ["float", "int"], [0, 1], [True, False]]
+ param_names = ["op", "dtype", "axis", "use_bottleneck"]
def setup(self, op, dtype, axis, use_bottleneck):
df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
@@ -17,6 +16,7 @@ def setup(self, op, dtype, axis, use_bottleneck):
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.df_func = getattr(df, op)
@@ -27,13 +27,15 @@ def time_op(self, op, dtype, axis, use_bottleneck):
class FrameMultiIndexOps:
params = ([0, 1, [0, 1]], ops)
- param_names = ['level', 'op']
+ param_names = ["level", "op"]
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
- codes = [np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)]
+ codes = [
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ]
index = pd.MultiIndex(levels=levels, codes=codes)
df = pd.DataFrame(np.random.randn(len(index), 4), index=index)
self.df_func = getattr(df, op)
@@ -44,8 +46,8 @@ def time_op(self, level, op):
class SeriesOps:
- params = [ops, ['float', 'int'], [True, False]]
- param_names = ['op', 'dtype', 'use_bottleneck']
+ params = [ops, ["float", "int"], [True, False]]
+ param_names = ["op", "dtype", "use_bottleneck"]
def setup(self, op, dtype, use_bottleneck):
s = pd.Series(np.random.randn(100000)).astype(dtype)
@@ -53,6 +55,7 @@ def setup(self, op, dtype, use_bottleneck):
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.s_func = getattr(s, op)
@@ -63,13 +66,15 @@ def time_op(self, op, dtype, use_bottleneck):
class SeriesMultiIndexOps:
params = ([0, 1, [0, 1]], ops)
- param_names = ['level', 'op']
+ param_names = ["level", "op"]
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
- codes = [np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)]
+ codes = [
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ]
index = pd.MultiIndex(levels=levels, codes=codes)
s = pd.Series(np.random.randn(len(index)), index=index)
self.s_func = getattr(s, op)
@@ -80,11 +85,11 @@ def time_op(self, level, op):
class Rank:
- params = [['DataFrame', 'Series'], [True, False]]
- param_names = ['constructor', 'pct']
+ params = [["DataFrame", "Series"], [True, False]]
+ param_names = ["constructor", "pct"]
def setup(self, constructor, pct):
- values = np.random.randn(10**5)
+ values = np.random.randn(10 ** 5)
self.data = getattr(pd, constructor)(values)
def time_rank(self, constructor, pct):
@@ -96,14 +101,15 @@ def time_average_old(self, constructor, pct):
class Correlation:
- params = [['spearman', 'kendall', 'pearson'], [True, False]]
- param_names = ['method', 'use_bottleneck']
+ params = [["spearman", "kendall", "pearson"], [True, False]]
+ param_names = ["method", "use_bottleneck"]
def setup(self, method, use_bottleneck):
try:
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.df = pd.DataFrame(np.random.randn(1000, 30))
self.df2 = pd.DataFrame(np.random.randn(1000, 30))
@@ -126,13 +132,14 @@ def time_corrwith_rows(self, method, use_bottleneck):
class Covariance:
params = [[True, False]]
- param_names = ['use_bottleneck']
+ param_names = ["use_bottleneck"]
def setup(self, use_bottleneck):
try:
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.s = pd.Series(np.random.randn(100000))
self.s2 = pd.Series(np.random.randn(100000))
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 5dbcc71b7455e..6be2fa92d9eac 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -6,31 +6,30 @@
class Methods:
-
def setup(self):
- self.s = Series(tm.makeStringIndex(10**5))
+ self.s = Series(tm.makeStringIndex(10 ** 5))
def time_center(self):
self.s.str.center(100)
def time_count(self):
- self.s.str.count('A')
+ self.s.str.count("A")
def time_endswith(self):
- self.s.str.endswith('A')
+ self.s.str.endswith("A")
def time_extract(self):
with warnings.catch_warnings(record=True):
- self.s.str.extract('(\\w*)A(\\w*)')
+ self.s.str.extract("(\\w*)A(\\w*)")
def time_findall(self):
- self.s.str.findall('[A-Z]+')
+ self.s.str.findall("[A-Z]+")
def time_find(self):
- self.s.str.find('[A-Z]+')
+ self.s.str.find("[A-Z]+")
def time_rfind(self):
- self.s.str.rfind('[A-Z]+')
+ self.s.str.rfind("[A-Z]+")
def time_get(self):
self.s.str.get(0)
@@ -39,43 +38,43 @@ def time_len(self):
self.s.str.len()
def time_join(self):
- self.s.str.join(' ')
+ self.s.str.join(" ")
def time_match(self):
- self.s.str.match('A')
+ self.s.str.match("A")
def time_normalize(self):
- self.s.str.normalize('NFC')
+ self.s.str.normalize("NFC")
def time_pad(self):
- self.s.str.pad(100, side='both')
+ self.s.str.pad(100, side="both")
def time_partition(self):
- self.s.str.partition('A')
+ self.s.str.partition("A")
def time_rpartition(self):
- self.s.str.rpartition('A')
+ self.s.str.rpartition("A")
def time_replace(self):
- self.s.str.replace('A', '\x01\x01')
+ self.s.str.replace("A", "\x01\x01")
def time_translate(self):
- self.s.str.translate({'A': '\x01\x01'})
+ self.s.str.translate({"A": "\x01\x01"})
def time_slice(self):
self.s.str.slice(5, 15, 2)
def time_startswith(self):
- self.s.str.startswith('A')
+ self.s.str.startswith("A")
def time_strip(self):
- self.s.str.strip('A')
+ self.s.str.strip("A")
def time_rstrip(self):
- self.s.str.rstrip('A')
+ self.s.str.rstrip("A")
def time_lstrip(self):
- self.s.str.lstrip('A')
+ self.s.str.lstrip("A")
def time_title(self):
self.s.str.title()
@@ -95,13 +94,13 @@ def time_zfill(self):
class Repeat:
- params = ['int', 'array']
- param_names = ['repeats']
+ params = ["int", "array"]
+ param_names = ["repeats"]
def setup(self, repeats):
- N = 10**5
+ N = 10 ** 5
self.s = Series(tm.makeStringIndex(N))
- repeat = {'int': 1, 'array': np.random.randint(1, 3, N)}
+ repeat = {"int": 1, "array": np.random.randint(1, 3, N)}
self.values = repeat[repeats]
def time_repeat(self, repeats):
@@ -110,20 +109,20 @@ def time_repeat(self, repeats):
class Cat:
- params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15])
- param_names = ['other_cols', 'sep', 'na_rep', 'na_frac']
+ params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15])
+ param_names = ["other_cols", "sep", "na_rep", "na_frac"]
def setup(self, other_cols, sep, na_rep, na_frac):
N = 10 ** 5
- mask_gen = lambda: np.random.choice([True, False], N,
- p=[1 - na_frac, na_frac])
+ mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac])
self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
if other_cols == 0:
# str.cat self-concatenates only for others=None
self.others = None
else:
- self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
- for i in range(other_cols)})
+ self.others = DataFrame(
+ {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)}
+ )
def time_cat(self, other_cols, sep, na_rep, na_frac):
# before the concatenation (one caller + other_cols columns), the total
@@ -136,52 +135,49 @@ def time_cat(self, other_cols, sep, na_rep, na_frac):
class Contains:
params = [True, False]
- param_names = ['regex']
+ param_names = ["regex"]
def setup(self, regex):
- self.s = Series(tm.makeStringIndex(10**5))
+ self.s = Series(tm.makeStringIndex(10 ** 5))
def time_contains(self, regex):
- self.s.str.contains('A', regex=regex)
+ self.s.str.contains("A", regex=regex)
class Split:
params = [True, False]
- param_names = ['expand']
+ param_names = ["expand"]
def setup(self, expand):
- self.s = Series(tm.makeStringIndex(10**5)).str.join('--')
+ self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--")
def time_split(self, expand):
- self.s.str.split('--', expand=expand)
+ self.s.str.split("--", expand=expand)
def time_rsplit(self, expand):
- self.s.str.rsplit('--', expand=expand)
+ self.s.str.rsplit("--", expand=expand)
class Dummies:
-
def setup(self):
- self.s = Series(tm.makeStringIndex(10**5)).str.join('|')
+ self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("|")
def time_get_dummies(self):
- self.s.str.get_dummies('|')
+ self.s.str.get_dummies("|")
class Encode:
-
def setup(self):
self.ser = Series(tm.makeUnicodeIndex())
def time_encode_decode(self):
- self.ser.str.encode('utf-8').str.decode('utf-8')
+ self.ser.str.encode("utf-8").str.decode("utf-8")
class Slice:
-
def setup(self):
- self.s = Series(['abcdefg', np.nan] * 500000)
+ self.s = Series(["abcdefg", np.nan] * 500000)
def time_vector_slice(self):
# GH 2602
diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py
index c4fe462944a2a..36a9db529f98f 100644
--- a/asv_bench/benchmarks/timedelta.py
+++ b/asv_bench/benchmarks/timedelta.py
@@ -3,49 +3,60 @@
import numpy as np
from pandas import (
- DataFrame, Series, Timedelta, Timestamp, timedelta_range, to_timedelta)
+ DataFrame,
+ Series,
+ Timedelta,
+ Timestamp,
+ timedelta_range,
+ to_timedelta,
+)
class TimedeltaConstructor:
-
def time_from_int(self):
Timedelta(123456789)
def time_from_unit(self):
- Timedelta(1, unit='d')
+ Timedelta(1, unit="d")
def time_from_components(self):
- Timedelta(days=1, hours=2, minutes=3, seconds=4, milliseconds=5,
- microseconds=6, nanoseconds=7)
+ Timedelta(
+ days=1,
+ hours=2,
+ minutes=3,
+ seconds=4,
+ milliseconds=5,
+ microseconds=6,
+ nanoseconds=7,
+ )
def time_from_datetime_timedelta(self):
Timedelta(datetime.timedelta(days=1, seconds=1))
def time_from_np_timedelta(self):
- Timedelta(np.timedelta64(1, 'ms'))
+ Timedelta(np.timedelta64(1, "ms"))
def time_from_string(self):
- Timedelta('1 days')
+ Timedelta("1 days")
def time_from_iso_format(self):
- Timedelta('P4DT12H30M5S')
+ Timedelta("P4DT12H30M5S")
def time_from_missing(self):
- Timedelta('nat')
+ Timedelta("nat")
class ToTimedelta:
-
def setup(self):
self.ints = np.random.randint(0, 60, size=10000)
self.str_days = []
self.str_seconds = []
for i in self.ints:
- self.str_days.append('{0} days'.format(i))
- self.str_seconds.append('00:00:{0:02d}'.format(i))
+ self.str_days.append("{0} days".format(i))
+ self.str_seconds.append("00:00:{0:02d}".format(i))
def time_convert_int(self):
- to_timedelta(self.ints, unit='s')
+ to_timedelta(self.ints, unit="s")
def time_convert_string_days(self):
to_timedelta(self.str_days)
@@ -56,30 +67,28 @@ def time_convert_string_seconds(self):
class ToTimedeltaErrors:
- params = ['coerce', 'ignore']
- param_names = ['errors']
+ params = ["coerce", "ignore"]
+ param_names = ["errors"]
def setup(self, errors):
ints = np.random.randint(0, 60, size=10000)
- self.arr = ['{0} days'.format(i) for i in ints]
- self.arr[-1] = 'apple'
+ self.arr = ["{0} days".format(i) for i in ints]
+ self.arr[-1] = "apple"
def time_convert(self, errors):
to_timedelta(self.arr, errors=errors)
class TimedeltaOps:
-
def setup(self):
self.td = to_timedelta(np.arange(1000000))
- self.ts = Timestamp('2000')
+ self.ts = Timestamp("2000")
def time_add_td_ts(self):
self.td + self.ts
class TimedeltaProperties:
-
def setup_cache(self):
td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35)
return td
@@ -98,10 +107,9 @@ def time_timedelta_nanoseconds(self, td):
class DatetimeAccessor:
-
def setup_cache(self):
N = 100000
- series = Series(timedelta_range('1 days', periods=N, freq='h'))
+ series = Series(timedelta_range("1 days", periods=N, freq="h"))
return series
def time_dt_accessor(self, series):
@@ -121,10 +129,9 @@ def time_timedelta_nanoseconds(self, series):
class TimedeltaIndexing:
-
def setup(self):
- self.index = timedelta_range(start='1985', periods=1000, freq='D')
- self.index2 = timedelta_range(start='1986', periods=1000, freq='D')
+ self.index = timedelta_range(start="1985", periods=1000, freq="D")
+ self.index2 = timedelta_range(start="1986", periods=1000, freq="D")
self.series = Series(range(1000), index=self.index)
self.timedelta = self.index[500]
@@ -141,7 +148,7 @@ def time_series_loc(self):
self.series.loc[self.timedelta]
def time_align(self):
- DataFrame({'a': self.series, 'b': self.series[:500]})
+ DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index.intersection(self.index2)
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index eea1df35c7711..1020b773f8acb 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -4,35 +4,31 @@
import numpy as np
from pandas import to_datetime, date_range, Series, DataFrame, period_range
from pandas.tseries.frequencies import infer_freq
+
try:
- from pandas.plotting._converter import DatetimeConverter
+ from pandas.plotting._matplotlib.converter import DatetimeConverter
except ImportError:
from pandas.tseries.converter import DatetimeConverter
class DatetimeIndex:
- params = ['dst', 'repeated', 'tz_aware', 'tz_local', 'tz_naive']
- param_names = ['index_type']
+ params = ["dst", "repeated", "tz_aware", "tz_local", "tz_naive"]
+ param_names = ["index_type"]
def setup(self, index_type):
N = 100000
- dtidxes = {'dst': date_range(start='10/29/2000 1:00:00',
- end='10/29/2000 1:59:59', freq='S'),
- 'repeated': date_range(start='2000',
- periods=N / 10,
- freq='s').repeat(10),
- 'tz_aware': date_range(start='2000',
- periods=N,
- freq='s',
- tz='US/Eastern'),
- 'tz_local': date_range(start='2000',
- periods=N,
- freq='s',
- tz=dateutil.tz.tzlocal()),
- 'tz_naive': date_range(start='2000',
- periods=N,
- freq='s')}
+ dtidxes = {
+ "dst": date_range(
+ start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S"
+ ),
+ "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10),
+ "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"),
+ "tz_local": date_range(
+ start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal()
+ ),
+ "tz_naive": date_range(start="2000", periods=N, freq="s"),
+ }
self.index = dtidxes[index_type]
def time_add_timedelta(self, index_type):
@@ -62,31 +58,31 @@ def time_to_pydatetime(self, index_type):
class TzLocalize:
- params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
- param_names = 'tz'
+ params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()]
+ param_names = "tz"
def setup(self, tz):
- dst_rng = date_range(start='10/29/2000 1:00:00',
- end='10/29/2000 1:59:59', freq='S')
- self.index = date_range(start='10/29/2000',
- end='10/29/2000 00:59:59', freq='S')
+ dst_rng = date_range(
+ start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S"
+ )
+ self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S")
self.index = self.index.append(dst_rng)
self.index = self.index.append(dst_rng)
- self.index = self.index.append(date_range(start='10/29/2000 2:00:00',
- end='10/29/2000 3:00:00',
- freq='S'))
+ self.index = self.index.append(
+ date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S")
+ )
def time_infer_dst(self, tz):
- self.index.tz_localize(tz, ambiguous='infer')
+ self.index.tz_localize(tz, ambiguous="infer")
class ResetIndex:
- params = [None, 'US/Eastern']
- param_names = 'tz'
+ params = [None, "US/Eastern"]
+ param_names = "tz"
def setup(self, tz):
- idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz)
+ idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz)
self.df = DataFrame(np.random.randn(1000, 2), index=idx)
def time_reest_datetimeindex(self, tz):
@@ -95,12 +91,12 @@ def time_reest_datetimeindex(self, tz):
class Factorize:
- params = [None, 'Asia/Tokyo']
- param_names = 'tz'
+ params = [None, "Asia/Tokyo"]
+ param_names = "tz"
def setup(self, tz):
N = 100000
- self.dti = date_range('2011-01-01', freq='H', periods=N, tz=tz)
+ self.dti = date_range("2011-01-01", freq="H", periods=N, tz=tz)
self.dti = self.dti.repeat(5)
def time_factorize(self, tz):
@@ -109,25 +105,24 @@ def time_factorize(self, tz):
class InferFreq:
- params = [None, 'D', 'B']
- param_names = ['freq']
+ params = [None, "D", "B"]
+ param_names = ["freq"]
def setup(self, freq):
if freq is None:
- self.idx = date_range(start='1/1/1700', freq='D', periods=10000)
+ self.idx = date_range(start="1/1/1700", freq="D", periods=10000)
self.idx.freq = None
else:
- self.idx = date_range(start='1/1/1700', freq=freq, periods=10000)
+ self.idx = date_range(start="1/1/1700", freq=freq, periods=10000)
def time_infer_freq(self, freq):
infer_freq(self.idx)
class TimeDatetimeConverter:
-
def setup(self):
N = 100000
- self.rng = date_range(start='1/1/2000', periods=N, freq='T')
+ self.rng = date_range(start="1/1/2000", periods=N, freq="T")
def time_convert(self):
DatetimeConverter.convert(self.rng, None, None)
@@ -136,11 +131,11 @@ def time_convert(self):
class Iteration:
params = [date_range, period_range]
- param_names = ['time_index']
+ param_names = ["time_index"]
def setup(self, time_index):
- N = 10**6
- self.idx = time_index(start='20140101', freq='T', periods=N)
+ N = 10 ** 6
+ self.idx = time_index(start="20140101", freq="T", periods=N)
self.exit = 10000
def time_iter(self, time_index):
@@ -155,13 +150,13 @@ def time_iter_preexit(self, time_index):
class ResampleDataFrame:
- params = ['max', 'mean', 'min']
- param_names = ['method']
+ params = ["max", "mean", "min"]
+ param_names = ["method"]
def setup(self, method):
- rng = date_range(start='20130101', periods=100000, freq='50L')
+ rng = date_range(start="20130101", periods=100000, freq="50L")
df = DataFrame(np.random.randn(100000, 2), index=rng)
- self.resample = getattr(df.resample('1s'), method)
+ self.resample = getattr(df.resample("1s"), method)
def time_method(self, method):
self.resample()
@@ -169,16 +164,14 @@ def time_method(self, method):
class ResampleSeries:
- params = (['period', 'datetime'], ['5min', '1D'], ['mean', 'ohlc'])
- param_names = ['index', 'freq', 'method']
+ params = (["period", "datetime"], ["5min", "1D"], ["mean", "ohlc"])
+ param_names = ["index", "freq", "method"]
def setup(self, index, freq, method):
- indexes = {'period': period_range(start='1/1/2000',
- end='1/1/2001',
- freq='T'),
- 'datetime': date_range(start='1/1/2000',
- end='1/1/2001',
- freq='T')}
+ indexes = {
+ "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"),
+ "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"),
+ }
idx = indexes[index]
ts = Series(np.random.randn(len(idx)), index=idx)
self.resample = getattr(ts.resample(freq), method)
@@ -190,32 +183,35 @@ def time_resample(self, index, freq, method):
class ResampleDatetetime64:
# GH 7754
def setup(self):
- rng3 = date_range(start='2000-01-01 00:00:00',
- end='2000-01-01 10:00:00', freq='555000U')
- self.dt_ts = Series(5, rng3, dtype='datetime64[ns]')
+ rng3 = date_range(
+ start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U"
+ )
+ self.dt_ts = Series(5, rng3, dtype="datetime64[ns]")
def time_resample(self):
- self.dt_ts.resample('1S').last()
+ self.dt_ts.resample("1S").last()
class AsOf:
- params = ['DataFrame', 'Series']
- param_names = ['constructor']
+ params = ["DataFrame", "Series"]
+ param_names = ["constructor"]
def setup(self, constructor):
N = 10000
M = 10
- rng = date_range(start='1/1/1990', periods=N, freq='53s')
- data = {'DataFrame': DataFrame(np.random.randn(N, M)),
- 'Series': Series(np.random.randn(N))}
+ rng = date_range(start="1/1/1990", periods=N, freq="53s")
+ data = {
+ "DataFrame": DataFrame(np.random.randn(N, M)),
+ "Series": Series(np.random.randn(N)),
+ }
self.ts = data[constructor]
self.ts.index = rng
self.ts2 = self.ts.copy()
self.ts2.iloc[250:5000] = np.nan
self.ts3 = self.ts.copy()
self.ts3.iloc[-5000:] = np.nan
- self.dates = date_range(start='1/1/1990', periods=N * 10, freq='5s')
+ self.dates = date_range(start="1/1/1990", periods=N * 10, freq="5s")
self.date = self.dates[0]
self.date_last = self.dates[-1]
self.date_early = self.date - timedelta(10)
@@ -248,11 +244,11 @@ def time_asof_nan_single(self, constructor):
class SortIndex:
params = [True, False]
- param_names = ['monotonic']
+ param_names = ["monotonic"]
def setup(self, monotonic):
- N = 10**5
- idx = date_range(start='1/1/2000', periods=N, freq='s')
+ N = 10 ** 5
+ idx = date_range(start="1/1/2000", periods=N, freq="s")
self.s = Series(np.random.randn(N), index=idx)
if not monotonic:
self.s = self.s.sample(frac=1)
@@ -265,10 +261,9 @@ def time_get_slice(self, monotonic):
class IrregularOps:
-
def setup(self):
- N = 10**5
- idx = date_range(start='1/1/2000', periods=N, freq='s')
+ N = 10 ** 5
+ idx = date_range(start="1/1/2000", periods=N, freq="s")
s = Series(np.random.randn(N), index=idx)
self.left = s.sample(frac=1)
self.right = s.sample(frac=1)
@@ -278,10 +273,9 @@ def time_add(self):
class Lookup:
-
def setup(self):
N = 1500000
- rng = date_range(start='1/1/2000', periods=N, freq='S')
+ rng = date_range(start="1/1/2000", periods=N, freq="S")
self.ts = Series(1, index=rng)
self.lookup_val = rng[N // 2]
@@ -291,23 +285,35 @@ def time_lookup_and_cleanup(self):
class ToDatetimeYYYYMMDD:
-
def setup(self):
- rng = date_range(start='1/1/2000', periods=10000, freq='D')
- self.stringsD = Series(rng.strftime('%Y%m%d'))
+ rng = date_range(start="1/1/2000", periods=10000, freq="D")
+ self.stringsD = Series(rng.strftime("%Y%m%d"))
def time_format_YYYYMMDD(self):
- to_datetime(self.stringsD, format='%Y%m%d')
+ to_datetime(self.stringsD, format="%Y%m%d")
-class ToDatetimeISO8601:
+class ToDatetimeCacheSmallCount:
+ params = ([True, False], [50, 500, 5000, 100000])
+ param_names = ["cache", "count"]
+
+ def setup(self, cache, count):
+ rng = date_range(start="1/1/1971", periods=count)
+ self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist()
+
+ def time_unique_date_strings(self, cache, count):
+ to_datetime(self.unique_date_strings, cache=cache)
+
+
+class ToDatetimeISO8601:
def setup(self):
- rng = date_range(start='1/1/2000', periods=20000, freq='H')
- self.strings = rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
- self.strings_nosep = rng.strftime('%Y%m%d %H:%M:%S').tolist()
- self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
- for x in rng]
+ rng = date_range(start="1/1/2000", periods=20000, freq="H")
+ self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
+ self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist()
+ self.strings_tz_space = [
+ x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng
+ ]
def time_iso8601(self):
to_datetime(self.strings)
@@ -316,22 +322,21 @@ def time_iso8601_nosep(self):
to_datetime(self.strings_nosep)
def time_iso8601_format(self):
- to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S')
+ to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S")
def time_iso8601_format_no_sep(self):
- to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S')
+ to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S")
def time_iso8601_tz_spaceformat(self):
to_datetime(self.strings_tz_space)
class ToDatetimeNONISO8601:
-
def setup(self):
N = 10000
half = int(N / 2)
- ts_string_1 = 'March 1, 2018 12:00:00+0400'
- ts_string_2 = 'March 1, 2018 12:00:00+0500'
+ ts_string_1 = "March 1, 2018 12:00:00+0400"
+ ts_string_2 = "March 1, 2018 12:00:00+0500"
self.same_offset = [ts_string_1] * N
self.diff_offset = [ts_string_1] * half + [ts_string_2] * half
@@ -343,50 +348,48 @@ def time_different_offset(self):
class ToDatetimeFormatQuarters:
-
def setup(self):
- self.s = Series(['2Q2005', '2Q05', '2005Q1', '05Q1'] * 10000)
+ self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000)
def time_infer_quarter(self):
to_datetime(self.s)
class ToDatetimeFormat:
-
def setup(self):
- self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000)
- self.s2 = self.s.str.replace(':\\S+$', '')
+ self.s = Series(["19MAY11", "19MAY11:00:00:00"] * 100000)
+ self.s2 = self.s.str.replace(":\\S+$", "")
def time_exact(self):
- to_datetime(self.s2, format='%d%b%y')
+ to_datetime(self.s2, format="%d%b%y")
def time_no_exact(self):
- to_datetime(self.s, format='%d%b%y', exact=False)
+ to_datetime(self.s, format="%d%b%y", exact=False)
class ToDatetimeCache:
params = [True, False]
- param_names = ['cache']
+ param_names = ["cache"]
def setup(self, cache):
N = 10000
self.unique_numeric_seconds = list(range(N))
self.dup_numeric_seconds = [1000] * N
- self.dup_string_dates = ['2000-02-11'] * N
- self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N
+ self.dup_string_dates = ["2000-02-11"] * N
+ self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N
def time_unique_seconds_and_unit(self, cache):
- to_datetime(self.unique_numeric_seconds, unit='s', cache=cache)
+ to_datetime(self.unique_numeric_seconds, unit="s", cache=cache)
def time_dup_seconds_and_unit(self, cache):
- to_datetime(self.dup_numeric_seconds, unit='s', cache=cache)
+ to_datetime(self.dup_numeric_seconds, unit="s", cache=cache)
def time_dup_string_dates(self, cache):
to_datetime(self.dup_string_dates, cache=cache)
def time_dup_string_dates_and_format(self, cache):
- to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=cache)
+ to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache)
def time_dup_string_tzoffset_dates(self, cache):
to_datetime(self.dup_string_with_tz, cache=cache)
@@ -394,14 +397,12 @@ def time_dup_string_tzoffset_dates(self, cache):
class DatetimeAccessor:
- params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
- param_names = 'tz'
+ params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()]
+ param_names = "tz"
def setup(self, tz):
N = 100000
- self.series = Series(
- date_range(start='1/1/2000', periods=N, freq='T', tz=tz)
- )
+ self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz))
def time_dt_accessor(self, tz):
self.series.dt
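The new ToDatetimeCacheSmallCount benchmark times `to_datetime` over small batches of unique date strings with the cache enabled and disabled. A minimal standalone version of what it exercises, using the smallest `count` parametrization (50):

    import pandas as pd

    # 50 unique date strings, as in the benchmark's setup
    strings = pd.date_range(start="1/1/1971", periods=50).strftime("%Y-%m-%d").tolist()

    pd.to_datetime(strings, cache=True)   # pays the cache-building overhead up front
    pd.to_datetime(strings, cache=False)  # parses each string directly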
diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py
index c6e56804c7b21..8ebb2d8d2f35d 100644
--- a/asv_bench/benchmarks/timestamp.py
+++ b/asv_bench/benchmarks/timestamp.py
@@ -7,21 +7,20 @@
class TimestampConstruction:
-
def time_parse_iso8601_no_tz(self):
- Timestamp('2017-08-25 08:16:14')
+ Timestamp("2017-08-25 08:16:14")
def time_parse_iso8601_tz(self):
- Timestamp('2017-08-25 08:16:14-0500')
+ Timestamp("2017-08-25 08:16:14-0500")
def time_parse_dateutil(self):
- Timestamp('2017/08/25 08:16:14 AM')
+ Timestamp("2017/08/25 08:16:14 AM")
def time_parse_today(self):
- Timestamp('today')
+ Timestamp("today")
def time_parse_now(self):
- Timestamp('now')
+ Timestamp("now")
def time_fromordinal(self):
Timestamp.fromordinal(730120)
@@ -31,14 +30,13 @@ def time_fromtimestamp(self):
class TimestampProperties:
- _tzs = [None, pytz.timezone('Europe/Amsterdam'), pytz.UTC,
- dateutil.tz.tzutc()]
- _freqs = [None, 'B']
+ _tzs = [None, pytz.timezone("Europe/Amsterdam"), pytz.UTC, dateutil.tz.tzutc()]
+ _freqs = [None, "B"]
params = [_tzs, _freqs]
- param_names = ['tz', 'freq']
+ param_names = ["tz", "freq"]
def setup(self, tz, freq):
- self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq)
+ self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz, freq=freq)
def time_tz(self, tz, freq):
self.ts.tz
@@ -93,15 +91,14 @@ def time_month_name(self, tz, freq):
class TimestampOps:
- params = [None, 'US/Eastern', pytz.UTC,
- dateutil.tz.tzutc()]
- param_names = ['tz']
+ params = [None, "US/Eastern", pytz.UTC, dateutil.tz.tzutc()]
+ param_names = ["tz"]
def setup(self, tz):
- self.ts = Timestamp('2017-08-25 08:16:14', tz=tz)
+ self.ts = Timestamp("2017-08-25 08:16:14", tz=tz)
def time_replace_tz(self, tz):
- self.ts.replace(tzinfo=pytz.timezone('US/Eastern'))
+ self.ts.replace(tzinfo=pytz.timezone("US/Eastern"))
def time_replace_None(self, tz):
self.ts.replace(tzinfo=None)
@@ -124,16 +121,16 @@ def time_to_julian_date(self, tz):
self.ts.to_julian_date()
def time_floor(self, tz):
- self.ts.floor('5T')
+ self.ts.floor("5T")
def time_ceil(self, tz):
- self.ts.ceil('5T')
+ self.ts.ceil("5T")
class TimestampAcrossDst:
def setup(self):
dt = datetime.datetime(2016, 3, 27, 1)
- self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo
+ self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo
self.ts2 = Timestamp(dt)
def time_replace_across_dst(self):
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 85325c52e7e6d..263a87176a9c9 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -5,6 +5,7 @@ jobs:
parameters:
name: macOS
vmImage: xcode9-macos10.13
+
- template: ci/azure/posix.yml
parameters:
name: Linux
@@ -21,23 +22,17 @@ jobs:
timeoutInMinutes: 90
steps:
- script: |
- # XXX next command should avoid redefining the path in every step, but
- # made the process crash as it couldn't find deactivate
- #echo '##vso[task.prependpath]$HOME/miniconda3/bin'
- echo '##vso[task.setvariable variable=CONDA_ENV]pandas-dev'
+ echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
echo '##vso[task.setvariable variable=ENV_FILE]environment.yml'
echo '##vso[task.setvariable variable=AZURE]true'
displayName: 'Setting environment variables'
# Do not require a conda environment
- - script: |
- export PATH=$HOME/miniconda3/bin:$PATH
- ci/code_checks.sh patterns
+ - script: ci/code_checks.sh patterns
displayName: 'Looking for unwanted patterns'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
sudo apt-get install -y libc6-dev-i386
ci/setup_env.sh
displayName: 'Setup environment and build pandas'
@@ -45,14 +40,12 @@ jobs:
# Do not require pandas
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh lint
displayName: 'Linting'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh dependencies
displayName: 'Dependencies consistency'
@@ -60,42 +53,36 @@ jobs:
# Require pandas
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh code
displayName: 'Checks on imported code'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh doctests
displayName: 'Running doctests'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh docstrings
displayName: 'Docstring validation'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh typing
displayName: 'Typing validation'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
pytest --capture=no --strict scripts
- displayName: 'Testing docstring validaton script'
+ displayName: 'Testing docstring validation script'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
cd asv_bench
asv check -E existing
@@ -116,3 +103,67 @@ jobs:
fi
displayName: 'Running benchmarks'
condition: true
+
+- job: 'Docs'
+ pool:
+ vmImage: ubuntu-16.04
+ timeoutInMinutes: 90
+ steps:
+ - script: |
+ echo '##vso[task.setvariable variable=ENV_FILE]environment.yml'
+ echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
+ displayName: 'Setting environment variables'
+
+ - script: |
+ sudo apt-get install -y libc6-dev-i386
+ ci/setup_env.sh
+ displayName: 'Setup environment and build pandas'
+
+ - script: |
+ source activate pandas-dev
+      # Ideally this step would just run `doc/make.py --warnings-are-errors`; everything else is needed because the ipython directive doesn't fail the build on errors (https://github.com/ipython/ipython/issues/11547)
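+      # ${PIPESTATUS[0]} keeps the exit status of doc/make.py, which the pipe into tee would otherwise mask;
+      # the grep sets IPY_RET to 1 when the ipython-directive error marker is found in sphinx.log.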
+ doc/make.py --warnings-are-errors | tee sphinx.log ; SPHINX_RET=${PIPESTATUS[0]}
+ grep -B1 "^<<<-------------------------------------------------------------------------$" sphinx.log ; IPY_RET=$(( $? != 1 ))
+ exit $(( $SPHINX_RET + $IPY_RET ))
+ displayName: 'Build documentation'
+
+ - script: |
+ cd doc/build/html
+ git init
+ touch .nojekyll
+ echo "dev.pandas.io" > CNAME
+ printf "User-agent: *\nDisallow: /" > robots.txt
+ git add --all .
+ git config user.email "pandas-dev@python.org"
+ git config user.name "pandas-docs-bot"
+ git commit -m "pandas documentation in master"
+ displayName: 'Create git repo for docs build'
+ condition : |
+ and(not(eq(variables['Build.Reason'], 'PullRequest')),
+ eq(variables['Build.SourceBranch'], 'refs/heads/master'))
+
+ # For `InstallSSHKey@0` to work, next steps are required:
+ # 1. Generate a pair of private/public keys (i.e. `ssh-keygen -t rsa -b 4096 -C "your_email@example.com"`)
+ # 2. Go to "Library > Secure files" in the Azure Pipelines dashboard: https://dev.azure.com/pandas-dev/pandas/_library?itemType=SecureFiles
+ # 3. Click on "+ Secure file"
+ # 4. Upload the private key (the name of the file must match with the specified in "sshKeySecureFile" input below, "pandas_docs_key")
+ # 5. Click on file name after it is created, tick the box "Authorize for use in all pipelines" and save
+ # 6. The public key specified in "sshPublicKey" is the pair of the uploaded private key, and needs to be set as a deploy key of the repo where the docs will be pushed (with write access): https://github.com/pandas-dev/pandas-dev.github.io/settings/keys
+ - task: InstallSSHKey@0
+ inputs:
+ hostName: 'github.com,192.30.252.128 ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ=='
+ sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDHmz3l/EdqrgNxEUKkwDUuUcLv91unig03pYFGO/DMIgCmPdMG96zAgfnESd837Rm0wSSqylwSzkRJt5MV/TpFlcVifDLDQmUhqCeO8Z6dLl/oe35UKmyYICVwcvQTAaHNnYRpKC5IUlTh0JEtw9fGlnp1Ta7U1ENBLbKdpywczElhZu+hOQ892zqOj3CwA+U2329/d6cd7YnqIKoFN9DWT3kS5K6JE4IoBfQEVekIOs23bKjNLvPoOmi6CroAhu/K8j+NCWQjge5eJf2x/yTnIIP1PlEcXoHIr8io517posIx3TBup+CN8bNS1PpDW3jyD3ttl1uoBudjOQrobNnJeR6Rn67DRkG6IhSwr3BWj8alwUG5mTdZzwV5Pa9KZFdIiqX7NoDGg+itsR39QCn0thK8lGRNSR8KrWC1PSjecwelKBO7uQ7rnk/rkrZdBWR4oEA8YgNH8tirUw5WfOr5a0AIaJicKxGKNdMxZt+zmC+bS7F4YCOGIm9KHa43RrKhoGRhRf9fHHHKUPwFGqtWG4ykcUgoamDOURJyepesBAO3FiRE9rLU6ILbB3yEqqoekborHmAJD5vf7PWItW3Q/YQKuk3kkqRcKnexPyzyyq5lUgTi8CxxZdaASIOu294wjBhhdyHlXEkVTNJ9JKkj/obF+XiIIp0cBDsOXY9hDQ== pandas-dev@python.org'
+ sshKeySecureFile: 'pandas_docs_key'
+ displayName: 'Install GitHub ssh deployment key'
+ condition : |
+ and(not(eq(variables['Build.Reason'], 'PullRequest')),
+ eq(variables['Build.SourceBranch'], 'refs/heads/master'))
+
+ - script: |
+ cd doc/build/html
+ git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git
+ git push -f origin master
+ displayName: 'Publish docs to GitHub pages'
+ condition : |
+ and(not(eq(variables['Build.Reason'], 'PullRequest')),
+ eq(variables['Build.SourceBranch'], 'refs/heads/master'))
diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
index f53e284c221c6..6093df46ffb60 100644
--- a/ci/azure/posix.yml
+++ b/ci/azure/posix.yml
@@ -33,6 +33,12 @@ jobs:
PATTERN: "not slow and not network"
LOCALE_OVERRIDE: "it_IT.UTF-8"
+ py36_32bit:
+ ENV_FILE: ci/deps/azure-36-32bit.yaml
+ CONDA_PY: "36"
+ PATTERN: "not slow and not network"
+ BITS32: "yes"
+
py37_locale:
ENV_FILE: ci/deps/azure-37-locale.yaml
CONDA_PY: "37"
@@ -50,17 +56,15 @@ jobs:
steps:
- script: |
if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi
+ echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
echo "Creating Environment"
ci/setup_env.sh
displayName: 'Setup environment and build pandas'
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/run_tests.sh
displayName: 'Test'
- - script: |
- export PATH=$HOME/miniconda3/bin:$PATH
- source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
+ - script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
- task: PublishTestResults@2
inputs:
testResultsFiles: 'test-data-*.xml'
@@ -89,4 +93,8 @@ jobs:
# note that this will produce $LASTEXITCODE=1
Write-Error "$($matches[1]) tests failed"
}
- displayName: Check for test failures
+ displayName: 'Check for test failures'
+ - script: |
+ source activate pandas-dev
+ python ci/print_skipped.py
+ displayName: 'Print skipped tests'
diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml
index eeb03a0b28130..dfa82819b9826 100644
--- a/ci/azure/windows.yml
+++ b/ci/azure/windows.yml
@@ -17,12 +17,16 @@ jobs:
CONDA_PY: "37"
steps:
- - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
- displayName: Add conda to PATH
+ - powershell: |
+ Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
+ Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin"
+ displayName: 'Add conda to PATH'
- script: conda update -q -n base conda
displayName: Update conda
- - script: conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml
- displayName: Create anaconda environment
+ - script: |
+ call activate
+ conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml
+ displayName: 'Create anaconda environment'
- script: |
call activate pandas-dev
call conda list
@@ -48,4 +52,8 @@ jobs:
# note that this will produce $LASTEXITCODE=1
Write-Error "$($matches[1]) tests failed"
}
- displayName: Check for test failures
+ displayName: 'Check for test failures'
+ - script: |
+ source activate pandas-dev
+ python ci/print_skipped.py
+ displayName: 'Print skipped tests'
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
deleted file mode 100755
index bf22f0764144c..0000000000000
--- a/ci/build_docs.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-
-set -e
-
-if [ "${TRAVIS_OS_NAME}" != "linux" ]; then
- echo "not doing build_docs on non-linux"
- exit 0
-fi
-
-cd "$TRAVIS_BUILD_DIR"/doc
-echo "inside $0"
-
-if [ "$DOC" ]; then
-
- echo "Will build docs"
-
- echo ###############################
- echo # Log file for the doc build #
- echo ###############################
-
- echo ./make.py
- ./make.py
-
- echo ########################
- echo # Create and send docs #
- echo ########################
-
- echo "Only uploading docs when TRAVIS_PULL_REQUEST is 'false'"
- echo "TRAVIS_PULL_REQUEST: ${TRAVIS_PULL_REQUEST}"
-
- if [ "${TRAVIS_PULL_REQUEST}" == "false" ]; then
- cd build/html
- git config --global user.email "pandas-docs-bot@localhost.foo"
- git config --global user.name "pandas-docs-bot"
-
- # create the repo
- git init
-
- touch README
- git add README
- git commit -m "Initial commit" --allow-empty
- git branch gh-pages
- git checkout gh-pages
- touch .nojekyll
- git add --all .
- git commit -m "Version" --allow-empty
-
- git remote add origin "https://${PANDAS_GH_TOKEN}@github.com/pandas-dev/pandas-docs-travis.git"
- git fetch origin
- git remote -v
-
- git push origin gh-pages -f
- fi
-fi
-
-exit 0
diff --git a/ci/check_git_tags.sh b/ci/check_git_tags.sh
new file mode 100755
index 0000000000000..9dbcd4f98683e
--- /dev/null
+++ b/ci/check_git_tags.sh
@@ -0,0 +1,28 @@
+set -e
+
+if [[ ! $(git tag) ]]; then
+ echo "No git tags in clone, please sync your git tags with upstream using:"
+ echo " git fetch --tags upstream"
+ echo " git push --tags origin"
+ echo ""
+ echo "If the issue persists, the clone depth needs to be increased in .travis.yml"
+ exit 1
+fi
+
+# This will error if there are no tags and we omit --always
+DESCRIPTION=$(git describe --long --tags)
+echo "$DESCRIPTION"
+
+if [[ "$DESCRIPTION" == *"untagged"* ]]; then
+ echo "Unable to determine most recent tag, aborting build"
+ exit 1
+else
+ if [[ "$DESCRIPTION" != *"g"* ]]; then
+        # A good description has the commit hash prefixed by "g";
+        # a bad one is just the bare hash
+ echo "Unable to determine most recent tag, aborting build"
+ exit 1
+ else
+ echo "$(git tag)"
+ fi
+fi
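For context, `git describe --long --tags` normally prints `<tag>-<commits-since-tag>-g<abbrev-hash>`, so the two failure branches above catch the cases where no usable tag is reachable. Illustrative values (tag and hash are hypothetical):

    # Possible `git describe --long --tags` outputs and how the script treats them
    good = "v0.25.0-42-g1a2b3c4"  # tag + commit count + g-prefixed hash: build continues
    bad = "1a2b3c4"               # bare hash with no "g" marker: build is aborted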
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index a16580679ff54..333136ddfddd9 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -52,6 +52,13 @@ fi
### LINTING ###
if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
+ echo "black --version"
+ black --version
+
+ MSG='Checking black formatting' ; echo $MSG
+ black . --check --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)'
+ RET=$(($RET + $?)) ; echo $MSG "DONE"
+
# `setup.cfg` contains the list of error codes that are being ignored in flake8
echo "flake8 --version"
@@ -149,7 +156,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
RET=$(($RET + $?)) ; echo $MSG "DONE"
MSG='Check for python2 new-style classes and for empty parentheses' ; echo $MSG
- invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas scripts
+ invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas asv_bench/benchmarks scripts
RET=$(($RET + $?)) ; echo $MSG "DONE"
MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG
@@ -245,10 +252,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
RET=$(($RET + $?)) ; echo $MSG "DONE"
MSG='Doctests interval classes' ; echo $MSG
- pytest --doctest-modules -v \
+ pytest -q --doctest-modules \
pandas/core/indexes/interval.py \
pandas/core/arrays/interval.py \
- -k"-from_arrays -from_breaks -from_intervals -from_tuples -get_loc -set_closed -to_tuples -interval_range"
+ -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range"
RET=$(($RET + $?)) ; echo $MSG "DONE"
fi
@@ -256,8 +263,8 @@ fi
### DOCSTRINGS ###
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
- MSG='Validate docstrings (GL03, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG
- $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05
+ MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG
+ $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05
RET=$(($RET + $?)) ; echo $MSG "DONE"
fi
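The new black check fails the lint stage whenever black would reformat a file. The same check can be reproduced locally; a small Python wrapper mirroring the invocation added above (black must be installed) would be:

    import subprocess

    # Run black in check-only mode with the exclude pattern used by ci/code_checks.sh
    subprocess.run(
        [
            "black", ".", "--check", "--exclude",
            r"(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)",
        ],
        check=False,
    )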
diff --git a/ci/deps/azure-35-compat.yaml b/ci/deps/azure-35-compat.yaml
index e55a4fbdf3fa9..97c45b2be27d7 100644
--- a/ci/deps/azure-35-compat.yaml
+++ b/ci/deps/azure-35-compat.yaml
@@ -3,26 +3,26 @@ channels:
- defaults
- conda-forge
dependencies:
- - beautifulsoup4=4.4.1
+ - beautifulsoup4=4.6.0
- bottleneck=1.2.1
- jinja2=2.8
- numexpr=2.6.2
- numpy=1.13.3
- - openpyxl=2.4.0
+ - openpyxl=2.4.8
- pytables=3.4.2
- - python-dateutil=2.5.0
- - python=3.5.*
- - pytz=2015.4
+ - python-dateutil=2.6.1
+ - python=3.5.3
+ - pytz=2017.2
- scipy=0.19.0
- - xlrd=1.0.0
- - xlsxwriter=0.7.7
- - xlwt=1.0.0
+ - xlrd=1.1.0
+ - xlsxwriter=0.9.8
+ - xlwt=1.2.0
# universal
- cython=0.28.2
- hypothesis>=3.58.0
- pytest-xdist
- pytest-mock
- - isort
+ - pytest-azurepipelines
- pip
- pip:
# for python 3.5, pytest>=4.0.2 is not available in conda
diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml
new file mode 100644
index 0000000000000..43bf0ecdd6c3e
--- /dev/null
+++ b/ci/deps/azure-36-32bit.yaml
@@ -0,0 +1,20 @@
+name: pandas-dev
+channels:
+ - defaults
+ - conda-forge
+dependencies:
+ - gcc_linux-32
+ - gcc_linux-32
+ - gxx_linux-32
+ - cython=0.28.2
+ - numpy=1.14.*
+ - python-dateutil
+ - python=3.6.*
+ - pytz=2017.2
+ # universal
+ - pytest>=4.0.2,<5.0.0
+ - pytest-xdist
+ - pytest-mock
+ - pytest-azurepipelines
+ - hypothesis>=3.58.0
+ - pip
diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml
index f43ed0249985e..6a77b5dbedc61 100644
--- a/ci/deps/azure-36-locale.yaml
+++ b/ci/deps/azure-36-locale.yaml
@@ -3,28 +3,28 @@ channels:
- defaults
- conda-forge
dependencies:
- - beautifulsoup4==4.5.1
+ - beautifulsoup4==4.6.0
- bottleneck=1.2.*
- cython=0.28.2
- lxml
- matplotlib=2.2.2
- numpy=1.14.*
- - openpyxl=2.4.0
+ - openpyxl=2.4.8
- python-dateutil
- python-blosc
- python=3.6.*
- - pytz=2016.10
+ - pytz=2017.2
- scipy
- sqlalchemy=1.1.4
- - xlrd=1.0.0
- - xlsxwriter=0.9.4
+ - xlrd=1.1.0
+ - xlsxwriter=0.9.8
- xlwt=1.2.0
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.0
+ - pytest-xdist>=1.29.0
- pytest-mock
+ - pytest-azurepipelines
- hypothesis>=3.58.0
- - isort
- pip
- pip:
- html5lib==1.0b2
diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml
index 2a0404614dcfc..2bf2bd74795d2 100644
--- a/ci/deps/azure-36-locale_slow.yaml
+++ b/ci/deps/azure-36-locale_slow.yaml
@@ -29,8 +29,8 @@ dependencies:
- pytest>=4.0.2
- pytest-xdist
- pytest-mock
+ - pytest-azurepipelines
- moto
- - isort
- pip
- pip:
- hypothesis>=3.58.0
diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml
index 649f5892f174d..26dcd213bbfa0 100644
--- a/ci/deps/azure-37-locale.yaml
+++ b/ci/deps/azure-37-locale.yaml
@@ -10,6 +10,7 @@ dependencies:
- jinja2
- lxml
- matplotlib
+ - moto
- nomkl
- numexpr
- numpy
@@ -25,11 +26,10 @@ dependencies:
- xlsxwriter
- xlwt
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-mock
- - isort
+ - pytest-azurepipelines
- pip
- pip:
- hypothesis>=3.58.0
- - moto # latest moto in conda-forge fails with 3.7, move to conda dependencies when this is fixed
diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml
index 6848b9990b46e..65c92ec1dcf0d 100644
--- a/ci/deps/azure-37-numpydev.yaml
+++ b/ci/deps/azure-37-numpydev.yaml
@@ -6,11 +6,11 @@ dependencies:
- pytz
- Cython>=0.28.2
# universal
- - pytest>=4.0.2
+ # pytest < 5 until defaults has pytest-xdist>=1.29.0
+ - pytest>=4.0.2,<5.0
- pytest-xdist
- pytest-mock
- hypothesis>=3.58.0
- - isort
- pip
- pip:
- "git+git://github.com/dateutil/dateutil.git"
@@ -18,3 +18,5 @@ dependencies:
- "--pre"
- "numpy"
- "scipy"
+ # https://github.com/pandas-dev/pandas/issues/27421
+ - pytest-azurepipelines<1.0.0
diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml
index 00c2051f29760..39315b15a018b 100644
--- a/ci/deps/azure-macos-35.yaml
+++ b/ci/deps/azure-macos-35.yaml
@@ -16,16 +16,20 @@ dependencies:
- pyarrow
- pytables
- python=3.5.*
+ - python-dateutil==2.6.1
- pytz
- xarray
- xlrd
- xlsxwriter
- xlwt
- - isort
+ - pip
- pip:
- - python-dateutil==2.5.3
+ - pyreadstat
# universal
- - pytest==4.5.0
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-mock
- hypothesis>=3.58.0
+ # https://github.com/pandas-dev/pandas/issues/27421
+ - pytest-azurepipelines<1.0.0
+
diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml
index 7b3ae259fb8dd..ff9264a36cb12 100644
--- a/ci/deps/azure-windows-36.yaml
+++ b/ci/deps/azure-windows-36.yaml
@@ -23,8 +23,8 @@ dependencies:
- xlwt
# universal
- cython>=0.28.2
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-mock
+ - pytest-azurepipelines
- hypothesis>=3.58.0
- - isort
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index 5384e794d442a..075234a937035 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -10,6 +10,7 @@ dependencies:
- jinja2
- lxml
- matplotlib=2.2.*
+ - moto
- numexpr
- numpy=1.14.*
- openpyxl
@@ -25,9 +26,9 @@ dependencies:
- xlwt
# universal
- cython>=0.28.2
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.0
+ - pytest-xdist>=1.29.0
- pytest-mock
- - moto
+ - pytest-azurepipelines
- hypothesis>=3.58.0
- - isort
+ - pyreadstat
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
index afd9877afdf84..19002cbb8575e 100644
--- a/ci/deps/travis-36-cov.yaml
+++ b/ci/deps/travis-36-cov.yaml
@@ -12,9 +12,11 @@ dependencies:
- geopandas
- html5lib
- matplotlib
+ - moto
- nomkl
- numexpr
- numpy=1.15.*
+ - odfpy
- openpyxl
- pandas-gbq
# https://github.com/pydata/pandas-gbq/issues/271
@@ -37,16 +39,14 @@ dependencies:
- xlsxwriter
- xlwt
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-cov
- pytest-mock
- hypothesis>=3.58.0
- - isort
- pip
- pip:
- brotlipy
- coverage
- - moto
- pandas-datareader
- python-dateutil
diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml
deleted file mode 100644
index 8f7556e6b47ad..0000000000000
--- a/ci/deps/travis-36-doc.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-name: pandas-dev
-channels:
- - defaults
- - conda-forge
-dependencies:
- - beautifulsoup4
- - bottleneck
- - cython>=0.28.2
- - fastparquet>=0.2.1
- - gitpython
- - html5lib
- - hypothesis>=3.58.0
- - ipykernel
- - ipython
- - ipywidgets
- - lxml
- - matplotlib
- - nbconvert>=5.4.1
- - nbformat
- - nbsphinx
- - notebook>=5.7.5
- - numexpr
- - numpy
- - numpydoc
- - openpyxl
- - pandoc
- - pyarrow
- - pyqt
- - pytables
- - python-dateutil
- - python-snappy
- - python=3.6.*
- - pytz
- - scipy
- - seaborn
- # recursion error with sphinx 2.1.0. https://github.com/pandas-dev/pandas/issues/26723
- - sphinx==2.0.1
- - sqlalchemy
- - statsmodels
- - xarray
- - xlrd
- - xlsxwriter
- - xlwt
- # universal
- - pytest>=4.0.2
- - pytest-xdist
- - isort
diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml
index 1d219bbcb671c..7da4abb9283df 100644
--- a/ci/deps/travis-36-locale.yaml
+++ b/ci/deps/travis-36-locale.yaml
@@ -8,38 +8,35 @@ dependencies:
- python-blosc
- cython>=0.28.2
- fastparquet=0.2.1
- - gcsfs=0.1.0
+ - gcsfs=0.2.2
- html5lib
- ipython
- jinja2
- - lxml=3.7.0
- - matplotlib=3.0.0
+ - lxml=3.8.0
+ - matplotlib=3.0.*
+ - moto
- nomkl
- numexpr
- numpy
- openpyxl
- pandas-gbq=0.8.0
- psycopg2=2.6.2
- - pymysql=0.7.9
+ - pymysql=0.7.11
- pytables
- python-dateutil
- # cannot go past python=3.6.6 for matplotlib=3.0.0 due to
- # https://github.com/matplotlib/matplotlib/issues/12626
- - python=3.6.6
+ - python=3.6.*
- pytz
- s3fs=0.0.8
- scipy
- sqlalchemy=1.1.4
- - xarray=0.8.2
+ - xarray=0.10
- xlrd
- xlsxwriter
- xlwt
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-mock
- - moto
- - isort
- pip
- pip:
- hypothesis>=3.58.0
diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml
index 365c78c02f4d4..9564bf5bb3a9f 100644
--- a/ci/deps/travis-36-slow.yaml
+++ b/ci/deps/travis-36-slow.yaml
@@ -25,9 +25,8 @@ dependencies:
- xlsxwriter
- xlwt
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.0
+ - pytest-xdist>=1.29.0
- pytest-mock
- moto
- hypothesis>=3.58.0
- - isort
diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml
index 4bf7d516711f7..9e08c41a3d9c0 100644
--- a/ci/deps/travis-37.yaml
+++ b/ci/deps/travis-37.yaml
@@ -13,12 +13,12 @@ dependencies:
- pyarrow
- pytz
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.0
+ - pytest-xdist>=1.29.0
- pytest-mock
- hypothesis>=3.58.0
- s3fs
- - isort
- pip
+ - pyreadstat
- pip:
- moto
diff --git a/ci/print_skipped.py b/ci/print_skipped.py
index 67bc7b556cd43..a44281044e11d 100755
--- a/ci/print_skipped.py
+++ b/ci/print_skipped.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
+import os
import sys
import math
import xml.etree.ElementTree as et
@@ -10,45 +11,42 @@ def parse_results(filename):
root = tree.getroot()
skipped = []
- current_class = ''
+ current_class = ""
i = 1
assert i - 1 == len(skipped)
- for el in root.findall('testcase'):
- cn = el.attrib['classname']
- for sk in el.findall('skipped'):
+ for el in root.findall("testcase"):
+ cn = el.attrib["classname"]
+ for sk in el.findall("skipped"):
old_class = current_class
current_class = cn
- name = '{classname}.{name}'.format(classname=current_class,
- name=el.attrib['name'])
- msg = sk.attrib['message']
- out = ''
+ name = "{classname}.{name}".format(
+ classname=current_class, name=el.attrib["name"]
+ )
+ msg = sk.attrib["message"]
+ out = ""
if old_class != current_class:
ndigits = int(math.log(i, 10) + 1)
# 4 for : + space + # + space
- out += ('-' * (len(name + msg) + 4 + ndigits) + '\n')
- out += '#{i} {name}: {msg}'.format(i=i, name=name, msg=msg)
+ out += "-" * (len(name + msg) + 4 + ndigits) + "\n"
+ out += "#{i} {name}: {msg}".format(i=i, name=name, msg=msg)
skipped.append(out)
i += 1
assert i - 1 == len(skipped)
assert i - 1 == len(skipped)
# assert len(skipped) == int(root.attrib['skip'])
- return '\n'.join(skipped)
+ return "\n".join(skipped)
-def main(args):
- print('SKIPPED TESTS:')
- for fn in args.filename:
- print(parse_results(fn))
- return 0
-
+def main():
+ test_files = ["test-data-single.xml", "test-data-multiple.xml", "test-data.xml"]
-def parse_args():
- import argparse
- parser = argparse.ArgumentParser()
- parser.add_argument('filename', nargs='+', help='XUnit file to parse')
- return parser.parse_args()
+ print("SKIPPED TESTS:")
+ for fn in test_files:
+ if os.path.isfile(fn):
+ print(parse_results(fn))
+ return 0
-if __name__ == '__main__':
- sys.exit(main(parse_args()))
+if __name__ == "__main__":
+ sys.exit(main())
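With this rewrite the script takes no command-line arguments: the pipeline steps above simply run `python ci/print_skipped.py` after the test steps, and the script scans whichever of the fixed test-data files exist. A small sketch of the junit-style structure that `parse_results` walks (element and attribute names are the ones used in the code; the content is made up):

    import xml.etree.ElementTree as et

    sample = (
        '<testsuite>'
        '<testcase classname="pandas.tests.test_example" name="test_skip_me">'
        '<skipped message="could not import scipy"/>'
        '</testcase>'
        '</testsuite>'
    )
    root = et.fromstring(sample)
    for el in root.findall("testcase"):
        for sk in el.findall("skipped"):
            print(el.attrib["classname"], el.attrib["name"], sk.attrib["message"])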
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index ee46da9f52eab..27d3fcb4cf563 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -50,9 +50,10 @@ do
# if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code
sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret"
- if [[ "$COVERAGE" && $? == 0 ]]; then
- echo "uploading coverage for $TYPE tests"
- echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
- bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
- fi
+ # 2019-08-21 disabling because this is hitting HTTP 400 errors GH#27602
+ # if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then
+ # echo "uploading coverage for $TYPE tests"
+ # echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
+ # bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
+ # fi
done
diff --git a/ci/setup_env.sh b/ci/setup_env.sh
index 8f73bb228e2bd..88742e0483c7e 100755
--- a/ci/setup_env.sh
+++ b/ci/setup_env.sh
@@ -94,6 +94,12 @@ echo
echo "conda env create -q --file=${ENV_FILE}"
time conda env create -q --file="${ENV_FILE}"
+
+if [[ "$BITS32" == "yes" ]]; then
+ # activate 32-bit compiler
+ export CONDA_BUILD=1
+fi
+
echo "activate pandas-dev"
source activate pandas-dev
diff --git a/codecov.yml b/codecov.yml
index 512bc2e82a736..1644bf315e0ac 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,13 +1,13 @@
codecov:
branch: master
+comment: off
+
coverage:
status:
project:
default:
- enabled: no
target: '82'
patch:
default:
- enabled: no
target: '50'
diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py
index c3647f0c7d2a8..5a07b094e6ad3 100644
--- a/doc/logo/pandas_logo.py
+++ b/doc/logo/pandas_logo.py
@@ -4,7 +4,7 @@
from matplotlib import rcParams
import numpy as np
-rcParams['mathtext.fontset'] = 'cm'
+rcParams["mathtext.fontset"] = "cm"
def fnx():
@@ -37,8 +37,12 @@ def fnx():
plt.figtext(0.05, 0.5, "pandas", size=40)
plt.figtext(
- 0.05, 0.2, r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$",
- size=16, color="#5a89a4")
-
-fig.savefig('pandas_logo.svg')
-fig.savefig('pandas_logo.png')
+ 0.05,
+ 0.2,
+ r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$",
+ size=16,
+ color="#5a89a4",
+)
+
+fig.savefig("pandas_logo.svg")
+fig.savefig("pandas_logo.png")
diff --git a/doc/make.py b/doc/make.py
index 6ffbd3ef86e68..48febef20fbe6 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -24,9 +24,9 @@
DOC_PATH = os.path.dirname(os.path.abspath(__file__))
-SOURCE_PATH = os.path.join(DOC_PATH, 'source')
-BUILD_PATH = os.path.join(DOC_PATH, 'build')
-REDIRECTS_FILE = os.path.join(DOC_PATH, 'redirects.csv')
+SOURCE_PATH = os.path.join(DOC_PATH, "source")
+BUILD_PATH = os.path.join(DOC_PATH, "build")
+REDIRECTS_FILE = os.path.join(DOC_PATH, "redirects.csv")
class DocBuilder:
@@ -36,8 +36,15 @@ class DocBuilder:
All public methods of this class can be called as parameters of the
script.
"""
- def __init__(self, num_jobs=0, include_api=True, single_doc=None,
- verbosity=0, warnings_are_errors=False):
+
+ def __init__(
+ self,
+ num_jobs=0,
+ include_api=True,
+ single_doc=None,
+ verbosity=0,
+ warnings_are_errors=False,
+ ):
self.num_jobs = num_jobs
self.verbosity = verbosity
self.warnings_are_errors = warnings_are_errors
@@ -45,16 +52,15 @@ def __init__(self, num_jobs=0, include_api=True, single_doc=None,
if single_doc:
single_doc = self._process_single_doc(single_doc)
include_api = False
- os.environ['SPHINX_PATTERN'] = single_doc
+ os.environ["SPHINX_PATTERN"] = single_doc
elif not include_api:
- os.environ['SPHINX_PATTERN'] = '-api'
+ os.environ["SPHINX_PATTERN"] = "-api"
self.single_doc_html = None
- if single_doc and single_doc.endswith('.rst'):
- self.single_doc_html = os.path.splitext(single_doc)[0] + '.html'
+ if single_doc and single_doc.endswith(".rst"):
+ self.single_doc_html = os.path.splitext(single_doc)[0] + ".html"
elif single_doc:
- self.single_doc_html = 'reference/api/pandas.{}.html'.format(
- single_doc)
+ self.single_doc_html = "reference/api/pandas.{}.html".format(single_doc)
def _process_single_doc(self, single_doc):
"""
@@ -66,26 +72,30 @@ def _process_single_doc(self, single_doc):
(e.g. reference/api/pandas.DataFrame.head.rst).
"""
base_name, extension = os.path.splitext(single_doc)
- if extension in ('.rst', '.ipynb'):
+ if extension in (".rst", ".ipynb"):
if os.path.exists(os.path.join(SOURCE_PATH, single_doc)):
return single_doc
else:
- raise FileNotFoundError('File {} not found'.format(single_doc))
+ raise FileNotFoundError("File {} not found".format(single_doc))
- elif single_doc.startswith('pandas.'):
+ elif single_doc.startswith("pandas."):
try:
obj = pandas # noqa: F821
- for name in single_doc.split('.'):
+ for name in single_doc.split("."):
obj = getattr(obj, name)
except AttributeError:
- raise ImportError('Could not import {}'.format(single_doc))
+ raise ImportError("Could not import {}".format(single_doc))
else:
- return single_doc[len('pandas.'):]
+ return single_doc[len("pandas.") :]
else:
- raise ValueError(('--single={} not understood. Value should be a '
- 'valid path to a .rst or .ipynb file, or a '
- 'valid pandas object (e.g. categorical.rst or '
- 'pandas.DataFrame.head)').format(single_doc))
+ raise ValueError(
+ (
+ "--single={} not understood. Value should be a "
+ "valid path to a .rst or .ipynb file, or a "
+ "valid pandas object (e.g. categorical.rst or "
+ "pandas.DataFrame.head)"
+ ).format(single_doc)
+ )
@staticmethod
def _run_os(*args):
@@ -117,52 +127,55 @@ def _sphinx_build(self, kind):
--------
>>> DocBuilder(num_jobs=4)._sphinx_build('html')
"""
- if kind not in ('html', 'latex'):
- raise ValueError('kind must be html or latex, '
- 'not {}'.format(kind))
+ if kind not in ("html", "latex"):
+ raise ValueError("kind must be html or latex, " "not {}".format(kind))
- cmd = ['sphinx-build', '-b', kind]
+ cmd = ["sphinx-build", "-b", kind]
if self.num_jobs:
- cmd += ['-j', str(self.num_jobs)]
+ cmd += ["-j", str(self.num_jobs)]
if self.warnings_are_errors:
- cmd += ['-W', '--keep-going']
+ cmd += ["-W", "--keep-going"]
if self.verbosity:
- cmd.append('-{}'.format('v' * self.verbosity))
- cmd += ['-d', os.path.join(BUILD_PATH, 'doctrees'),
- SOURCE_PATH, os.path.join(BUILD_PATH, kind)]
+ cmd.append("-{}".format("v" * self.verbosity))
+ cmd += [
+ "-d",
+ os.path.join(BUILD_PATH, "doctrees"),
+ SOURCE_PATH,
+ os.path.join(BUILD_PATH, kind),
+ ]
return subprocess.call(cmd)
def _open_browser(self, single_doc_html):
"""
Open a browser tab showing single
"""
- url = os.path.join('file://', DOC_PATH, 'build', 'html',
- single_doc_html)
+ url = os.path.join("file://", DOC_PATH, "build", "html", single_doc_html)
webbrowser.open(url, new=2)
def _get_page_title(self, page):
"""
Open the rst file `page` and extract its title.
"""
- fname = os.path.join(SOURCE_PATH, '{}.rst'.format(page))
+ fname = os.path.join(SOURCE_PATH, "{}.rst".format(page))
option_parser = docutils.frontend.OptionParser(
- components=(docutils.parsers.rst.Parser,))
- doc = docutils.utils.new_document(
- '',
- option_parser.get_default_values())
+ components=(docutils.parsers.rst.Parser,)
+ )
+ doc = docutils.utils.new_document("", option_parser.get_default_values())
with open(fname) as f:
data = f.read()
parser = docutils.parsers.rst.Parser()
# do not generate any warning when parsing the rst
- with open(os.devnull, 'a') as f:
+ with open(os.devnull, "a") as f:
doc.reporter.stream = f
parser.parse(data, doc)
- section = next(node for node in doc.children
- if isinstance(node, docutils.nodes.section))
- title = next(node for node in section.children
- if isinstance(node, docutils.nodes.title))
+ section = next(
+ node for node in doc.children if isinstance(node, docutils.nodes.section)
+ )
+ title = next(
+ node for node in section.children if isinstance(node, docutils.nodes.title)
+ )
return title.astext()
@@ -171,7 +184,7 @@ def _add_redirects(self):
Create in the build directory an html file with a redirect,
for every row in REDIRECTS_FILE.
"""
- html = '''
+ html = """
@@ -182,16 +195,14 @@ def _add_redirects(self):
- '''
+ """
with open(REDIRECTS_FILE) as mapping_fd:
reader = csv.reader(mapping_fd)
for row in reader:
- if not row or row[0].strip().startswith('#'):
+ if not row or row[0].strip().startswith("#"):
continue
- path = os.path.join(BUILD_PATH,
- 'html',
- *row[0].split('/')) + '.html'
+ path = os.path.join(BUILD_PATH, "html", *row[0].split("/")) + ".html"
try:
title = self._get_page_title(row[1])
@@ -199,51 +210,54 @@ def _add_redirects(self):
# the file can be an ipynb and not an rst, or docutils
# may not be able to read the rst because it has some
# sphinx specific stuff
- title = 'this page'
+ title = "this page"
if os.path.exists(path):
- raise RuntimeError((
- 'Redirection would overwrite an existing file: '
- '{}').format(path))
+ raise RuntimeError(
+ ("Redirection would overwrite an existing file: " "{}").format(
+ path
+ )
+ )
- with open(path, 'w') as moved_page_fd:
+ with open(path, "w") as moved_page_fd:
moved_page_fd.write(
- html.format(url='{}.html'.format(row[1]),
- title=title))
+ html.format(url="{}.html".format(row[1]), title=title)
+ )
def html(self):
"""
Build HTML documentation.
"""
- ret_code = self._sphinx_build('html')
- zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')
+ ret_code = self._sphinx_build("html")
+ zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip")
if os.path.exists(zip_fname):
os.remove(zip_fname)
- if self.single_doc_html is not None:
- self._open_browser(self.single_doc_html)
- else:
- self._add_redirects()
+ if ret_code == 0:
+ if self.single_doc_html is not None:
+ self._open_browser(self.single_doc_html)
+ else:
+ self._add_redirects()
return ret_code
def latex(self, force=False):
"""
Build PDF documentation.
"""
- if sys.platform == 'win32':
- sys.stderr.write('latex build has not been tested on windows\n')
+ if sys.platform == "win32":
+ sys.stderr.write("latex build has not been tested on windows\n")
else:
- ret_code = self._sphinx_build('latex')
- os.chdir(os.path.join(BUILD_PATH, 'latex'))
+ ret_code = self._sphinx_build("latex")
+ os.chdir(os.path.join(BUILD_PATH, "latex"))
if force:
for i in range(3):
- self._run_os('pdflatex',
- '-interaction=nonstopmode',
- 'pandas.tex')
- raise SystemExit('You should check the file '
- '"build/latex/pandas.pdf" for problems.')
+ self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex")
+ raise SystemExit(
+ "You should check the file "
+ '"build/latex/pandas.pdf" for problems.'
+ )
else:
- self._run_os('make')
+ self._run_os("make")
return ret_code
def latex_forced(self):
@@ -258,84 +272,101 @@ def clean():
Clean documentation generated files.
"""
shutil.rmtree(BUILD_PATH, ignore_errors=True)
- shutil.rmtree(os.path.join(SOURCE_PATH, 'reference', 'api'),
- ignore_errors=True)
+ shutil.rmtree(os.path.join(SOURCE_PATH, "reference", "api"), ignore_errors=True)
def zip_html(self):
"""
Compress HTML documentation into a zip file.
"""
- zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')
+ zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip")
if os.path.exists(zip_fname):
os.remove(zip_fname)
- dirname = os.path.join(BUILD_PATH, 'html')
+ dirname = os.path.join(BUILD_PATH, "html")
fnames = os.listdir(dirname)
os.chdir(dirname)
- self._run_os('zip',
- zip_fname,
- '-r',
- '-q',
- *fnames)
+ self._run_os("zip", zip_fname, "-r", "-q", *fnames)
def main():
- cmds = [method for method in dir(DocBuilder) if not method.startswith('_')]
+ cmds = [method for method in dir(DocBuilder) if not method.startswith("_")]
argparser = argparse.ArgumentParser(
- description='pandas documentation builder',
- epilog='Commands: {}'.format(','.join(cmds)))
- argparser.add_argument('command',
- nargs='?',
- default='html',
- help='command to run: {}'.format(', '.join(cmds)))
- argparser.add_argument('--num-jobs',
- type=int,
- default=0,
- help='number of jobs used by sphinx-build')
- argparser.add_argument('--no-api',
- default=False,
- help='omit api and autosummary',
- action='store_true')
- argparser.add_argument('--single',
- metavar='FILENAME',
- type=str,
- default=None,
- help=('filename (relative to the "source" folder)'
- ' of section or method name to compile, e.g. '
- '"development/contributing.rst",'
- ' "ecosystem.rst", "pandas.DataFrame.join"'))
- argparser.add_argument('--python-path',
- type=str,
- default=os.path.dirname(DOC_PATH),
- help='path')
- argparser.add_argument('-v', action='count', dest='verbosity', default=0,
- help=('increase verbosity (can be repeated), '
- 'passed to the sphinx build command'))
- argparser.add_argument('--warnings-are-errors', '-W',
- action='store_true',
- help='fail if warnings are raised')
+ description="pandas documentation builder",
+ epilog="Commands: {}".format(",".join(cmds)),
+ )
+ argparser.add_argument(
+ "command",
+ nargs="?",
+ default="html",
+ help="command to run: {}".format(", ".join(cmds)),
+ )
+ argparser.add_argument(
+ "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build"
+ )
+ argparser.add_argument(
+ "--no-api", default=False, help="omit api and autosummary", action="store_true"
+ )
+ argparser.add_argument(
+ "--single",
+ metavar="FILENAME",
+ type=str,
+ default=None,
+ help=(
+ 'filename (relative to the "source" folder)'
+ " of section or method name to compile, e.g. "
+ '"development/contributing.rst",'
+ ' "ecosystem.rst", "pandas.DataFrame.join"'
+ ),
+ )
+ argparser.add_argument(
+ "--python-path", type=str, default=os.path.dirname(DOC_PATH), help="path"
+ )
+ argparser.add_argument(
+ "-v",
+ action="count",
+ dest="verbosity",
+ default=0,
+ help=(
+ "increase verbosity (can be repeated), "
+ "passed to the sphinx build command"
+ ),
+ )
+ argparser.add_argument(
+ "--warnings-are-errors",
+ "-W",
+ action="store_true",
+ help="fail if warnings are raised",
+ )
args = argparser.parse_args()
if args.command not in cmds:
- raise ValueError('Unknown command {}. Available options: {}'.format(
- args.command, ', '.join(cmds)))
+ raise ValueError(
+ "Unknown command {}. Available options: {}".format(
+ args.command, ", ".join(cmds)
+ )
+ )
# Below we update both os.environ and sys.path. The former is used by
# external libraries (namely Sphinx) to compile this module and resolve
# the import of `python_path` correctly. The latter is used to resolve
# the import within the module, injecting it into the global namespace
- os.environ['PYTHONPATH'] = args.python_path
+ os.environ["PYTHONPATH"] = args.python_path
sys.path.insert(0, args.python_path)
- globals()['pandas'] = importlib.import_module('pandas')
+ globals()["pandas"] = importlib.import_module("pandas")
# Set the matplotlib backend to the non-interactive Agg backend for all
# child processes.
- os.environ['MPLBACKEND'] = 'module://matplotlib.backends.backend_agg'
-
- builder = DocBuilder(args.num_jobs, not args.no_api, args.single,
- args.verbosity, args.warnings_are_errors)
+ os.environ["MPLBACKEND"] = "module://matplotlib.backends.backend_agg"
+
+ builder = DocBuilder(
+ args.num_jobs,
+ not args.no_api,
+ args.single,
+ args.verbosity,
+ args.warnings_are_errors,
+ )
return getattr(builder, args.command)()
-if __name__ == '__main__':
+if __name__ == "__main__":
sys.exit(main())
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 971aa04ba866a..a4b7d97c2cf5e 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -18,7 +18,7 @@
import jinja2
from sphinx.ext.autosummary import _import_by_name
from numpydoc.docscrape import NumpyDocString
-from numpydoc.docscrape_sphinx import SphinxDocString
+
logger = logging.getLogger(__name__)
@@ -34,15 +34,13 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.append(os.path.abspath('.'))
-sys.path.insert(0, os.path.abspath('../sphinxext'))
-sys.path.extend([
-
- # numpy standard doc extensions
- os.path.join(os.path.dirname(__file__),
- '..', '../..',
- 'sphinxext')
-
-])
+sys.path.insert(0, os.path.abspath("../sphinxext"))
+sys.path.extend(
+ [
+ # numpy standard doc extensions
+ os.path.join(os.path.dirname(__file__), "..", "../..", "sphinxext")
+ ]
+)
# -- General configuration -----------------------------------------------
@@ -50,65 +48,69 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
# sphinxext.
-extensions = ['sphinx.ext.autodoc',
- 'sphinx.ext.autosummary',
- 'sphinx.ext.doctest',
- 'sphinx.ext.extlinks',
- 'sphinx.ext.todo',
- 'numpydoc', # handle NumPy documentation formatted docstrings
- 'IPython.sphinxext.ipython_directive',
- 'IPython.sphinxext.ipython_console_highlighting',
- 'matplotlib.sphinxext.plot_directive',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.coverage',
- 'sphinx.ext.mathjax',
- 'sphinx.ext.ifconfig',
- 'sphinx.ext.linkcode',
- 'nbsphinx',
- 'contributors', # custom pandas extension
- ]
-
-exclude_patterns = ['**.ipynb_checkpoints']
+extensions = [
+ "sphinx.ext.autodoc",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.doctest",
+ "sphinx.ext.extlinks",
+ "sphinx.ext.todo",
+ "numpydoc", # handle NumPy documentation formatted docstrings
+ "IPython.sphinxext.ipython_directive",
+ "IPython.sphinxext.ipython_console_highlighting",
+ "matplotlib.sphinxext.plot_directive",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.coverage",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.ifconfig",
+ "sphinx.ext.linkcode",
+ "nbsphinx",
+ "contributors", # custom pandas extension
+]
+
+exclude_patterns = ["**.ipynb_checkpoints"]
try:
import nbconvert
except ImportError:
- logger.warn('nbconvert not installed. Skipping notebooks.')
- exclude_patterns.append('**/*.ipynb')
+ logger.warn("nbconvert not installed. Skipping notebooks.")
+ exclude_patterns.append("**/*.ipynb")
else:
try:
nbconvert.utils.pandoc.get_pandoc_version()
except nbconvert.utils.pandoc.PandocMissing:
- logger.warn('Pandoc not installed. Skipping notebooks.')
- exclude_patterns.append('**/*.ipynb')
+ logger.warn("Pandoc not installed. Skipping notebooks.")
+ exclude_patterns.append("**/*.ipynb")
# sphinx_pattern can be '-api' to exclude the API pages,
# the path to a file, or a Python object
# (e.g. '10min.rst' or 'pandas.DataFrame.head')
source_path = os.path.dirname(os.path.abspath(__file__))
-pattern = os.environ.get('SPHINX_PATTERN')
+pattern = os.environ.get("SPHINX_PATTERN")
if pattern:
for dirname, dirs, fnames in os.walk(source_path):
for fname in fnames:
- if os.path.splitext(fname)[-1] in ('.rst', '.ipynb'):
- fname = os.path.relpath(os.path.join(dirname, fname),
- source_path)
+ if os.path.splitext(fname)[-1] in (".rst", ".ipynb"):
+ fname = os.path.relpath(os.path.join(dirname, fname), source_path)
- if (fname == 'index.rst'
- and os.path.abspath(dirname) == source_path):
+ if fname == "index.rst" and os.path.abspath(dirname) == source_path:
continue
- elif pattern == '-api' and dirname == 'reference':
+ elif pattern == "-api" and dirname == "reference":
exclude_patterns.append(fname)
- elif pattern != '-api' and fname != pattern:
+ elif pattern != "-api" and fname != pattern:
exclude_patterns.append(fname)
-with open(os.path.join(source_path, 'index.rst.template')) as f:
+with open(os.path.join(source_path, "index.rst.template")) as f:
t = jinja2.Template(f.read())
-with open(os.path.join(source_path, 'index.rst'), 'w') as f:
- f.write(t.render(include_api=pattern is None,
- single_doc=(pattern
- if pattern is not None and pattern != '-api'
- else None)))
-autosummary_generate = True if pattern is None else ['index']
+with open(os.path.join(source_path, "index.rst"), "w") as f:
+ f.write(
+ t.render(
+ include_api=pattern is None,
+ single_doc=(pattern if pattern is not None and pattern != "-api" else None),
+ )
+ )
+autosummary_generate = True if pattern is None else ["index"]
+
+# numpydoc
+numpydoc_attributes_as_param_list = False
# matplotlib plot directive
plot_include_source = True
@@ -119,22 +121,20 @@
import pandas as pd"""
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['../_templates']
+templates_path = ["../_templates"]
# The suffix of source filenames.
-source_suffix = [
- '.rst',
-]
+source_suffix = [".rst"]
# The encoding of source files.
-source_encoding = 'utf-8'
+source_encoding = "utf-8"
# The master toctree document.
-master_doc = 'index'
+master_doc = "index"
# General information about the project.
-project = 'pandas'
-copyright = '2008-2014, the pandas development team'
+project = "pandas"
+copyright = "2008-2014, the pandas development team"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -181,7 +181,7 @@
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
@@ -191,7 +191,7 @@
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'nature_with_gtoc'
+html_theme = "nature_with_gtoc"
# The style sheet to use for HTML and HTML Help pages. A file of that name
# must exist either in Sphinx' static/ path, or in one of the custom paths
@@ -204,7 +204,7 @@
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-html_theme_path = ['themes']
+html_theme_path = ["themes"]
# The name for this set of Sphinx documents. If None, it defaults to
# " v documentation".
@@ -220,12 +220,12 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-html_favicon = os.path.join(html_static_path[0], 'favicon.ico')
+html_favicon = os.path.join(html_static_path[0], "favicon.ico")
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
@@ -247,60 +247,62 @@
# https://github.com/pandas-dev/pandas/issues/16186
moved_api_pages = [
- ('pandas.core.common.isnull', 'pandas.isna'),
- ('pandas.core.common.notnull', 'pandas.notna'),
- ('pandas.core.reshape.get_dummies', 'pandas.get_dummies'),
- ('pandas.tools.merge.concat', 'pandas.concat'),
- ('pandas.tools.merge.merge', 'pandas.merge'),
- ('pandas.tools.pivot.pivot_table', 'pandas.pivot_table'),
- ('pandas.tseries.tools.to_datetime', 'pandas.to_datetime'),
- ('pandas.io.clipboard.read_clipboard', 'pandas.read_clipboard'),
- ('pandas.io.excel.ExcelFile.parse', 'pandas.ExcelFile.parse'),
- ('pandas.io.excel.read_excel', 'pandas.read_excel'),
- ('pandas.io.gbq.read_gbq', 'pandas.read_gbq'),
- ('pandas.io.html.read_html', 'pandas.read_html'),
- ('pandas.io.json.read_json', 'pandas.read_json'),
- ('pandas.io.parsers.read_csv', 'pandas.read_csv'),
- ('pandas.io.parsers.read_fwf', 'pandas.read_fwf'),
- ('pandas.io.parsers.read_table', 'pandas.read_table'),
- ('pandas.io.pickle.read_pickle', 'pandas.read_pickle'),
- ('pandas.io.pytables.HDFStore.append', 'pandas.HDFStore.append'),
- ('pandas.io.pytables.HDFStore.get', 'pandas.HDFStore.get'),
- ('pandas.io.pytables.HDFStore.put', 'pandas.HDFStore.put'),
- ('pandas.io.pytables.HDFStore.select', 'pandas.HDFStore.select'),
- ('pandas.io.pytables.read_hdf', 'pandas.read_hdf'),
- ('pandas.io.sql.read_sql', 'pandas.read_sql'),
- ('pandas.io.sql.read_frame', 'pandas.read_frame'),
- ('pandas.io.sql.write_frame', 'pandas.write_frame'),
- ('pandas.io.stata.read_stata', 'pandas.read_stata'),
+ ("pandas.core.common.isnull", "pandas.isna"),
+ ("pandas.core.common.notnull", "pandas.notna"),
+ ("pandas.core.reshape.get_dummies", "pandas.get_dummies"),
+ ("pandas.tools.merge.concat", "pandas.concat"),
+ ("pandas.tools.merge.merge", "pandas.merge"),
+ ("pandas.tools.pivot.pivot_table", "pandas.pivot_table"),
+ ("pandas.tseries.tools.to_datetime", "pandas.to_datetime"),
+ ("pandas.io.clipboard.read_clipboard", "pandas.read_clipboard"),
+ ("pandas.io.excel.ExcelFile.parse", "pandas.ExcelFile.parse"),
+ ("pandas.io.excel.read_excel", "pandas.read_excel"),
+ ("pandas.io.gbq.read_gbq", "pandas.read_gbq"),
+ ("pandas.io.html.read_html", "pandas.read_html"),
+ ("pandas.io.json.read_json", "pandas.read_json"),
+ ("pandas.io.parsers.read_csv", "pandas.read_csv"),
+ ("pandas.io.parsers.read_fwf", "pandas.read_fwf"),
+ ("pandas.io.parsers.read_table", "pandas.read_table"),
+ ("pandas.io.pickle.read_pickle", "pandas.read_pickle"),
+ ("pandas.io.pytables.HDFStore.append", "pandas.HDFStore.append"),
+ ("pandas.io.pytables.HDFStore.get", "pandas.HDFStore.get"),
+ ("pandas.io.pytables.HDFStore.put", "pandas.HDFStore.put"),
+ ("pandas.io.pytables.HDFStore.select", "pandas.HDFStore.select"),
+ ("pandas.io.pytables.read_hdf", "pandas.read_hdf"),
+ ("pandas.io.sql.read_sql", "pandas.read_sql"),
+ ("pandas.io.sql.read_frame", "pandas.read_frame"),
+ ("pandas.io.sql.write_frame", "pandas.write_frame"),
+ ("pandas.io.stata.read_stata", "pandas.read_stata"),
]
# Again, tuples of (from_old, to_new)
moved_classes = [
- ('pandas.tseries.resample.Resampler', 'pandas.core.resample.Resampler'),
- ('pandas.formats.style.Styler', 'pandas.io.formats.style.Styler'),
+ ("pandas.tseries.resample.Resampler", "pandas.core.resample.Resampler"),
+ ("pandas.formats.style.Styler", "pandas.io.formats.style.Styler"),
]
for old, new in moved_classes:
# the class itself...
moved_api_pages.append((old, new))
- mod, classname = new.rsplit('.', 1)
+ mod, classname = new.rsplit(".", 1)
klass = getattr(importlib.import_module(mod), classname)
- methods = [x for x in dir(klass)
- if not x.startswith('_') or x in ('__iter__', '__array__')]
+ methods = [
+ x for x in dir(klass) if not x.startswith("_") or x in ("__iter__", "__array__")
+ ]
for method in methods:
# ... and each of its public methods
moved_api_pages.append(
- ("{old}.{method}".format(old=old, method=method),
- "{new}.{method}".format(new=new, method=method))
+ (
+ "{old}.{method}".format(old=old, method=method),
+ "{new}.{method}".format(new=new, method=method),
+ )
)
if pattern is None:
html_additional_pages = {
- 'generated/' + page[0]: 'api_redirect.html'
- for page in moved_api_pages
+ "generated/" + page[0]: "api_redirect.html" for page in moved_api_pages
}
@@ -313,19 +315,20 @@
import numpy as np
import pandas as pd
- randn = np.random.randn
np.random.seed(123456)
np.set_printoptions(precision=4, suppress=True)
pd.options.display.max_rows = 15
import os
os.chdir(r'{}')
-""".format(os.path.dirname(os.path.dirname(__file__)))
+""".format(
+ os.path.dirname(os.path.dirname(__file__))
+)
html_context = {
- 'redirects': {old: new for old, new in moved_api_pages},
- 'header': header
+ "redirects": {old: new for old, new in moved_api_pages},
+ "header": header,
}
# If false, no module index is generated.
@@ -349,7 +352,7 @@
# html_file_suffix = ''
# Output file base name for HTML help builder.
-htmlhelp_basename = 'pandas'
+htmlhelp_basename = "pandas"
# -- Options for nbsphinx ------------------------------------------------
@@ -368,9 +371,13 @@
# Grouping the document tree into LaTeX files. List of tuples (source start
# file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
- ('index', 'pandas.tex',
- 'pandas: powerful Python data analysis toolkit',
- r'Wes McKinney\n\& PyData Development Team', 'manual'),
+ (
+ "index",
+ "pandas.tex",
+ "pandas: powerful Python data analysis toolkit",
+ r"Wes McKinney\n\& PyData Development Team",
+ "manual",
+ )
]
# The name of an image file (relative to this directory) to place at the top of
@@ -393,98 +400,41 @@
if pattern is None:
intersphinx_mapping = {
- 'dateutil': ("https://dateutil.readthedocs.io/en/latest/", None),
- 'matplotlib': ('https://matplotlib.org/', None),
- 'numpy': ('https://docs.scipy.org/doc/numpy/', None),
- 'pandas-gbq': ('https://pandas-gbq.readthedocs.io/en/latest/', None),
- 'py': ('https://pylib.readthedocs.io/en/latest/', None),
- 'python': ('https://docs.python.org/3/', None),
- 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None),
- 'statsmodels': ('http://www.statsmodels.org/devel/', None),
+ "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
+ "matplotlib": ("https://matplotlib.org/", None),
+ "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+ "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None),
+ "py": ("https://pylib.readthedocs.io/en/latest/", None),
+ "python": ("https://docs.python.org/3/", None),
+ "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
+ "statsmodels": ("http://www.statsmodels.org/devel/", None),
}
# extlinks alias
-extlinks = {'issue': ('https://github.com/pandas-dev/pandas/issues/%s',
- 'GH'),
- 'wiki': ('https://github.com/pandas-dev/pandas/wiki/%s',
- 'wiki ')}
+extlinks = {
+ "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"),
+ "wiki": ("https://github.com/pandas-dev/pandas/wiki/%s", "wiki "),
+}
ipython_warning_is_error = False
ipython_exec_lines = [
- 'import numpy as np',
- 'import pandas as pd',
+ "import numpy as np",
+ "import pandas as pd",
# This ensures correct rendering on system with console encoding != utf8
# (windows). It forces pandas to encode its output reprs using utf8
# wherever the docs are built. The docs' target is the browser, not
# the console, so this is fine.
- 'pd.options.display.encoding="utf8"'
+ 'pd.options.display.encoding="utf8"',
]
-def sphinxdocstring_str(self, indent=0, func_role="obj"):
- # Pandas displays Attributes section in style like Methods section
-
- # Function is copy of `SphinxDocString.__str__`
- ns = {
- 'signature': self._str_signature(),
- 'index': self._str_index(),
- 'summary': self._str_summary(),
- 'extended_summary': self._str_extended_summary(),
- 'parameters': self._str_param_list('Parameters'),
- 'returns': self._str_returns('Returns'),
- 'yields': self._str_returns('Yields'),
- 'other_parameters': self._str_param_list('Other Parameters'),
- 'raises': self._str_param_list('Raises'),
- 'warns': self._str_param_list('Warns'),
- 'warnings': self._str_warnings(),
- 'see_also': self._str_see_also(func_role),
- 'notes': self._str_section('Notes'),
- 'references': self._str_references(),
- 'examples': self._str_examples(),
- # Replaced `self._str_param_list('Attributes', fake_autosummary=True)`
- # with `self._str_member_list('Attributes')`
- 'attributes': self._str_member_list('Attributes'),
- 'methods': self._str_member_list('Methods'),
- }
- ns = {k: '\n'.join(v) for k, v in ns.items()}
-
- rendered = self.template.render(**ns)
- return '\n'.join(self._str_indent(rendered.split('\n'), indent))
-
-
-SphinxDocString.__str__ = sphinxdocstring_str
-
-
-# Fix "WARNING: Inline strong start-string without end-string."
-# PR #155 "Escape the * in *args and **kwargs" from numpydoc
-# Can be removed after PR merges in v0.9.0
-def decorate_process_param(func):
- def _escape_args_and_kwargs(name):
- if name[:2] == '**':
- return r'\*\*' + name[2:]
- elif name[:1] == '*':
- return r'\*' + name[1:]
- else:
- return name
-
- def func_wrapper(self, param, desc, fake_autosummary):
- param = _escape_args_and_kwargs(param.strip())
- return func(self, param, desc, fake_autosummary)
-
- return func_wrapper
-
-
-func = SphinxDocString._process_param
-SphinxDocString._process_param = decorate_process_param(func)
-
# Add custom Documenter to handle attributes/methods of an AccessorProperty
# eg pandas.Series.str and pandas.Series.dt (see GH9322)
import sphinx
from sphinx.util import rpartition
-from sphinx.ext.autodoc import (
- Documenter, MethodDocumenter, AttributeDocumenter)
+from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter
from sphinx.ext.autosummary import Autosummary
@@ -492,8 +442,9 @@ class AccessorDocumenter(MethodDocumenter):
"""
Specialized Documenter subclass for accessors.
"""
- objtype = 'accessor'
- directivetype = 'method'
+
+ objtype = "accessor"
+ directivetype = "method"
# lower than MethodDocumenter so this is not chosen for normal methods
priority = 0.6
@@ -501,7 +452,7 @@ class AccessorDocumenter(MethodDocumenter):
def format_signature(self):
# this method gives an error/warning for the accessors, therefore
# overriding it (accessor has no arguments)
- return ''
+ return ""
class AccessorLevelDocumenter(Documenter):
@@ -509,6 +460,7 @@ class AccessorLevelDocumenter(Documenter):
Specialized Documenter subclass for objects on accessor level (methods,
attributes).
"""
+
# This is the simple straightforward version
# modname is None, base the last elements (eg 'hour')
# and path the part before (eg 'Series.dt')
@@ -521,41 +473,40 @@ class AccessorLevelDocumenter(Documenter):
def resolve_name(self, modname, parents, path, base):
if modname is None:
if path:
- mod_cls = path.rstrip('.')
+ mod_cls = path.rstrip(".")
else:
mod_cls = None
# if documenting a class-level object without path,
# there must be a current class, either from a parent
# auto directive ...
- mod_cls = self.env.temp_data.get('autodoc:class')
+ mod_cls = self.env.temp_data.get("autodoc:class")
# ... or from a class directive
if mod_cls is None:
- mod_cls = self.env.temp_data.get('py:class')
+ mod_cls = self.env.temp_data.get("py:class")
# ... if still None, there's no way to know
if mod_cls is None:
return None, []
# HACK: this is added in comparison to ClassLevelDocumenter
# mod_cls still exists of class.accessor, so an extra
# rpartition is needed
- modname, accessor = rpartition(mod_cls, '.')
- modname, cls = rpartition(modname, '.')
+ modname, accessor = rpartition(mod_cls, ".")
+ modname, cls = rpartition(modname, ".")
parents = [cls, accessor]
# if the module name is still missing, get it like above
if not modname:
- modname = self.env.temp_data.get('autodoc:module')
+ modname = self.env.temp_data.get("autodoc:module")
if not modname:
- if sphinx.__version__ > '1.3':
- modname = self.env.ref_context.get('py:module')
+ if sphinx.__version__ > "1.3":
+ modname = self.env.ref_context.get("py:module")
else:
- modname = self.env.temp_data.get('py:module')
+ modname = self.env.temp_data.get("py:module")
# ... else, it stays None, which means invalid
return modname, parents + [base]
-class AccessorAttributeDocumenter(AccessorLevelDocumenter,
- AttributeDocumenter):
- objtype = 'accessorattribute'
- directivetype = 'attribute'
+class AccessorAttributeDocumenter(AccessorLevelDocumenter, AttributeDocumenter):
+ objtype = "accessorattribute"
+ directivetype = "attribute"
# lower than AttributeDocumenter so this is not chosen for normal
# attributes
@@ -563,8 +514,8 @@ class AccessorAttributeDocumenter(AccessorLevelDocumenter,
class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter):
- objtype = 'accessormethod'
- directivetype = 'method'
+ objtype = "accessormethod"
+ directivetype = "method"
# lower than MethodDocumenter so this is not chosen for normal methods
priority = 0.6
@@ -575,14 +526,15 @@ class AccessorCallableDocumenter(AccessorLevelDocumenter, MethodDocumenter):
This documenter lets us removes .__call__ from the method signature for
callable accessors like Series.plot
"""
- objtype = 'accessorcallable'
- directivetype = 'method'
+
+ objtype = "accessorcallable"
+ directivetype = "method"
# lower than MethodDocumenter; otherwise the doc build prints warnings
priority = 0.5
def format_name(self):
- return MethodDocumenter.format_name(self).rstrip('.__call__')
+ return MethodDocumenter.format_name(self).rstrip(".__call__")
class PandasAutosummary(Autosummary):
@@ -590,15 +542,16 @@ class PandasAutosummary(Autosummary):
This alternative autosummary class lets us override the table summary for
Series.plot and DataFrame.plot in the API docs.
"""
+
def _replace_pandas_items(self, display_name, sig, summary, real_name):
# this a hack: ideally we should extract the signature from the
# .__call__ method instead of hard coding this
- if display_name == 'DataFrame.plot':
- sig = '([x, y, kind, ax, ....])'
- summary = 'DataFrame plotting accessor and method'
- elif display_name == 'Series.plot':
- sig = '([kind, ax, figsize, ....])'
- summary = 'Series plotting accessor and method'
+ if display_name == "DataFrame.plot":
+ sig = "([x, y, kind, ax, ....])"
+ summary = "DataFrame plotting accessor and method"
+ elif display_name == "Series.plot":
+ sig = "([kind, ax, figsize, ....])"
+ summary = "Series plotting accessor and method"
return (display_name, sig, summary, real_name)
@staticmethod
@@ -607,15 +560,15 @@ def _is_deprecated(real_name):
obj, parent, modname = _import_by_name(real_name)
except ImportError:
return False
- doc = NumpyDocString(obj.__doc__ or '')
- summary = ''.join(doc['Summary'] + doc['Extended Summary'])
- return '.. deprecated::' in summary
+ doc = NumpyDocString(obj.__doc__ or "")
+ summary = "".join(doc["Summary"] + doc["Extended Summary"])
+ return ".. deprecated::" in summary
def _add_deprecation_prefixes(self, items):
for item in items:
display_name, sig, summary, real_name = item
if self._is_deprecated(real_name):
- summary = '(DEPRECATED) %s' % summary
+ summary = "(DEPRECATED) %s" % summary
yield display_name, sig, summary, real_name
def get_items(self, names):
@@ -630,18 +583,18 @@ def linkcode_resolve(domain, info):
"""
Determine the URL corresponding to Python object
"""
- if domain != 'py':
+ if domain != "py":
return None
- modname = info['module']
- fullname = info['fullname']
+ modname = info["module"]
+ fullname = info["fullname"]
submod = sys.modules.get(modname)
if submod is None:
return None
obj = submod
- for part in fullname.split('.'):
+ for part in fullname.split("."):
try:
obj = getattr(obj, part)
except AttributeError:
@@ -670,12 +623,14 @@ def linkcode_resolve(domain, info):
fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__))
- if '+' in pandas.__version__:
- return ("http://github.com/pandas-dev/pandas/blob/master/pandas/"
- "{}{}".format(fn, linespec))
+ if "+" in pandas.__version__:
+ return "http://github.com/pandas-dev/pandas/blob/master/pandas/" "{}{}".format(
+ fn, linespec
+ )
else:
- return ("http://github.com/pandas-dev/pandas/blob/"
- "v{}/pandas/{}{}".format(pandas.__version__, fn, linespec))
+ return "http://github.com/pandas-dev/pandas/blob/" "v{}/pandas/{}{}".format(
+ pandas.__version__, fn, linespec
+ )
# remove the docstring of the flags attribute (inherited from numpy ndarray)
@@ -699,7 +654,7 @@ def process_class_docstrings(app, what, name, obj, options, lines):
"""
if what == "class":
- joined = '\n'.join(lines)
+ joined = "\n".join(lines)
templates = [
""".. rubric:: Attributes
@@ -715,25 +670,25 @@ def process_class_docstrings(app, what, name, obj, options, lines):
:toctree:
None
-"""
+""",
]
for template in templates:
if template in joined:
- joined = joined.replace(template, '')
- lines[:] = joined.split('\n')
+ joined = joined.replace(template, "")
+ lines[:] = joined.split("\n")
suppress_warnings = [
# We "overwrite" autosummary with our PandasAutosummary, but
# still want the regular autosummary setup to run. So we just
# suppress this warning.
- 'app.add_directive'
+ "app.add_directive"
]
if pattern:
# When building a single document we don't want to warn because references
# to other documents are unknown, as it's expected
- suppress_warnings.append('ref.ref')
+ suppress_warnings.append("ref.ref")
def rstjinja(app, docname, source):
@@ -742,12 +697,10 @@ def rstjinja(app, docname, source):
"""
# http://ericholscher.com/blog/2016/jul/25/integrating-jinja-rst-sphinx/
# Make sure we're outputting HTML
- if app.builder.format != 'html':
+ if app.builder.format != "html":
return
src = source[0]
- rendered = app.builder.templates.render_string(
- src, app.config.html_context
- )
+ rendered = app.builder.templates.render_string(src, app.config.html_context)
source[0] = rendered
@@ -759,4 +712,4 @@ def setup(app):
app.add_autodocumenter(AccessorAttributeDocumenter)
app.add_autodocumenter(AccessorMethodDocumenter)
app.add_autodocumenter(AccessorCallableDocumenter)
- app.add_directive('autosummary', PandasAutosummary)
+ app.add_directive("autosummary", PandasAutosummary)
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index ba0558cff07eb..be6555b2ab936 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -127,35 +127,24 @@ to build the documentation locally before pushing your changes.
.. _contributing.dev_c:
-Installing a C Compiler
+Installing a C compiler
~~~~~~~~~~~~~~~~~~~~~~~
Pandas uses C extensions (mostly written using Cython) to speed up certain
operations. To install pandas from source, you need to compile these C
extensions, which means you need a C compiler. This process depends on which
-platform you're using. Follow the `CPython contributing guide
-`_ for getting a
-compiler installed. You don't need to do any of the ``./configure`` or ``make``
-steps; you only need to install the compiler.
-
-For Windows developers, when using Python 3.5 and later, it is sufficient to
-install `Visual Studio 2017 `_ with the
-**Python development workload** and the **Python native development tools**
-option. Otherwise, the following links may be helpful.
-
-* https://blogs.msdn.microsoft.com/pythonengineering/2017/03/07/python-support-in-vs2017/
-* https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/
-* https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit
-* https://cowboyprogrammer.org/building-python-wheels-for-windows/
-* https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/
-* https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy
+platform you're using.
+
+* Windows: https://devguide.python.org/setup/#windows-compiling
+* Mac: https://devguide.python.org/setup/#macos
+* Unix: https://devguide.python.org/setup/#unix-compiling
Let us know if you have any difficulties by opening an issue or reaching out on
`Gitter`_.
.. _contributing.dev_python:
-Creating a Python Environment
+Creating a Python environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Now that you have a C compiler, create an isolated pandas development
@@ -178,7 +167,6 @@ We'll now kick off a three-step process:
# Create and activate the build environment
conda env create -f environment.yml
conda activate pandas-dev
- conda uninstall --force pandas
# or with older versions of Anaconda:
source activate pandas-dev
@@ -209,7 +197,7 @@ See the full conda docs `here `__.
.. _contributing.pip:
-Creating a Python Environment (pip)
+Creating a Python environment (pip)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If you aren't using conda for your development environment, follow these instructions.
@@ -221,7 +209,7 @@ You'll need to have at least python3.5 installed on your system.
# Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev
# Any parent directories should already exist
python3 -m venv ~/virtualenvs/pandas-dev
- # Activate the virtulaenv
+ # Activate the virtualenv
. ~/virtualenvs/pandas-dev/bin/activate
# Install the build dependencies
@@ -289,7 +277,7 @@ complex changes to the documentation as well.
Some other important things to know about the docs:
* The *pandas* documentation consists of two parts: the docstrings in the code
- itself and the docs in this folder ``pandas/doc/``.
+ itself and the docs in this folder ``doc/``.
The docstrings provide a clear explanation of the usage of the individual
functions, while the documentation in this folder consists of tutorial-like
@@ -405,11 +393,11 @@ Building the documentation
~~~~~~~~~~~~~~~~~~~~~~~~~~
So how do you build the docs? Navigate to your local
-``pandas/doc/`` directory in the console and run::
+``doc/`` directory in the console and run::
python make.py html
-Then you can find the HTML output in the folder ``pandas/doc/build/html/``.
+Then you can find the HTML output in the folder ``doc/build/html/``.
The first time you build the docs, it will take quite a while because it has to run
all the code examples and build all the generated docstring pages. In subsequent
@@ -449,7 +437,7 @@ You can also specify to use multiple cores to speed up the documentation build::
Open the following file in a web browser to see the full documentation you
just built::
- pandas/docs/build/html/index.html
+ doc/build/html/index.html
And you'll have the satisfaction of seeing your new and improved documentation!
@@ -460,7 +448,7 @@ Building master branch documentation
When pull requests are merged into the *pandas* ``master`` branch, the main parts of
the documentation are also built by Travis-CI. These docs are then hosted `here
-`__, see also
+`__, see also
the :ref:`Continuous Integration ` section.
.. _contributing.code:
@@ -499,6 +487,21 @@ as possible to avoid mass breakages.
Additional standards are outlined on the `code style wiki
page `_.
+Optional dependencies
+---------------------
+
+Optional dependencies (e.g. matplotlib) should be imported with the private helper
+``pandas.compat._optional.import_optional_dependency``. This ensures a
+consistent error message when the dependency is not met.
+
+All methods using an optional dependency should include a test asserting that an
+``ImportError`` is raised when the optional dependency is not found. This test
+should be skipped if the library is present.
+
+All optional dependencies should be documented in
+:ref:`install.optional_dependencies` and the minimum required version should be
+set in the ``pandas.compat._optional.VERSIONS`` dict.
+
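+A minimal sketch of the import convention and the accompanying test described
+above (``maybe_plot`` is a hypothetical function used only for illustration;
+matplotlib stands in for any optional dependency):
+
+.. code-block:: python
+
+    import importlib.util
+
+    import pytest
+
+    from pandas.compat._optional import import_optional_dependency
+
+    def maybe_plot(values):
+        # Raises a consistently worded ImportError when matplotlib is missing
+        mpl = import_optional_dependency("matplotlib")
+        ...  # use mpl to do the actual plotting
+
+    # The matching test asserts the ImportError and is skipped when the
+    # optional dependency is actually installed
+    @pytest.mark.skipif(
+        importlib.util.find_spec("matplotlib") is not None,
+        reason="matplotlib is installed",
+    )
+    def test_maybe_plot_raises_without_matplotlib():
+        with pytest.raises(ImportError, match="matplotlib"):
+            maybe_plot([1, 2, 3])
+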
C (cpplint)
~~~~~~~~~~~
@@ -548,23 +551,38 @@ many errors as possible, but it may not correct *all* of them. Thus, it is
recommended that you run ``cpplint`` to double check and make any other style
fixes manually.
-Python (PEP8)
-~~~~~~~~~~~~~
-
-*pandas* uses the `PEP8 `_ standard.
-There are several tools to ensure you abide by this standard. Here are *some* of
-the more common ``PEP8`` issues:
+Python (PEP8 / black)
+~~~~~~~~~~~~~~~~~~~~~
-* we restrict line-length to 79 characters to promote readability
-* passing arguments should have spaces after commas, e.g. ``foo(arg1, arg2, kw1='bar')``
+*pandas* follows the `PEP8 `_ standard
+and uses `Black `_ and
+`Flake8 `_ to ensure a consistent code
+format throughout the project.
-:ref:`Continuous Integration ` will run
-the `flake8 `_ tool
-and report any stylistic errors in your code. Therefore, it is helpful before
-submitting code to run the check yourself on the diff::
+:ref:`Continuous Integration ` will run those tools and
+report any stylistic errors in your code. Therefore, it is helpful before
+submitting code to run the check yourself::
+ black pandas
git diff upstream/master -u -- "*.py" | flake8 --diff
+to auto-format your code. Additionally, many editors have plugins that will
+apply ``black`` as you edit files.
+
+Optionally, you may wish to set up `pre-commit hooks `_
+to automatically run ``black`` and ``flake8`` when you make a git commit. This
+can be done by installing ``pre-commit``::
+
+ pip install pre-commit
+
+and then running::
+
+ pre-commit install
+
+from the root of the pandas repository. Now ``black`` and ``flake8`` will be run
+each time you commit changes. You can skip these checks with
+``git commit --no-verify``.
+
This command will catch any stylistic errors in your changes specifically, but
beware that it may not catch all of them. For example, if you delete the only
usage of an imported function, it is stylistically incorrect to import an
@@ -590,7 +608,7 @@ and run ``flake8`` on them, one after the other.
.. _contributing.import-formatting:
-Import Formatting
+Import formatting
~~~~~~~~~~~~~~~~~
*pandas* uses `isort `__ to standardise import
formatting across the codebase.
@@ -636,7 +654,7 @@ The `--recursive` flag can be passed to sort all files in a directory.
You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `.
-Backwards Compatibility
+Backwards compatibility
~~~~~~~~~~~~~~~~~~~~~~~
Please try to maintain backward compatibility. *pandas* has lots of users with lots of
@@ -681,10 +699,140 @@ You'll also need to
See :ref:`contributing.warnings` for more.
+.. _contributing.type_hints:
+
+Type Hints
+----------
+
+*pandas* strongly encourages the use of :pep:`484` style type hints. New development should contain type hints, and pull requests to annotate existing code are accepted as well!
+
+Style Guidelines
+~~~~~~~~~~~~~~~~
+
+Type imports should follow the ``from typing import ...`` convention. So rather than
+
+.. code-block:: python
+
+ import typing
+
+ primes = [] # type: typing.List[int]
+
+You should write
+
+.. code-block:: python
+
+ from typing import List, Optional, Union
+
+ primes = [] # type: List[int]
+
+``Optional`` should be used where applicable, so instead of
+
+.. code-block:: python
+
+ maybe_primes = [] # type: List[Union[int, None]]
+
+You should write
+
+.. code-block:: python
+
+ maybe_primes = [] # type: List[Optional[int]]
+
+In some cases in the code base, classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that within your annotation. For example, if you come across a definition like
+
+.. code-block:: python
+
+    class SomeClass1:
+        str = None
+
+The appropriate way to annotate this would be as follows
+
+.. code-block:: python
+
+    str_type = str
+
+    class SomeClass2:
+        str = None  # type: str_type
+
+In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example
+
+.. code-block:: python
+
+    from typing import cast
+
+    from pandas.core.dtypes.common import is_number
+
+    def cannot_infer_bad(obj: Union[str, int, float]):
+
+        if is_number(obj):
+            ...
+        else:  # Reasonably only str objects would reach this but...
+            obj = cast(str, obj)  # Mypy complains without this!
+            return obj.upper()
+
+The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types, mypy cannot make that same inference just yet (see `mypy #5206 `_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable, a refactor of the code to appease static analysis is preferable
+
+.. code-block:: python
+
+    def cannot_infer_good(obj: Union[str, int, float]):
+
+        if isinstance(obj, str):
+            return obj.upper()
+        else:
+            ...
+
+With custom types and inference this is not always possible, so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths.
+
+Syntax Requirements
+~~~~~~~~~~~~~~~~~~~
+
+Because *pandas* still supports Python 3.5, :pep:`526` does not apply and variables **must** be annotated with type comments. Specifically, this is a valid annotation within pandas:
+
+.. code-block:: python
+
+ primes = [] # type: List[int]
+
+Whereas this is **NOT** allowed:
+
+.. code-block:: python
+
+ primes: List[int] = [] # not supported in Python 3.5!
+
+Note that function signatures can always be annotated per :pep:`3107`:
+
+.. code-block:: python
+
+    def sum_of_primes(primes: List[int] = []) -> int:
+        ...
+
+
+Pandas-specific Types
+~~~~~~~~~~~~~~~~~~~~~
+
+Commonly used types specific to *pandas* will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas.
+
+For example, quite a few functions in *pandas* accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the ``pandas._typing`` module:
+
+.. code-block:: python
+
+    from pandas._typing import Dtype
+
+    def as_type(dtype: Dtype) -> ...:
+        ...
+
+This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc. and can also hold aliases for commonly appearing parameters like ``axis``. Development of this module is active, so be sure to refer to the source for the most up-to-date list of available types.
+
+Validating Type Hints
+~~~~~~~~~~~~~~~~~~~~~
+
+*pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running
+
+.. code-block:: shell
+
+ mypy pandas
.. _contributing.ci:
-Testing With Continuous Integration
+Testing with continuous integration
-----------------------------------
The *pandas* test suite will run automatically on `Travis-CI `__ and
@@ -915,7 +1063,7 @@ options or subtle interactions to test (or think of!) all of them.
.. _contributing.warnings:
-Testing Warnings
+Testing warnings
~~~~~~~~~~~~~~~~
By default, one of pandas CI workers will fail if any unhandled warnings are emitted.
diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst
index f7e2b42a1ccbd..34bc5f44eb0c0 100644
--- a/doc/source/development/contributing_docstring.rst
+++ b/doc/source/development/contributing_docstring.rst
@@ -522,7 +522,7 @@ examples:
* ``loc`` and ``iloc``, as they do the same, but in one case providing indices
and in the other positions
* ``max`` and ``min``, as they do the opposite
-* ``iterrows``, ``itertuples`` and ``iteritems``, as it is easy that a user
+* ``iterrows``, ``itertuples`` and ``items``, as it is easy that a user
looking for the method to iterate over columns ends up in the method to
iterate over rows, and vice-versa
* ``fillna`` and ``dropna``, as both methods are used to handle missing values
@@ -929,7 +929,7 @@ plot will be generated automatically when building the documentation.
.. _docstring.sharing:
-Sharing Docstrings
+Sharing docstrings
------------------
Pandas has a system for sharing docstrings, with slight variations, between
diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index a283920ae4377..923ef005d5926 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -37,12 +37,19 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
.. code-block:: text
- {'index_columns': ['__index_level_0__', '__index_level_1__', ...],
+ {'index_columns': [<descr0>, <descr1>, ...],
'column_indexes': [<ci0>, <ci1>, ..., <ciN>],
'columns': [<c0>, <c1>, ...],
- 'pandas_version': $VERSION}
+ 'pandas_version': $VERSION,
+ 'creator': {
+ 'library': $LIBRARY,
+ 'version': $LIBRARY_VERSION
+ }}
-Here, ``<c0>``/``<ci0>`` and so forth are dictionaries containing the metadata
+The "descriptor" values ``<descr0>`` in the ``'index_columns'`` field are
+strings (referring to a column) or dictionaries with values as described below.
+
+The ``<c0>``/``<ci0>`` and so forth are dictionaries containing the metadata
for each column, *including the index columns*. This has JSON form:
.. code-block:: text
@@ -53,26 +60,37 @@ for each column, *including the index columns*. This has JSON form:
'numpy_type': numpy_type,
'metadata': metadata}
-.. note::
+See below for the detailed specification for these.
+
+Index Metadata Descriptors
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``RangeIndex`` can be stored as metadata only, not requiring serialization. The
+descriptor format for these is as follows:
- Every index column is stored with a name matching the pattern
- ``__index_level_\d+__`` and its corresponding column information is can be
- found with the following code snippet.
+.. code-block:: python
- Following this naming convention isn't strictly necessary, but strongly
- suggested for compatibility with Arrow.
+    index = pd.RangeIndex(0, 10, 2)
+    {'kind': 'range',
+     'name': index.name,
+     'start': index.start,
+     'stop': index.stop,
+     'step': index.step}
- Here's an example of how the index metadata is structured in pyarrow:
+Other index types must be serialized as data columns along with the other
+DataFrame columns. The metadata for these is a string indicating the name of
+the field in the data columns, for example ``'__index_level_0__'``.
- .. code-block:: python
+If an index has a non-None ``name`` attribute, and there is no other column
+with a name matching that value, then the ``index.name`` value can be used as
+the descriptor. Otherwise (for unnamed indexes and ones with names colliding
+with other column names) a disambiguating name matching the pattern
+``__index_level_\d+__`` should be used. When a named index is stored as a data
+column, its ``name`` attribute is always stored in the column descriptors as
+above.
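+
+As a purely hypothetical illustration of the two cases above (first a named,
+non-colliding ``date`` index, then an unnamed index stored as a data column),
+the ``'index_columns'`` field would read:
+
+.. code-block:: text
+
+    {'index_columns': ['date'], ...}
+    {'index_columns': ['__index_level_0__'], ...}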
- # assuming there's at least 3 levels in the index
- index_columns = metadata['index_columns'] # noqa: F821
- columns = metadata['columns'] # noqa: F821
- ith_index = 2
- assert index_columns[ith_index] == '__index_level_2__'
- ith_index_info = columns[-len(index_columns):][ith_index]
- ith_index_level_name = ith_index_info['name']
+Column Metadata
+~~~~~~~~~~~~~~~
``pandas_type`` is the logical type of the column, and is one of:
@@ -161,4 +179,8 @@ As an example of fully-formed metadata:
'numpy_type': 'int64',
'metadata': None}
],
- 'pandas_version': '0.20.0'}
+ 'pandas_version': '0.20.0',
+ 'creator': {
+ 'library': 'pyarrow',
+ 'version': '0.13.0'
+ }}
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 8bee0452c2207..e341dcb8318bc 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -3,7 +3,7 @@
{{ header }}
****************
-Extending Pandas
+Extending pandas
****************
While pandas provides a rich set of methods, containers, and data types, your
@@ -12,7 +12,7 @@ pandas.
.. _extending.register-accessors:
-Registering Custom Accessors
+Registering custom accessors
----------------------------
Libraries can use the decorators
@@ -70,7 +70,7 @@ applies only to certain dtypes.
.. _extending.extension-types:
-Extension Types
+Extension types
---------------
.. versionadded:: 0.23.0
@@ -208,9 +208,28 @@ will
2. call ``result = op(values, ExtensionArray)``
3. re-box the result in a ``Series``
+.. _extending.extension.ufunc:
+
+NumPy Universal Functions
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`Series` implements ``__array_ufunc__``. As part of the implementation,
+pandas unboxes the ``ExtensionArray`` from the :class:`Series`, applies the ufunc,
+and re-boxes it if necessary.
+
+If applicable, we highly recommend that you implement ``__array_ufunc__`` in your
+extension array to avoid coercion to an ndarray. See
+`the numpy documentation `__
+for an example.
+
+As part of your implementation, we require that you defer to pandas when a pandas
+container (:class:`Series`, :class:`DataFrame`, :class:`Index`) is detected in ``inputs``.
+If any of those is present, you should return ``NotImplemented``. Pandas will take care of
+unboxing the array from the container and re-calling the ufunc with the unwrapped input.
+
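+A compact, hypothetical sketch of that deferral (names like ``MyExtensionArray``
+and ``_ndarray`` are illustrative only, not part of the pandas API):
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+
+    class MyExtensionArray:
+        def __init__(self, values):
+            self._ndarray = np.asarray(values)
+
+        def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+            # Defer to pandas containers; pandas unboxes the array and
+            # re-calls the ufunc with the unwrapped inputs.
+            if any(
+                isinstance(x, (pd.Series, pd.DataFrame, pd.Index)) for x in inputs
+            ):
+                return NotImplemented
+            # Unwrap our own instances, apply the ufunc, and re-box the result
+            arrays = [
+                x._ndarray if isinstance(x, MyExtensionArray) else x for x in inputs
+            ]
+            result = getattr(ufunc, method)(*arrays, **kwargs)
+            return type(self)(result)
+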
.. _extending.extension.testing:
-Testing Extension Arrays
+Testing extension arrays
^^^^^^^^^^^^^^^^^^^^^^^^
We provide a test suite for ensuring that your extension arrays satisfy the expected
@@ -238,7 +257,7 @@ for a list of all the tests available.
.. _extending.subclassing-pandas:
-Subclassing pandas Data Structures
+Subclassing pandas data structures
----------------------------------
.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures.
@@ -260,7 +279,7 @@ This section describes how to subclass ``pandas`` data structures to meet more s
You can find a nice example in `geopandas `_ project.
-Override Constructor Properties
+Override constructor properties
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Each data structure has several *constructor properties* for returning a new
@@ -348,7 +367,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
>>> type(sliced2)
-Define Original Properties
+Define original properties
^^^^^^^^^^^^^^^^^^^^^^^^^^
To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways:
@@ -397,3 +416,47 @@ Below is an example to define two original properties, "internal_cache" as a tem
# properties defined in _metadata are retained
>>> df[['A', 'B']].added_property
property
+
+.. _extending.plotting-backends:
+
+Plotting backends
+-----------------
+
+Starting in 0.25, pandas can be extended with third-party plotting backends. The
+main idea is letting users select a plotting backend different from the provided
+one based on Matplotlib. For example:
+
+.. code-block:: python
+
+ >>> pd.set_option('plotting.backend', 'backend.module')
+ >>> pd.Series([1, 2, 3]).plot()
+
+This would be more or less equivalent to:
+
+.. code-block:: python
+
+ >>> import backend.module
+ >>> backend.module.plot(pd.Series([1, 2, 3]))
+
+The backend module can then use other visualization tools (Bokeh, Altair,...)
+to generate the plots.
+
+Libraries implementing the plotting backend should use `entry points `__
+to make their backend discoverable to pandas. The key is ``"pandas_plotting_backends"``. For example, pandas
+registers the default "matplotlib" backend as follows.
+
+.. code-block:: python
+
+    # in setup.py
+    setup(  # noqa: F821
+        ...,
+        entry_points={
+            "pandas_plotting_backends": [
+                "matplotlib = pandas:plotting._matplotlib",
+            ],
+        },
+    )
+
+
+More information on how to implement a third-party plotting backend can be found at
+https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.
diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst
index a149f31118ed5..c7710ff19f078 100644
--- a/doc/source/development/index.rst
+++ b/doc/source/development/index.rst
@@ -16,3 +16,4 @@ Development
internals
extending
developer
+ roadmap
diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst
index 9c434928c214e..748caae295460 100644
--- a/doc/source/development/internals.rst
+++ b/doc/source/development/internals.rst
@@ -102,7 +102,7 @@ So, for example, ``Series[category]._values`` is a ``Categorical``, while
.. _ref-subclassing-pandas:
-Subclassing pandas Data Structures
+Subclassing pandas data structures
----------------------------------
This section has been moved to :ref:`extending.subclassing-pandas`.
diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst
new file mode 100644
index 0000000000000..00598830e2fe9
--- /dev/null
+++ b/doc/source/development/roadmap.rst
@@ -0,0 +1,193 @@
+.. _roadmap:
+
+=======
+Roadmap
+=======
+
+This page provides an overview of the major themes in pandas' development. Each of
+these items requires a relatively large amount of effort to implement. These may
+be achieved more quickly with dedicated funding or interest from contributors.
+
+An item being on the roadmap does not mean that it will *necessarily* happen, even
+with unlimited funding. During the implementation period we may discover issues
+preventing the adoption of the feature.
+
+Additionally, an item *not* being on the roadmap does not exclude it from inclusion
+in pandas. The roadmap is intended for larger, fundamental changes to the project that
+are likely to take months or years of developer time. Smaller-scoped items will continue
+to be tracked on our `issue tracker `__.
+
+See :ref:`roadmap.evolution` for proposing changes to this document.
+
+Extensibility
+-------------
+
+Pandas :ref:`extending.extension-types` allow for extending NumPy types with custom
+data types and array storage. Pandas uses extension types internally, and provides
+an interface for 3rd-party libraries to define their own custom data types.
+
+Many parts of pandas still unintentionally convert data to a NumPy array.
+These problems are especially pronounced for nested data.
+
+We'd like to improve the handling of extension arrays throughout the library,
+making their behavior more consistent with the handling of NumPy arrays. We'll do this
+by cleaning up pandas' internals and adding new methods to the extension array interface.
+
+String data type
+----------------
+
+Currently, pandas stores text data in an ``object`` -dtype NumPy array.
+The current implementation has two primary drawbacks: First, ``object`` -dtype
+is not specific to strings: any Python object can be stored in an ``object`` -dtype
+array, not just strings. Second: this is not efficient. The NumPy memory model
+isn't especially well-suited to variable width text data.
+
+To solve the first issue, we propose a new extension type for string data. This
+will initially be opt-in, with users explicitly requesting ``dtype="string"``.
+The array backing this string dtype may initially be the current implementation:
+an ``object`` -dtype NumPy array of Python strings.
+
+To solve the second issue (performance), we'll explore alternative in-memory
+array libraries (for example, Apache Arrow). As part of the work, we may
+need to implement certain operations expected by pandas users (for example
+the algorithm used in ``Series.str.upper``). That work may be done outside of
+pandas.
+
+Apache Arrow interoperability
+-----------------------------
+
+`Apache Arrow `__ is a cross-language development
+platform for in-memory data. The Arrow logical types are closely aligned with
+typical pandas use cases.
+
+We'd like to provide better-integrated support for Arrow memory and data types
+within pandas. This will let us take advantage of its I/O capabilities and
+provide for better interoperability with other languages and libraries
+using Arrow.
+
+Block manager rewrite
+---------------------
+
+We'd like to replace pandas' current internal data structures (a collection of
+1-D or 2-D arrays) with a simpler collection of 1-D arrays.
+
+Pandas' internal data model is quite complex. A DataFrame is made up of
+one or more 2-dimensional "blocks", with one or more blocks per dtype. This
+collection of 2-D arrays is managed by the BlockManager.
+
+The primary benefit of the BlockManager is improved performance on certain
+operations (construction from a 2D array, binary operations, reductions across the columns),
+especially for wide DataFrames. However, the BlockManager substantially increases the
+complexity and maintenance burden of pandas.
+
+By replacing the BlockManager we hope to achieve
+
+* Substantially simpler code
+* Easier extensibility with new logical types
+* Better user control over memory use and layout
+* Improved micro-performance
+* Option to provide a C / Cython API to pandas' internals
+
+See `these design documents `__
+for more.
+
+Decoupling of indexing and internals
+------------------------------------
+
+The code for getting and setting values in pandas' data structures needs refactoring.
+In particular, we must clearly separate code that converts keys (e.g., the argument
+to ``DataFrame.loc``) to positions from code that uses these positions to get
+or set values. This is related to the proposed BlockManager rewrite. Currently, the
+BlockManager sometimes uses label-based, rather than position-based, indexing.
+We propose that it should only work with positional indexing, and the translation of keys
+to positions should be entirely done at a higher level.
+
+Indexing is a complicated API with many subtleties. This refactor will require care
+and attention. More details are discussed at
+https://github.com/pandas-dev/pandas/wiki/(Tentative)-rules-for-restructuring-indexing-code
+
+Numba-accelerated operations
+----------------------------
+
+`Numba `__ is a JIT compiler for Python code. We'd like to provide
+ways for users to apply their own Numba-jitted functions where pandas accepts user-defined functions
+(for example, :meth:`Series.apply`, :meth:`DataFrame.apply`, :meth:`DataFrame.applymap`,
+and in groupby and window contexts). This will improve the performance of
+user-defined functions in these operations by staying within compiled code.
+
+
+Documentation improvements
+--------------------------
+
+We'd like to improve the content, structure, and presentation of the pandas documentation.
+Some specific goals include
+
+* Overhaul the HTML theme with a modern, responsive design (:issue:`15556`)
+* Improve the "Getting Started" documentation, designing and writing learning paths
+  for users from different backgrounds (e.g. brand new to programming, familiar with
+ other languages like R, already familiar with Python).
+* Improve the overall organization of the documentation and specific subsections
+ of the documentation to make navigation and finding content easier.
+
+Package docstring validation
+----------------------------
+
+To improve the quality and consistency of pandas docstrings, we've developed
+tooling to check docstrings in a variety of ways.
+https://github.com/pandas-dev/pandas/blob/master/scripts/validate_docstrings.py
+contains the checks.
+
+Like many other projects, pandas uses the
+`numpydoc `__ style for writing
+docstrings. With the collaboration of the numpydoc maintainers, we'd like to
+move the checks to a package other than pandas so that other projects can easily
+use them as well.
+
+Performance monitoring
+----------------------
+
+Pandas uses `airspeed velocity `__ to
+monitor for performance regressions. ASV itself is a fabulous tool, but requires
+some additional work to be integrated into an open source project's workflow.
+
+The `asv-runner `__ organization, currently made up
+of pandas maintainers, provides tools built on top of ASV. We have a physical
+machine for running a number of projects' benchmarks, and tools for managing the
+benchmark runs and reporting on results.
+
+We'd like to fund improvements and maintenance of these tools to
+
+* Be more stable. Currently, they're maintained on the nights and weekends when
+ a maintainer has free time.
+* Tune the system for benchmarks to improve stability, following
+ https://pyperf.readthedocs.io/en/latest/system.html
+* Build a GitHub bot to request ASV runs *before* a PR is merged. Currently, the
+ benchmarks are only run nightly.
+
+.. _roadmap.evolution:
+
+Roadmap evolution
+-----------------
+
+Pandas continues to evolve. The direction is primarily determined by community
+interest. Everyone is welcome to review existing items on the roadmap and
+to propose a new item.
+
+Each item on the roadmap should be a short summary of a larger design proposal.
+The proposal should include
+
+1. Short summary of the changes, which would be appropriate for inclusion in
+ the roadmap if accepted.
+2. Motivation for the changes.
+3. An explanation of why the change is in scope for pandas.
+4. Detailed design: preferably with example usage (even if not implemented yet)
+   and API documentation.
+5. API Change: Any API changes that may result from the proposal.
+
+That proposal may then be submitted as a GitHub issue, where the pandas maintainers
+can review and comment on the design. The `pandas mailing list `__
+should be notified of the proposal.
+
+When there's agreement that an implementation
+would be welcome, the roadmap should be updated to include the summary and a
+link to the discussion issue.
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index b1a5430752558..b1e3d8dc8a1ad 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -3,7 +3,7 @@
{{ header }}
****************
-pandas Ecosystem
+Pandas ecosystem
****************
Increasingly, packages are being built on top of pandas to address specific needs
@@ -26,7 +26,7 @@ substantial projects that you feel should be on this list, please let us know.
.. _ecosystem.stats:
-Statistics and Machine Learning
+Statistics and machine learning
-------------------------------
`Statsmodels `__
@@ -72,6 +72,17 @@ the latest web technologies. Its goal is to provide elegant, concise constructio
graphics in the style of Protovis/D3, while delivering high-performance interactivity over
large data to thin clients.
+`Pandas-Bokeh `__ provides a high level API
+for Bokeh that can be loaded as a native Pandas plotting backend via
+
+.. code:: python
+
+ pd.set_option("plotting.backend", "pandas_bokeh")
+
+It is very similar to the matplotlib plotting backend, but provides interactive
+web-based charts and maps.
+
+
`seaborn `__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -243,7 +254,7 @@ you can obtain for free on the FRED website.
.. _ecosystem.domain:
-Domain Specific
+Domain specific
---------------
`Geopandas `__
@@ -332,7 +343,7 @@ and check that they're *actually* true.
.. _ecosystem.extensions:
-Extension Data Types
+Extension data types
--------------------
Pandas provides an interface for defining
diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst
index fdf1f05b8e61f..41520795bde62 100644
--- a/doc/source/getting_started/10min.rst
+++ b/doc/source/getting_started/10min.rst
@@ -3,7 +3,7 @@
{{ header }}
********************
-10 Minutes to pandas
+10 minutes to pandas
********************
This is a short introduction to pandas, geared mainly for new users.
@@ -16,7 +16,7 @@ Customarily, we import as follows:
import numpy as np
import pandas as pd
-Object Creation
+Object creation
---------------
See the :ref:`Data Structure Intro section `.
@@ -83,7 +83,7 @@ As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically
tab completed. ``E`` is there as well; the rest of the attributes have been
truncated for brevity.
-Viewing Data
+Viewing data
------------
See the :ref:`Basics section `.
@@ -183,7 +183,7 @@ Selecting via ``[]``, which slices the rows.
df[0:3]
df['20130102':'20130104']
-Selection by Label
+Selection by label
~~~~~~~~~~~~~~~~~~
See more in :ref:`Selection by Label `.
@@ -224,7 +224,7 @@ For getting fast access to a scalar (equivalent to the prior method):
df.at[dates[0], 'A']
-Selection by Position
+Selection by position
~~~~~~~~~~~~~~~~~~~~~
See more in :ref:`Selection by Position `.
@@ -271,14 +271,14 @@ For getting fast access to a scalar (equivalent to the prior method):
df.iat[1, 1]
-Boolean Indexing
+Boolean indexing
~~~~~~~~~~~~~~~~
Using a single column's values to select data.
.. ipython:: python
- df[df.A > 0]
+ df[df['A'] > 0]
Selecting values from a DataFrame where a boolean condition is met.
@@ -340,7 +340,7 @@ A ``where`` operation with setting.
df2
-Missing Data
+Missing data
------------
pandas primarily uses the value ``np.nan`` to represent missing data. It is by
@@ -468,6 +468,13 @@ Concatenating pandas objects together with :func:`concat`:
pd.concat(pieces)
+.. note::
+ Adding a column to a ``DataFrame`` is relatively fast. However, adding
+ a row requires a copy, and may be expensive. We recommend passing a
+ pre-built list of records to the ``DataFrame`` constructor instead
+ of building a ``DataFrame`` by iteratively appending records to it.
+ See :ref:`Appending to dataframe ` for more.
+
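+For illustration, a minimal sketch of the recommended pattern (the column
+names here are arbitrary):
+
+.. ipython:: python
+
+    # build the records first, then construct the DataFrame in one step
+    records = [{'A': i, 'B': i ** 2} for i in range(4)]
+    pd.DataFrame(records)
+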
Join
~~~~
@@ -491,21 +498,6 @@ Another example that can be given is:
right
pd.merge(left, right, on='key')
-
-Append
-~~~~~~
-
-Append rows to a dataframe. See the :ref:`Appending `
-section.
-
-.. ipython:: python
-
- df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
- df
- s = df.iloc[3]
- df.append(s, ignore_index=True)
-
-
Grouping
--------
@@ -580,7 +572,7 @@ With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the
stacked.unstack(1)
stacked.unstack(0)
-Pivot Tables
+Pivot tables
~~~~~~~~~~~~
See the section on :ref:`Pivot Tables `.
@@ -600,7 +592,7 @@ We can produce pivot tables from this data very easily:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
-Time Series
+Time series
-----------
pandas has simple, powerful, and efficient functionality for performing
@@ -734,7 +726,7 @@ of the columns with labels:
@savefig frame_plot_basic.png
plt.legend(loc='best')
-Getting Data In/Out
+Getting data in/out
-------------------
CSV
diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 5ec0094de0a91..802ffadf2a81e 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -3,7 +3,7 @@
{{ header }}
==============================
- Essential Basic Functionality
+ Essential basic functionality
==============================
Here we discuss a lot of the essential functionality common to the pandas data
@@ -19,7 +19,7 @@ the previous section:
.. _basics.head_tail:
-Head and Tail
+Head and tail
-------------
To view a small sample of a Series or DataFrame object, use the
@@ -34,7 +34,7 @@ of elements to display is five, but you may pass a custom number.
.. _basics.attrs:
-Attributes and Underlying Data
+Attributes and underlying data
------------------------------
pandas objects have a number of attributes enabling you to access the metadata
@@ -286,7 +286,7 @@ using ``fillna`` if you wish).
.. _basics.compare:
-Flexible Comparisons
+Flexible comparisons
~~~~~~~~~~~~~~~~~~~~
Series and DataFrame have the binary comparison methods ``eq``, ``ne``, ``lt``, ``gt``,
@@ -304,7 +304,7 @@ indexing operations, see the section on :ref:`Boolean indexing
.. _basics.reductions:
-Boolean Reductions
+Boolean reductions
~~~~~~~~~~~~~~~~~~
You can apply the reductions: :attr:`~DataFrame.empty`, :meth:`~DataFrame.any`,
@@ -468,7 +468,7 @@ which we illustrate:
df2
df1.combine_first(df2)
-General DataFrame Combine
+General DataFrame combine
~~~~~~~~~~~~~~~~~~~~~~~~~
The :meth:`~DataFrame.combine_first` method above calls the more general
@@ -643,7 +643,7 @@ there for details about accepted inputs.
.. _basics.idxmin:
-Index of Min/Max Values
+Index of min/max values
~~~~~~~~~~~~~~~~~~~~~~~
The :meth:`~DataFrame.idxmin` and :meth:`~DataFrame.idxmax` functions on Series
@@ -677,7 +677,7 @@ matching index:
.. _basics.discretization:
-Value counts (histogramming) / Mode
+Value counts (histogramming) / mode
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The :meth:`~Series.value_counts` Series method and top-level function computes a histogram
@@ -752,7 +752,7 @@ on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise.
.. _basics.pipe:
-Tablewise Function Application
+Tablewise function application
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``DataFrames`` and ``Series`` can of course just be passed into functions.
@@ -784,6 +784,7 @@ In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``.
For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.ols, 'data')`` to ``pipe``:
.. ipython:: python
+ :okwarning:
import statsmodels.formula.api as sm
@@ -806,7 +807,7 @@ We encourage you to view the source code of :meth:`~DataFrame.pipe`.
.. _R: https://www.r-project.org
-Row or Column-wise Function Application
+Row or column-wise function application
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Arbitrary functions can be applied along the axes of a DataFrame
@@ -925,7 +926,7 @@ Single aggregations on a ``Series`` this will return a scalar value:
.. ipython:: python
- tsdf.A.agg('sum')
+ tsdf['A'].agg('sum')
Aggregating with multiple functions
@@ -949,13 +950,13 @@ On a ``Series``, multiple functions return a ``Series``, indexed by the function
.. ipython:: python
- tsdf.A.agg(['sum', 'mean'])
+ tsdf['A'].agg(['sum', 'mean'])
Passing a ``lambda`` function will yield a ``<lambda>`` named row:
.. ipython:: python
- tsdf.A.agg(['sum', lambda x: x.mean()])
+ tsdf['A'].agg(['sum', lambda x: x.mean()])
Passing a named function will yield that name for the row:
@@ -964,7 +965,7 @@ Passing a named function will yield that name for the row:
def mymean(x):
return x.mean()
- tsdf.A.agg(['sum', mymean])
+ tsdf['A'].agg(['sum', mymean])
Aggregating with a dict
+++++++++++++++++++++++
@@ -987,7 +988,7 @@ not noted for a particular column will be ``NaN``:
.. _basics.aggregation.mixed_dtypes:
-Mixed Dtypes
+Mixed dtypes
++++++++++++
When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
@@ -1064,7 +1065,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin
.. ipython:: python
- tsdf.A.transform(np.abs)
+ tsdf['A'].transform(np.abs)
Transform with multiple functions
@@ -1083,7 +1084,7 @@ resulting column names will be the transforming functions.
.. ipython:: python
- tsdf.A.transform([np.abs, lambda x: x + 1])
+ tsdf['A'].transform([np.abs, lambda x: x + 1])
Transforming with a dict
@@ -1106,7 +1107,7 @@ selective transforms.
.. _basics.elementwise:
-Applying Elementwise Functions
+Applying elementwise functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Since not all functions can be vectorized (accept NumPy arrays and return
@@ -1421,8 +1422,6 @@ The :meth:`~DataFrame.rename` method also provides an ``inplace`` named
parameter that is by default ``False`` and copies the underlying data. Pass
``inplace=True`` to rename the data in place.
-.. versionadded:: 0.18.0
-
Finally, :meth:`~Series.rename` also accepts a scalar or list-like
for altering the ``Series.name`` attribute.
@@ -1474,7 +1473,7 @@ Thus, for example, iterating over a DataFrame gives you the column names:
print(col)
-Pandas objects also have the dict-like :meth:`~DataFrame.iteritems` method to
+Pandas objects also have the dict-like :meth:`~DataFrame.items` method to
iterate over the (key, value) pairs.
To iterate over the rows of a DataFrame, you can use the following methods:
@@ -1523,10 +1522,10 @@ To iterate over the rows of a DataFrame, you can use the following methods:
df
-iteritems
-~~~~~~~~~
+items
+~~~~~
-Consistent with the dict-like interface, :meth:`~DataFrame.iteritems` iterates
+Consistent with the dict-like interface, :meth:`~DataFrame.items` iterates
through key-value pairs:
* **Series**: (index, scalar value) pairs
@@ -1536,7 +1535,7 @@ For example:
.. ipython:: python
- for label, ser in df.iteritems():
+ for label, ser in df.items():
print(label)
print(ser)
@@ -1726,7 +1725,7 @@ sorting by column values, and sorting by a combination of both.
.. _basics.sort_index:
-By Index
+By index
~~~~~~~~
The :meth:`Series.sort_index` and :meth:`DataFrame.sort_index` methods are
@@ -1753,7 +1752,7 @@ used to sort a pandas object by its index levels.
.. _basics.sort_values:
-By Values
+By values
~~~~~~~~~
The :meth:`Series.sort_values` method is used to sort a `Series` by its values. The
@@ -1785,7 +1784,7 @@ argument:
.. _basics.sort_indexes_and_values:
-By Indexes and Values
+By indexes and values
~~~~~~~~~~~~~~~~~~~~~
.. versionadded:: 0.23.0
@@ -1968,11 +1967,11 @@ dtype of the column will be chosen to accommodate all of the data types
pd.Series([1, 2, 3, 6., 'foo'])
The number of columns of each type in a ``DataFrame`` can be found by calling
-:meth:`~DataFrame.get_dtype_counts`.
+``DataFrame.dtypes.value_counts()``.
.. ipython:: python
- dft.get_dtype_counts()
+ dft.dtypes.value_counts()
Numeric dtypes will propagate and can coexist in DataFrames.
If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``,
@@ -2062,8 +2061,6 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`.
dft
dft.dtypes
-.. versionadded:: 0.19.0
-
Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFrame.astype`.
.. ipython:: python
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst
index 2957430666b8a..f67f46fc2b29b 100644
--- a/doc/source/getting_started/comparison/comparison_with_r.rst
+++ b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -26,7 +26,7 @@ use HDF5 files, see :ref:`io.external_compatibility` for an
example.
-Quick Reference
+Quick reference
---------------
We'll start off with a quick reference guide pairing some common R
@@ -35,7 +35,7 @@ operations using `dplyr
pandas equivalents.
-Querying, Filtering, Sampling
+Querying, filtering, sampling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=========================================== ===========================================
@@ -81,11 +81,11 @@ R pandas
=========================================== ===========================================
``select(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})['col_one']``
``rename(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})``
-``mutate(df, c=a-b)`` ``df.assign(c=df.a-df.b)``
+``mutate(df, c=a-b)`` ``df.assign(c=df['a']-df['b'])``
=========================================== ===========================================
-Grouping and Summarizing
+Grouping and summarizing
~~~~~~~~~~~~~~~~~~~~~~~~
============================================== ===========================================
@@ -258,8 +258,8 @@ index/slice as well as standard boolean indexing:
df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})
df.query('a <= b')
- df[df.a <= df.b]
- df.loc[df.a <= df.b]
+ df[df['a'] <= df['b']]
+ df.loc[df['a'] <= df['b']]
For more details and examples see :ref:`the query documentation
`.
@@ -284,7 +284,7 @@ In ``pandas`` the equivalent expression, using the
df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})
df.eval('a + b')
- df.a + df.b # same as the previous expression
+ df['a'] + df['b'] # same as the previous expression
In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than
evaluation in pure Python. For more details and examples see :ref:`the eval
diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst
index fc12c8524d3bf..69bb700c97b15 100644
--- a/doc/source/getting_started/comparison/comparison_with_sas.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sas.rst
@@ -31,10 +31,10 @@ As is customary, we import pandas and NumPy as follows:
proc print data=df(obs=5);
run;
-Data Structures
+Data structures
---------------
-General Terminology Translation
+General terminology translation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. csv-table::
@@ -78,10 +78,10 @@ see the :ref:`indexing documentation` for much more on how to use an
``Index`` effectively.
-Data Input / Output
+Data input / output
-------------------
-Constructing a DataFrame from Values
+Constructing a DataFrame from values
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A SAS data set can be built from specified values by
@@ -110,7 +110,7 @@ and the values are the data.
df
-Reading External Data
+Reading external data
~~~~~~~~~~~~~~~~~~~~~
Like SAS, pandas provides utilities for reading in data from
@@ -151,7 +151,7 @@ In addition to text/csv, pandas supports a variety of other data formats
such as Excel, HDF5, and SQL databases. These are all read via a ``pd.read_*``
function. See the :ref:`IO documentation` for more details.
-Exporting Data
+Exporting data
~~~~~~~~~~~~~~
The inverse of ``PROC IMPORT`` in SAS is ``PROC EXPORT``
@@ -169,10 +169,10 @@ and other data formats follow a similar api.
tips.to_csv('tips2.csv')
-Data Operations
+Data operations
---------------
-Operations on Columns
+Operations on columns
~~~~~~~~~~~~~~~~~~~~~
In the ``DATA`` step, arbitrary math expressions can
@@ -228,7 +228,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin
tips[tips['total_bill'] > 10].head()
-If/Then Logic
+If/then logic
~~~~~~~~~~~~~
In SAS, if/then logic can be used to create new columns.
@@ -256,7 +256,7 @@ the ``where`` method from ``numpy``.
tips = tips.drop('bucket', axis=1)
-Date Functionality
+Date functionality
~~~~~~~~~~~~~~~~~~
SAS provides a variety of functions to do operations on
@@ -301,7 +301,7 @@ see the :ref:`timeseries documentation` for more details.
tips = tips.drop(['date1', 'date2', 'date1_year',
'date2_month', 'date1_next', 'months_between'], axis=1)
-Selection of Columns
+Selection of columns
~~~~~~~~~~~~~~~~~~~~
SAS provides keywords in the ``DATA`` step to select,
@@ -338,7 +338,7 @@ The same operations are expressed in pandas below.
tips.rename(columns={'total_bill': 'total_bill_2'}).head()
-Sorting by Values
+Sorting by values
~~~~~~~~~~~~~~~~~
Sorting in SAS is accomplished via ``PROC SORT``
@@ -358,7 +358,7 @@ takes a list of columns to sort by.
tips.head()
-String Processing
+String processing
-----------------
Length
@@ -466,7 +466,7 @@ approaches, but this just shows a simple approach.
firstlast
-Upcase, Lowcase, and Propcase
+Upcase, lowcase, and propcase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The SAS `UPCASE `__
@@ -555,7 +555,7 @@ types are accomplished via the ``how`` keyword.
outer_join
-Missing Data
+Missing data
------------
Like SAS, pandas has a representation for missing data - which is the
@@ -660,7 +660,7 @@ example, to subtract the mean for each observation by smoker group.
run;
-pandas ``groubpy`` provides a ``transform`` mechanism that allows
+pandas ``groupby`` provides a ``transform`` mechanism that allows
these type of operations to be succinctly expressed in one
operation.
@@ -671,7 +671,7 @@ operation.
tips.head()
-By Group Processing
+By group processing
~~~~~~~~~~~~~~~~~~~
In addition to aggregation, pandas ``groupby`` can be used to
@@ -701,7 +701,7 @@ In pandas this would be written as:
Other Considerations
--------------------
-Disk vs Memory
+Disk vs memory
~~~~~~~~~~~~~~
pandas operates exclusively in memory, where a SAS data set exists on disk.
@@ -713,7 +713,7 @@ If out of core processing is needed, one possibility is the
library (currently in development) which
provides a subset of pandas functionality for an on-disk ``DataFrame``
-Data Interop
+Data interop
~~~~~~~~~~~~
pandas provides a :func:`read_sas` method that can read SAS data saved in
diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst
index bf2b03176ecd8..db687386329bb 100644
--- a/doc/source/getting_started/comparison/comparison_with_stata.rst
+++ b/doc/source/getting_started/comparison/comparison_with_stata.rst
@@ -31,10 +31,10 @@ libraries as ``pd`` and ``np``, respectively, for the rest of the document.
list in 1/5
-Data Structures
+Data structures
---------------
-General Terminology Translation
+General terminology translation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. csv-table::
@@ -78,10 +78,10 @@ see the :ref:`indexing documentation` for much more on how to use an
``Index`` effectively.
-Data Input / Output
+Data input / output
-------------------
-Constructing a DataFrame from Values
+Constructing a DataFrame from values
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A Stata data set can be built from specified values by
@@ -107,7 +107,7 @@ and the values are the data.
df
-Reading External Data
+Reading external data
~~~~~~~~~~~~~~~~~~~~~
Like Stata, pandas provides utilities for reading in data from
@@ -155,7 +155,7 @@ such as Excel, SAS, HDF5, Parquet, and SQL databases. These are all read via a
function. See the :ref:`IO documentation` for more details.
-Exporting Data
+Exporting data
~~~~~~~~~~~~~~
The inverse of ``import delimited`` in Stata is ``export delimited``
@@ -177,10 +177,10 @@ Pandas can also export to Stata file format with the :meth:`DataFrame.to_stata`
tips.to_stata('tips2.dta')
-Data Operations
+Data operations
---------------
-Operations on Columns
+Operations on columns
~~~~~~~~~~~~~~~~~~~~~
In Stata, arbitrary math expressions can be used with the ``generate`` and
@@ -222,7 +222,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin
tips[tips['total_bill'] > 10].head()
-If/Then Logic
+If/then logic
~~~~~~~~~~~~~
In Stata, an ``if`` clause can also be used to create new columns.
@@ -245,7 +245,7 @@ the ``where`` method from ``numpy``.
tips = tips.drop('bucket', axis=1)
-Date Functionality
+Date functionality
~~~~~~~~~~~~~~~~~~
Stata provides a variety of functions to do operations on
@@ -290,7 +290,7 @@ see the :ref:`timeseries documentation` for more details.
tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month',
'date1_next', 'months_between'], axis=1)
-Selection of Columns
+Selection of columns
~~~~~~~~~~~~~~~~~~~~
Stata provides keywords to select, drop, and rename columns.
@@ -319,7 +319,7 @@ to a variable.
tips.rename(columns={'total_bill': 'total_bill_2'}).head()
-Sorting by Values
+Sorting by values
~~~~~~~~~~~~~~~~~
Sorting in Stata is accomplished via ``sort``
@@ -337,10 +337,10 @@ takes a list of columns to sort by.
tips.head()
-String Processing
+String processing
-----------------
-Finding Length of String
+Finding length of string
~~~~~~~~~~~~~~~~~~~~~~~~
Stata determines the length of a character string with the :func:`strlen` and
@@ -361,7 +361,7 @@ Use ``len`` and ``rstrip`` to exclude trailing blanks.
tips['time'].str.rstrip().str.len().head()
-Finding Position of Substring
+Finding position of substring
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Stata determines the position of a character in a string with the :func:`strpos` function.
@@ -383,7 +383,7 @@ the function will return -1 if it fails to find the substring.
tips['sex'].str.find("ale").head()
-Extracting Substring by Position
+Extracting substring by position
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Stata extracts a substring from a string based on its position with the :func:`substr` function.
@@ -401,7 +401,7 @@ indexes are zero-based.
tips['sex'].str[0:1].head()
-Extracting nth Word
+Extracting nth word
~~~~~~~~~~~~~~~~~~~
The Stata :func:`word` function returns the nth word from a string.
@@ -431,7 +431,7 @@ approaches, but this just shows a simple approach.
firstlast
-Changing Case
+Changing case
~~~~~~~~~~~~~
The Stata :func:`strupper`, :func:`strlower`, :func:`strproper`,
@@ -547,7 +547,7 @@ types are accomplished via the ``how`` keyword.
outer_join
-Missing Data
+Missing data
------------
Like Stata, pandas has a representation for missing data -- the
@@ -634,7 +634,7 @@ For example, to subtract the mean for each observation by smoker group.
generate adj_total_bill = total_bill - group_bill
-pandas ``groubpy`` provides a ``transform`` mechanism that allows
+pandas ``groupby`` provides a ``transform`` mechanism that allows
these type of operations to be succinctly expressed in one
operation.
@@ -645,7 +645,7 @@ operation.
tips.head()
-By Group Processing
+By group processing
~~~~~~~~~~~~~~~~~~~
In addition to aggregation, pandas ``groupby`` can be used to
@@ -664,10 +664,10 @@ In pandas this would be written as:
tips.groupby(['sex', 'smoker']).first()
-Other Considerations
+Other considerations
--------------------
-Disk vs Memory
+Disk vs memory
~~~~~~~~~~~~~~
Pandas and Stata both operate exclusively in memory. This means that the size of
diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst
index 1abca7ac393dd..9e18951fe3f4c 100644
--- a/doc/source/getting_started/dsintro.rst
+++ b/doc/source/getting_started/dsintro.rst
@@ -3,7 +3,7 @@
{{ header }}
************************
-Intro to Data Structures
+Intro to data structures
************************
We'll start with a quick, non-comprehensive overview of the fundamental data
@@ -251,8 +251,6 @@ Series can also have a ``name`` attribute:
The Series ``name`` will be assigned automatically in many cases, in particular
when taking 1D slices of DataFrame as you will see below.
-.. versionadded:: 0.18.0
-
You can rename a Series with the :meth:`pandas.Series.rename` method.
.. ipython:: python
@@ -399,7 +397,7 @@ The result will be a DataFrame with the same index as the input Series, and
with one column whose name is the original name of the Series (only if no other
column name provided).
-**Missing Data**
+**Missing data**
Much more will be said on this topic in the :ref:`Missing data `
section. To construct a DataFrame with missing data, we use ``np.nan`` to
@@ -407,7 +405,7 @@ represent missing values. Alternatively, you may pass a ``numpy.MaskedArray``
as the data argument to the DataFrame constructor, and its masked entries will
be considered missing.
-Alternate Constructors
+Alternate constructors
~~~~~~~~~~~~~~~~~~~~~~
.. _basics.dataframe.from_dict:
@@ -498,7 +496,7 @@ available to insert at a particular location in the columns:
.. _dsintro.chained_assignment:
-Assigning New Columns in Method Chains
+Assigning new columns in method chains
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Inspired by `dplyr's
@@ -614,7 +612,7 @@ To write code compatible with all versions of Python, split the assignment in tw
-Indexing / Selection
+Indexing / selection
~~~~~~~~~~~~~~~~~~~~
The basics of indexing are as follows:
@@ -731,28 +729,67 @@ DataFrame interoperability with NumPy functions
.. _dsintro.numpy_interop:
Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions
-can be used with no issues on DataFrame, assuming the data within are numeric:
+can be used with no issues on Series and DataFrame, assuming the data within
+are numeric:
.. ipython:: python
np.exp(df)
np.asarray(df)
-The dot method on DataFrame implements matrix multiplication:
+DataFrame is not intended to be a drop-in replacement for ndarray as its
+indexing semantics and data model are quite different in places from an n-dimensional
+array.
+
+:class:`Series` implements ``__array_ufunc__``, which allows it to work with NumPy's
+`universal functions `_.
+
+The ufunc is applied to the underlying array in a Series.
.. ipython:: python
- df.T.dot(df)
+ ser = pd.Series([1, 2, 3, 4])
+ np.exp(ser)
-Similarly, the dot method on Series implements dot product:
+.. versionchanged:: 0.25.0
+
+ When multiple ``Series`` are passed to a ufunc, they are aligned before
+ performing the operation.
+
+Like other parts of the library, pandas will automatically align labeled inputs
+as part of a ufunc with multiple inputs. For example, using :meth:`numpy.remainder`
+on two :class:`Series` with differently ordered labels will align before the operation.
.. ipython:: python
- s1 = pd.Series(np.arange(5, 10))
- s1.dot(s1)
+ ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
+ ser2 = pd.Series([1, 3, 5], index=['b', 'a', 'c'])
+ ser1
+ ser2
+ np.remainder(ser1, ser2)
-DataFrame is not intended to be a drop-in replacement for ndarray as its
-indexing semantics are quite different in places from a matrix.
+As usual, the union of the two indices is taken, and non-overlapping values are filled
+with missing values.
+
+.. ipython:: python
+
+ ser3 = pd.Series([2, 4, 6], index=['b', 'c', 'd'])
+ ser3
+ np.remainder(ser1, ser3)
+
+When a binary ufunc is applied to a :class:`Series` and :class:`Index`, the Series
+implementation takes precedence and a Series is returned.
+
+.. ipython:: python
+
+ ser = pd.Series([1, 2, 3])
+ idx = pd.Index([4, 5, 6])
+
+ np.maximum(ser, idx)
+
+NumPy ufuncs are safe to apply to :class:`Series` backed by non-ndarray arrays,
+for example :class:`SparseArray` (see :ref:`sparse.calculation`). If possible,
+the ufunc is applied without converting the underlying data to an ndarray.
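+
+For illustration, a small sketch using a :class:`SparseArray`-backed Series
+(the values here are arbitrary):
+
+.. ipython:: python
+
+    sparse_ser = pd.Series(pd.SparseArray([1., 0., 0., -2.]))
+    sparse_ser
+    np.abs(sparse_ser)
+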
Console display
~~~~~~~~~~~~~~~
diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst
index b531f686951fc..ec76c60f24257 100644
--- a/doc/source/getting_started/overview.rst
+++ b/doc/source/getting_started/overview.rst
@@ -81,7 +81,7 @@ Some other notes
- pandas has been used extensively in production in financial applications.
-Data Structures
+Data structures
---------------
.. csv-table::
@@ -131,7 +131,7 @@ changed, but, for example, columns can be inserted into a DataFrame. However,
the vast majority of methods produce new objects and leave the input data
untouched. In general we like to **favor immutability** where sensible.
-Getting Support
+Getting support
---------------
The first stop for pandas issues and ideas is the `Github Issue Tracker
@@ -152,7 +152,7 @@ pandas is a `NumFOCUS `__ sponso
This will help ensure the success of development of pandas as a world-class open-source
project, and makes it possible to `donate `__ to the project.
-Project Governance
+Project governance
------------------
The governance process that pandas project has used informally since its inception in 2008 is formalized in `Project Governance documents `__.
@@ -160,13 +160,13 @@ The documents clarify how decisions are made and how the various elements of our
Wes McKinney is the Benevolent Dictator for Life (BDFL).
-Development Team
+Development team
-----------------
The list of the Core Team members and more detailed information can be found on the `people’s page `__ of the governance repo.
-Institutional Partners
+Institutional partners
----------------------
The information about current institutional partners can be found on `pandas website page `__.
diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst
index 8e23c643280c1..212f3636d0a98 100644
--- a/doc/source/getting_started/tutorials.rst
+++ b/doc/source/getting_started/tutorials.rst
@@ -8,7 +8,7 @@ Tutorials
This is a guide to many pandas tutorials, geared mainly for new users.
-Internal Guides
+Internal guides
===============
pandas' own :ref:`10 Minutes to pandas<10min>`.
@@ -17,7 +17,7 @@ More complex recipes are in the :ref:`Cookbook`.
A handy pandas `cheat sheet `_.
-Community Guides
+Community guides
================
pandas Cookbook by Julia Evans
@@ -74,7 +74,7 @@ Excel charts with pandas, vincent and xlsxwriter
* `Using Pandas and XlsxWriter to create Excel charts `_
-Video Tutorials
+Video tutorials
---------------
* `Pandas From The Ground Up `_
@@ -96,7 +96,7 @@ Video Tutorials
`Jupyter Notebook `__
-Various Tutorials
+Various tutorials
-----------------
* `Wes McKinney's (pandas BDFL) blog `_
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index f18c61b5e2f95..f5669626aa2b3 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -38,9 +38,8 @@ See the :ref:`overview` for more detail about what's in the library.
:maxdepth: 3
:hidden:
{% endif %}
-
- {% if not single_doc -%}
- What's New in 0.25.0
+{% if not single_doc %}
+ What's New in 1.0.0
install
getting_started/index
user_guide/index
@@ -52,10 +51,9 @@ See the :ref:`overview` for more detail about what's in the library.
{% if not single_doc -%}
development/index
whatsnew/index
-{% endif -%}
-
+{% endif %}
-* :doc:`whatsnew/v0.25.0`
+* :doc:`whatsnew/v1.0.0`
* :doc:`install`
* :doc:`getting_started/index`
diff --git a/doc/source/install.rst b/doc/source/install.rst
index 98443ede2e965..fc99b458fa0af 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -15,35 +15,10 @@ Instructions for installing from source,
`PyPI `__, `ActivePython `__, various Linux distributions, or a
`development version `__ are also provided.
-.. _install.dropping-27:
-
-Plan for dropping Python 2.7
-----------------------------
-
-The Python core team plans to stop supporting Python 2.7 on January 1st, 2020.
-In line with `NumPy's plans`_, all pandas releases through December 31, 2018
-will support Python 2.
-
-The 0.24.x feature release will be the last release to
-support Python 2. The released package will continue to be available on
-PyPI and through conda.
-
- Starting **January 1, 2019**, all new feature releases (> 0.24) will be Python 3 only.
-
-If there are people interested in continued support for Python 2.7 past December
-31, 2018 (either backporting bug fixes or funding) please reach out to the
-maintainers on the issue tracker.
-
-For more information, see the `Python 3 statement`_ and the `Porting to Python 3 guide`_.
-
-.. _NumPy's plans: https://github.com/numpy/numpy/blob/master/doc/neps/nep-0014-dropping-python2.7-proposal.rst#plan-for-dropping-python-27-support
-.. _Python 3 statement: http://python3statement.org/
-.. _Porting to Python 3 guide: https://docs.python.org/3/howto/pyporting.html
-
Python version support
----------------------
-Officially Python 2.7, 3.5, 3.6, and 3.7.
+Officially Python 3.5.3 and above, 3.6, and 3.7.
Installing pandas
-----------------
@@ -220,17 +195,23 @@ installed), make sure you have `pytest
==================== 12130 passed, 12 skipped in 368.339 seconds =====================
+.. _install.dependencies:
+
Dependencies
------------
-* `setuptools `__: 24.2.0 or higher
-* `NumPy `__: 1.13.3 or higher
-* `python-dateutil `__: 2.5.0 or higher
-* `pytz `__: 2015.4 or higher
+================================================================ ==========================
+Package Minimum supported version
+================================================================ ==========================
+`setuptools `__ 24.2.0
+`NumPy `__ 1.13.3
+`python-dateutil `__ 2.6.1
+`pytz `__ 2017.2
+================================================================ ==========================
.. _install.recommended_dependencies:
-Recommended Dependencies
+Recommended dependencies
~~~~~~~~~~~~~~~~~~~~~~~~
* `numexpr `__: for accelerating certain numerical operations.
@@ -249,90 +230,74 @@ Recommended Dependencies
.. _install.optional_dependencies:
-Optional Dependencies
+Optional dependencies
~~~~~~~~~~~~~~~~~~~~~
-* `Cython `__: Only necessary to build development
- version. Version 0.28.2 or higher.
-* `SciPy `__: miscellaneous statistical functions, Version 0.19.0 or higher
-* `xarray `__: pandas like handling for > 2 dims. Version 0.8.2 or higher is recommended.
-* `PyTables `__: necessary for HDF5-based storage, Version 3.4.2 or higher
-* `pyarrow `__ (>= 0.9.0): necessary for feather-based storage.
-* `Apache Parquet `__, either `pyarrow `__ (>= 0.9.0) or `fastparquet `__ (>= 0.2.1) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support.
-* `SQLAlchemy `__: for SQL database support. Version 1.1.4 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are:
-
- * `psycopg2 `__: for PostgreSQL
- * `pymysql `__: for MySQL.
- * `SQLite `__: for SQLite, this is included in Python's standard library by default.
-
-* `matplotlib `__: for plotting, Version 2.2.2 or higher.
-* For Excel I/O:
-
- * `xlrd/xlwt `__: Excel reading (xlrd), version 1.0.0 or higher required, and writing (xlwt)
- * `openpyxl `__: openpyxl version 2.4.0
- for writing .xlsx files (xlrd >= 1.0.0)
- * `XlsxWriter `__: Alternative Excel writer
-
-* `Jinja2 `__: Template engine for conditional HTML formatting.
-* `s3fs `__: necessary for Amazon S3 access (s3fs >= 0.0.8).
-* `blosc `__: for msgpack compression using ``blosc``
-* `gcsfs `__: necessary for Google Cloud Storage access (gcsfs >= 0.1.0).
-* One of
- `qtpy `__ (requires PyQt or PySide),
- `PyQt5 `__,
- `PyQt4 `__,
- `xsel `__, or
- `xclip `__: necessary to use
- :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation.
-* `pandas-gbq
- `__:
- for Google BigQuery I/O. (pandas-gbq >= 0.8.0)
-
-* One of the following combinations of libraries is needed to use the
- top-level :func:`~pandas.read_html` function:
-
- .. versionchanged:: 0.23.0
-
- .. note::
-
- If using BeautifulSoup4 a minimum version of 4.4.1 is required
-
- * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is
- okay.)
- * `BeautifulSoup4`_ and `lxml`_
- * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_
- * Only `lxml`_, although see :ref:`HTML Table Parsing `
- for reasons as to why you should probably **not** take this approach.
-
- .. warning::
-
- * if you install `BeautifulSoup4`_ you must install either
- `lxml`_ or `html5lib`_ or both.
- :func:`~pandas.read_html` will **not** work with *only*
- `BeautifulSoup4`_ installed.
- * You are highly encouraged to read :ref:`HTML Table Parsing gotchas `.
- It explains issues surrounding the installation and
- usage of the above three libraries.
-
- .. note::
-
- * if you're on a system with ``apt-get`` you can do
-
- .. code-block:: sh
-
- sudo apt-get build-dep python-lxml
-
- to get the necessary dependencies for installation of `lxml`_. This
- will prevent further headaches down the line.
-
+Pandas has many optional dependencies that are only used for specific methods.
+For example, :func:`pandas.read_hdf` requires the ``pytables`` package. If the
+optional dependency is not installed, pandas will raise an ``ImportError`` when
+the method requiring that dependency is called.
+
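+For example (a sketch; the exact error message depends on the pandas version),
+writing to HDF5 without PyTables installed fails only when the writer is
+called, not when pandas itself is imported:
+
+.. code-block:: python
+
+   import pandas as pd          # importing pandas does not require PyTables
+
+   df = pd.DataFrame({"a": [1, 2, 3]})
+   df.to_hdf("store.h5", "df")  # raises ImportError here if PyTables is missing
+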
+========================= ================== =============================================================
+Dependency Minimum Version Notes
+========================= ================== =============================================================
+BeautifulSoup4 4.6.0 HTML parser for read_html (see :ref:`note `)
+Jinja2 Conditional formatting with DataFrame.style
+PyQt4 Clipboard I/O
+PyQt5 Clipboard I/O
+PyTables 3.4.2 HDF5-based reading / writing
+SQLAlchemy 1.1.4 SQL support for databases other than sqlite
+SciPy 0.19.0 Miscellaneous statistical functions
+XlsxWriter 0.9.8 Excel writing
+blosc Compression for msgpack
+fastparquet 0.2.1 Parquet reading / writing
+gcsfs 0.2.2 Google Cloud Storage access
+html5lib HTML parser for read_html (see :ref:`note `)
+lxml 3.8.0 HTML parser for read_html (see :ref:`note `)
+matplotlib 2.2.2 Visualization
+openpyxl 2.4.8 Reading / writing for xlsx files
+pandas-gbq 0.8.0 Google Big Query access
+psycopg2 PostgreSQL engine for sqlalchemy
+pyarrow 0.9.0 Parquet and feather reading / writing
+pymysql 0.7.11 MySQL engine for sqlalchemy
+pyreadstat SPSS files (.sav) reading
+pytables 3.4.2 HDF5 reading / writing
+qtpy Clipboard I/O
+s3fs 0.0.8 Amazon S3 access
+xarray 0.8.2 pandas-like API for N-dimensional data
+xclip Clipboard I/O on linux
+xlrd 1.1.0 Excel reading
+xlwt 1.2.0 Excel writing
+xsel Clipboard I/O on linux
+zlib Compression for msgpack
+========================= ================== =============================================================
+
+.. _optional_html:
+
+Optional dependencies for parsing HTML
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+One of the following combinations of libraries is needed to use the
+top-level :func:`~pandas.read_html` function:
+
+.. versionchanged:: 0.23.0
+
+* `BeautifulSoup4`_ and `html5lib`_
+* `BeautifulSoup4`_ and `lxml`_
+* `BeautifulSoup4`_ and `html5lib`_ and `lxml`_
+* Only `lxml`_, although see :ref:`HTML Table Parsing `
+ for reasons as to why you should probably **not** take this approach.
+
+.. warning::
+
+ * if you install `BeautifulSoup4`_ you must install either
+ `lxml`_ or `html5lib`_ or both.
+ :func:`~pandas.read_html` will **not** work with *only*
+ `BeautifulSoup4`_ installed.
+ * You are highly encouraged to read :ref:`HTML Table Parsing gotchas `.
+ It explains issues surrounding the installation and
+ usage of the above three libraries.
.. _html5lib: https://github.com/html5lib/html5lib-python
.. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup
.. _lxml: http://lxml.de
-
-.. note::
-
- Without the optional dependencies, many useful features will not
- work. Hence, it is highly recommended that you install these. A packaged
- distribution like `Anaconda `__, `ActivePython `__ (version 2.7 or 3.5), or `Enthought Canopy
- `__ may be worth considering.
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index fb9a95b6736d5..7f464bf952bfb 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -3,7 +3,7 @@
.. _api.arrays:
=============
-Pandas Arrays
+Pandas arrays
=============
.. currentmodule:: pandas
@@ -37,7 +37,7 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra
.. _api.arrays.datetime:
-Datetime Data
+Datetime data
-------------
NumPy cannot natively represent timezone-aware datetimes. Pandas supports this
@@ -144,6 +144,7 @@ If the data are tz-aware, then every value in the array must have the same timez
.. autosummary::
:toctree: api/
+ :template: autosummary/class_without_autosummary.rst
arrays.DatetimeArray
@@ -155,7 +156,7 @@ If the data are tz-aware, then every value in the array must have the same timez
.. _api.arrays.timedelta:
-Timedelta Data
+Timedelta data
--------------
NumPy can natively represent timedeltas. Pandas provides :class:`Timedelta`
@@ -204,12 +205,13 @@ A collection of timedeltas may be stored in a :class:`TimedeltaArray`.
.. autosummary::
:toctree: api/
+ :template: autosummary/class_without_autosummary.rst
arrays.TimedeltaArray
.. _api.arrays.period:
-Timespan Data
+Timespan data
-------------
Pandas represents spans of times as :class:`Period` objects.
@@ -263,6 +265,7 @@ Every period in a ``PeriodArray`` must have the same ``freq``.
.. autosummary::
:toctree: api/
+ :template: autosummary/class_without_autosummary.rst
arrays.PeriodArray
@@ -274,7 +277,7 @@ Every period in a ``PeriodArray`` must have the same ``freq``.
.. _api.arrays.interval:
-Interval Data
+Interval data
-------------
Arbitrary intervals can be represented as :class:`Interval` objects.
@@ -292,6 +295,7 @@ Properties
Interval.closed
Interval.closed_left
Interval.closed_right
+ Interval.is_empty
Interval.left
Interval.length
Interval.mid
@@ -304,6 +308,7 @@ A collection of intervals may be stored in an :class:`arrays.IntervalArray`.
.. autosummary::
:toctree: api/
+ :template: autosummary/class_without_autosummary.rst
arrays.IntervalArray
@@ -313,9 +318,34 @@ A collection of intervals may be stored in an :class:`arrays.IntervalArray`.
IntervalDtype
+
+.. Those attributes and methods are included in the API because the docstrings
+.. of IntervalIndex and IntervalArray are shared. Including it here to make
+.. sure a docstring page is built for them to avoid warnings
+
+..
+ .. autosummary::
+ :toctree: api/
+
+ arrays.IntervalArray.left
+ arrays.IntervalArray.right
+ arrays.IntervalArray.closed
+ arrays.IntervalArray.mid
+ arrays.IntervalArray.length
+ arrays.IntervalArray.is_empty
+ arrays.IntervalArray.is_non_overlapping_monotonic
+ arrays.IntervalArray.from_arrays
+ arrays.IntervalArray.from_tuples
+ arrays.IntervalArray.from_breaks
+ arrays.IntervalArray.contains
+ arrays.IntervalArray.overlaps
+ arrays.IntervalArray.set_closed
+ arrays.IntervalArray.to_tuples
+
+
.. _api.arrays.integer_na:
-Nullable Integer
+Nullable integer
----------------
:class:`numpy.ndarray` cannot natively represent integer-data with missing values.
@@ -323,6 +353,7 @@ Pandas provides this through :class:`arrays.IntegerArray`.
.. autosummary::
:toctree: api/
+ :template: autosummary/class_without_autosummary.rst
arrays.IntegerArray
@@ -341,7 +372,7 @@ Pandas provides this through :class:`arrays.IntegerArray`.
.. _api.arrays.categorical:
-Categorical Data
+Categorical data
----------------
Pandas defines a custom data type for representing data that can take only a
@@ -406,7 +437,7 @@ data. See :ref:`api.series.cat` for more.
.. _api.arrays.sparse:
-Sparse Data
+Sparse data
-----------
Data where a single value is repeated many times (e.g. ``0`` or ``NaN``) may
@@ -414,6 +445,7 @@ be stored efficiently as a :class:`SparseArray`.
.. autosummary::
:toctree: api/
+ :template: autosummary/class_without_autosummary.rst
SparseArray
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index 6146e34fab274..4b1a99da7cd4c 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -18,5 +18,44 @@ objects.
api.extensions.register_series_accessor
api.extensions.register_index_accessor
api.extensions.ExtensionDtype
+
+.. autosummary::
+ :toctree: api/
+ :template: autosummary/class_without_autosummary.rst
+
api.extensions.ExtensionArray
arrays.PandasArray
+
+.. We need this autosummary so that methods and attributes are generated.
+.. Separate block, since they aren't classes.
+
+ .. autosummary::
+ :toctree: api/
+
+ api.extensions.ExtensionArray._concat_same_type
+ api.extensions.ExtensionArray._formatter
+ api.extensions.ExtensionArray._from_factorized
+ api.extensions.ExtensionArray._from_sequence
+ api.extensions.ExtensionArray._from_sequence_of_strings
+ api.extensions.ExtensionArray._ndarray_values
+ api.extensions.ExtensionArray._reduce
+ api.extensions.ExtensionArray._values_for_argsort
+ api.extensions.ExtensionArray._values_for_factorize
+ api.extensions.ExtensionArray.argsort
+ api.extensions.ExtensionArray.astype
+ api.extensions.ExtensionArray.copy
+ api.extensions.ExtensionArray.view
+ api.extensions.ExtensionArray.dropna
+ api.extensions.ExtensionArray.factorize
+ api.extensions.ExtensionArray.fillna
+ api.extensions.ExtensionArray.isna
+ api.extensions.ExtensionArray.ravel
+ api.extensions.ExtensionArray.repeat
+ api.extensions.ExtensionArray.searchsorted
+ api.extensions.ExtensionArray.shift
+ api.extensions.ExtensionArray.take
+ api.extensions.ExtensionArray.unique
+ api.extensions.ExtensionArray.dtype
+ api.extensions.ExtensionArray.nbytes
+ api.extensions.ExtensionArray.ndim
+ api.extensions.ExtensionArray.shape
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index 7d5cd5d245631..b1c6172fb1261 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -67,8 +67,8 @@ Indexing, iteration
DataFrame.insert
DataFrame.__iter__
DataFrame.items
- DataFrame.keys
DataFrame.iteritems
+ DataFrame.keys
DataFrame.iterrows
DataFrame.itertuples
DataFrame.lookup
@@ -115,7 +115,7 @@ Binary operator functions
DataFrame.combine
DataFrame.combine_first
-Function application, GroupBy & Window
+Function application, GroupBy & window
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -133,7 +133,7 @@ Function application, GroupBy & Window
.. _api.dataframe.stats:
-Computations / Descriptive Stats
+Computations / descriptive stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -177,7 +177,7 @@ Computations / Descriptive Stats
DataFrame.var
DataFrame.nunique
-Reindexing / Selection / Label manipulation
+Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -198,7 +198,6 @@ Reindexing / Selection / Label manipulation
DataFrame.idxmin
DataFrame.last
DataFrame.reindex
- DataFrame.reindex_axis
DataFrame.reindex_like
DataFrame.rename
DataFrame.rename_axis
@@ -240,6 +239,7 @@ Reshaping, sorting, transposing
DataFrame.unstack
DataFrame.swapaxes
DataFrame.melt
+ DataFrame.explode
DataFrame.squeeze
DataFrame.to_xarray
DataFrame.T
@@ -312,7 +312,7 @@ specific plotting methods of the form ``DataFrame.plot.``.
.. _api.frame.sparse:
-Sparse Accessor
+Sparse accessor
~~~~~~~~~~~~~~~
Sparse-dtype specific methods and attributes are provided under the
@@ -332,12 +332,11 @@ Sparse-dtype specific methods and attributes are provided under the
DataFrame.sparse.to_dense
-Serialization / IO / Conversion
+Serialization / IO / conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
- DataFrame.from_csv
DataFrame.from_dict
DataFrame.from_items
DataFrame.from_records
diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
index 5c8a563a47d00..921eb737aef07 100644
--- a/doc/source/reference/groupby.rst
+++ b/doc/source/reference/groupby.rst
@@ -40,7 +40,7 @@ Function application
GroupBy.transform
GroupBy.pipe
-Computations / Descriptive Stats
+Computations / descriptive stats
--------------------------------
.. autosummary::
:toctree: api/
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index 31b493e472099..12ca318c815d3 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -3,7 +3,7 @@
.. _api:
=============
-API Reference
+API reference
=============
This page gives an overview of all public pandas objects, functions and
diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst
index 42ebf648f299f..576f734d517aa 100644
--- a/doc/source/reference/indexing.rst
+++ b/doc/source/reference/indexing.rst
@@ -3,7 +3,7 @@
.. _api.indexing:
=============
-Index Objects
+Index objects
=============
Index
@@ -48,7 +48,7 @@ Properties
Index.T
Index.memory_usage
-Modifying and Computations
+Modifying and computations
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -96,7 +96,7 @@ Compatibility with MultiIndex
Index.is_lexsorted_for_tuple
Index.droplevel
-Missing Values
+Missing values
~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -190,6 +190,9 @@ Numeric Index
.. autosummary::
:toctree: api/
+ RangeIndex.start
+ RangeIndex.stop
+ RangeIndex.step
RangeIndex.from_range
.. _api.categoricalindex:
@@ -202,7 +205,7 @@ CategoricalIndex
CategoricalIndex
-Categorical Components
+Categorical components
~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -219,7 +222,7 @@ Categorical Components
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
-Modifying and Computations
+Modifying and computations
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -237,7 +240,7 @@ IntervalIndex
IntervalIndex
-IntervalIndex Components
+IntervalIndex components
~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -245,18 +248,19 @@ IntervalIndex Components
IntervalIndex.from_arrays
IntervalIndex.from_tuples
IntervalIndex.from_breaks
- IntervalIndex.contains
IntervalIndex.left
IntervalIndex.right
IntervalIndex.mid
IntervalIndex.closed
IntervalIndex.length
IntervalIndex.values
+ IntervalIndex.is_empty
IntervalIndex.is_non_overlapping_monotonic
IntervalIndex.is_overlapping
IntervalIndex.get_loc
IntervalIndex.get_indexer
IntervalIndex.set_closed
+ IntervalIndex.contains
IntervalIndex.overlaps
IntervalIndex.to_tuples
@@ -275,7 +279,7 @@ MultiIndex
IndexSlice
-MultiIndex Constructors
+MultiIndex constructors
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -285,7 +289,7 @@ MultiIndex Constructors
MultiIndex.from_product
MultiIndex.from_frame
-MultiIndex Properties
+MultiIndex properties
~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -296,7 +300,7 @@ MultiIndex Properties
MultiIndex.nlevels
MultiIndex.levshape
-MultiIndex Components
+MultiIndex components
~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -313,7 +317,7 @@ MultiIndex Components
MultiIndex.reorder_levels
MultiIndex.remove_unused_levels
-MultiIndex Selecting
+MultiIndex selecting
~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -333,7 +337,7 @@ DatetimeIndex
DatetimeIndex
-Time/Date Components
+Time/date components
~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -403,6 +407,13 @@ Conversion
DatetimeIndex.to_series
DatetimeIndex.to_frame
+Methods
+~~~~~~~
+.. autosummary::
+ :toctree: api/
+
+ DatetimeIndex.mean
+
TimedeltaIndex
--------------
.. autosummary::
@@ -435,6 +446,13 @@ Conversion
TimedeltaIndex.ceil
TimedeltaIndex.to_frame
+Methods
+~~~~~~~
+.. autosummary::
+ :toctree: api/
+
+ TimedeltaIndex.mean
+
.. currentmodule:: pandas
PeriodIndex
diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
index 9c776e3ff8a82..91f4942d03b0d 100644
--- a/doc/source/reference/io.rst
+++ b/doc/source/reference/io.rst
@@ -3,7 +3,7 @@
.. _api.io:
============
-Input/Output
+Input/output
============
.. currentmodule:: pandas
@@ -14,7 +14,7 @@ Pickling
read_pickle
-Flat File
+Flat file
~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -105,6 +105,13 @@ SAS
read_sas
+SPSS
+~~~~
+.. autosummary::
+ :toctree: api/
+
+ read_spss
+
SQL
~~~
.. autosummary::
diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst
index ccc1c7e171d22..4a58055f1c955 100644
--- a/doc/source/reference/offset_frequency.rst
+++ b/doc/source/reference/offset_frequency.rst
@@ -3,7 +3,7 @@
.. _api.dateoffsets:
============
-Date Offsets
+Date offsets
============
.. currentmodule:: pandas.tseries.offsets
diff --git a/doc/source/reference/resampling.rst b/doc/source/reference/resampling.rst
index 2a52defa3c68f..57263139d9c18 100644
--- a/doc/source/reference/resampling.rst
+++ b/doc/source/reference/resampling.rst
@@ -43,7 +43,7 @@ Upsampling
Resampler.asfreq
Resampler.interpolate
-Computations / Descriptive Stats
+Computations / descriptive stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
index 79beeb0022307..7ba625c141f24 100644
--- a/doc/source/reference/series.rst
+++ b/doc/source/reference/series.rst
@@ -76,8 +76,8 @@ Indexing, iteration
Series.loc
Series.iloc
Series.__iter__
- Series.iteritems
Series.items
+ Series.iteritems
Series.keys
Series.pop
Series.item
@@ -119,7 +119,7 @@ Binary operator functions
Series.product
Series.dot
-Function application, GroupBy & Window
+Function application, groupby & window
--------------------------------------
.. autosummary::
:toctree: api/
@@ -137,7 +137,7 @@ Function application, GroupBy & Window
.. _api.series.stats:
-Computations / Descriptive Stats
+Computations / descriptive stats
--------------------------------
.. autosummary::
:toctree: api/
@@ -188,7 +188,7 @@ Computations / Descriptive Stats
Series.value_counts
Series.compound
-Reindexing / Selection / Label manipulation
+Reindexing / selection / label manipulation
-------------------------------------------
.. autosummary::
:toctree: api/
@@ -245,6 +245,7 @@ Reshaping, sorting
Series.sort_index
Series.swaplevel
Series.unstack
+ Series.explode
Series.searchsorted
Series.ravel
Series.repeat
@@ -296,14 +297,14 @@ Sparse :ref:`sparse `
.. _api.series.dt:
-Datetimelike Properties
+Datetimelike properties
~~~~~~~~~~~~~~~~~~~~~~~
``Series.dt`` can be used to access the values of the series as
datetimelike and return several properties.
These can be accessed like ``Series.dt.``.
-Datetime Properties
+Datetime properties
^^^^^^^^^^^^^^^^^^^
.. autosummary::
@@ -339,7 +340,7 @@ Datetime Properties
Series.dt.tz
Series.dt.freq
-Datetime Methods
+Datetime methods
^^^^^^^^^^^^^^^^
.. autosummary::
@@ -358,7 +359,7 @@ Datetime Methods
Series.dt.month_name
Series.dt.day_name
-Period Properties
+Period properties
^^^^^^^^^^^^^^^^^
.. autosummary::
@@ -369,7 +370,7 @@ Period Properties
Series.dt.start_time
Series.dt.end_time
-Timedelta Properties
+Timedelta properties
^^^^^^^^^^^^^^^^^^^^
.. autosummary::
@@ -382,7 +383,7 @@ Timedelta Properties
Series.dt.nanoseconds
Series.dt.components
-Timedelta Methods
+Timedelta methods
^^^^^^^^^^^^^^^^^
.. autosummary::
@@ -472,11 +473,13 @@ strings and apply several methods to it. These can be accessed like
Series.str
Series.cat
Series.dt
+ Series.sparse
+ DataFrame.sparse
Index.str
.. _api.series.cat:
-Categorical Accessor
+Categorical accessor
~~~~~~~~~~~~~~~~~~~~
Categorical-dtype specific methods and attributes are available under
@@ -506,7 +509,7 @@ the ``Series.cat`` accessor.
.. _api.series.sparse:
-Sparse Accessor
+Sparse accessor
~~~~~~~~~~~~~~~
Sparse-dtype specific methods and attributes are provided under the
@@ -558,7 +561,7 @@ specific plotting methods of the form ``Series.plot.``.
Series.hist
-Serialization / IO / Conversion
+Serialization / IO / conversion
-------------------------------
.. autosummary::
:toctree: api/
@@ -588,4 +591,3 @@ Sparse
SparseSeries.to_coo
SparseSeries.from_coo
-
diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst
index bd9635b41e343..3d155535e2585 100644
--- a/doc/source/reference/style.rst
+++ b/doc/source/reference/style.rst
@@ -9,7 +9,7 @@ Style
``Styler`` objects are returned by :attr:`pandas.DataFrame.style`.
-Styler Constructor
+Styler constructor
------------------
.. autosummary::
:toctree: api/
@@ -17,7 +17,7 @@ Styler Constructor
Styler
Styler.from_custom_template
-Styler Properties
+Styler properties
-----------------
.. autosummary::
:toctree: api/
@@ -26,7 +26,7 @@ Styler Properties
Styler.template
Styler.loader
-Style Application
+Style application
-----------------
.. autosummary::
:toctree: api/
@@ -44,7 +44,7 @@ Style Application
Styler.clear
Styler.pipe
-Builtin Styles
+Builtin styles
--------------
.. autosummary::
:toctree: api/
@@ -55,7 +55,7 @@ Builtin Styles
Styler.background_gradient
Styler.bar
-Style Export and Import
+Style export and import
-----------------------
.. autosummary::
:toctree: api/
diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst
index 9e1374a3bd8e4..2f6addf607877 100644
--- a/doc/source/reference/window.rst
+++ b/doc/source/reference/window.rst
@@ -5,7 +5,6 @@
======
Window
======
-.. currentmodule:: pandas.core.window
Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling`, :func:`pandas.Series.rolling`, etc.
Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc.
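For orientation, a minimal sketch of how these objects are typically obtained and then aggregated (the Series ``s`` is illustrative, assuming the usual ``import pandas as pd``):

.. ipython:: python

   s = pd.Series(range(10), dtype="float64")
   s.rolling(window=3).mean()   # Rolling object, then aggregate
   s.expanding().sum()          # Expanding object, then aggregate
   s.ewm(span=3).mean()         # EWM object, then aggregate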
@@ -13,6 +12,8 @@ EWM objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:
Standard moving window functions
--------------------------------
+.. currentmodule:: pandas.core.window.rolling
+
.. autosummary::
:toctree: api/
@@ -38,6 +39,8 @@ Standard moving window functions
Standard expanding window functions
-----------------------------------
+.. currentmodule:: pandas.core.window.expanding
+
.. autosummary::
:toctree: api/
@@ -59,6 +62,8 @@ Standard expanding window functions
Exponentially-weighted moving window functions
----------------------------------------------
+.. currentmodule:: pandas.core.window.ewm
+
.. autosummary::
:toctree: api/
diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html
index b3f13f99f44d4..6e7d8ece35133 100644
--- a/doc/source/themes/nature_with_gtoc/layout.html
+++ b/doc/source/themes/nature_with_gtoc/layout.html
@@ -94,15 +94,15 @@ {{ _('Search') }}
});
});
-
+
+
+
{% endblock %}
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
index 0e68cddde8bc7..62a9b6396404a 100644
--- a/doc/source/user_guide/advanced.rst
+++ b/doc/source/user_guide/advanced.rst
@@ -3,7 +3,7 @@
{{ header }}
******************************
-MultiIndex / Advanced Indexing
+MultiIndex / advanced indexing
******************************
This section covers :ref:`indexing with a MultiIndex `
@@ -179,18 +179,18 @@ on a deeper level.
.. _advanced.shown_levels:
-Defined Levels
+Defined levels
~~~~~~~~~~~~~~
-The repr of a ``MultiIndex`` shows all the defined levels of an index, even
+The :class:`MultiIndex` keeps all the defined levels of an index, even
if they are not actually used. When slicing an index, you may notice this.
For example:
.. ipython:: python
- df.columns # original MultiIndex
+ df.columns.levels # original MultiIndex
- df[['foo','qux']].columns # sliced
+ df[['foo','qux']].columns.levels # sliced
This is done to avoid a recomputation of the levels in order to make slicing
highly performant. If you want to see only the used levels, you can use the
@@ -210,7 +210,8 @@ To reconstruct the ``MultiIndex`` with only the used levels, the
.. ipython:: python
- df[['foo', 'qux']].columns.remove_unused_levels()
+ new_mi = df[['foo', 'qux']].columns.remove_unused_levels()
+ new_mi.levels
Data alignment and using ``reindex``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -641,7 +642,7 @@ And now selection works as expected.
dfm.loc[(0, 'y'):(1, 'z')]
-Take Methods
+Take methods
------------
.. _advanced.take:
@@ -703,13 +704,15 @@ faster than fancy indexing.
%timeit arr[indexer]
%timeit arr.take(indexer, axis=0)
+.. ipython:: python
+
ser = pd.Series(arr[:, 0])
%timeit ser.iloc[indexer]
%timeit ser.take(indexer)
.. _indexing.index_types:
-Index Types
+Index types
-----------
We have discussed ``MultiIndex`` in the previous sections pretty extensively.
@@ -735,7 +738,7 @@ and allows efficient indexing and storage of an index with a large number of dup
df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
df
df.dtypes
- df.B.cat.categories
+ df['B'].cat.categories
Setting the index will create a ``CategoricalIndex``.
@@ -807,15 +810,10 @@ values **not** in the categories, similarly to how you can reindex **any** panda
Int64Index and RangeIndex
~~~~~~~~~~~~~~~~~~~~~~~~~
-.. warning::
-
- Indexing on an integer-based Index with floats has been clarified in 0.18.0, for a summary of the changes, see :ref:`here `.
+:class:`Int64Index` is a fundamental basic index in pandas. This is an immutable array
+implementing an ordered, sliceable set.
-:class:`Int64Index` is a fundamental basic index in pandas.
-This is an immutable array implementing an ordered, sliceable set.
-Prior to 0.18.0, the ``Int64Index`` would provide the default index for all ``NDFrame`` objects.
-
-:class:`RangeIndex` is a sub-class of ``Int64Index`` added in version 0.18.0, now providing the default index for all ``NDFrame`` objects.
+:class:`RangeIndex` is a sub-class of ``Int64Index`` that provides the default index for all ``NDFrame`` objects.
``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analogous to Python `range types `__.
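A small illustrative sketch (assuming the usual ``import pandas as pd``): the default index of a newly constructed object is a ``RangeIndex``, and its defining attributes are available directly:

.. ipython:: python

   s = pd.Series([1, 2, 3])
   s.index
   s.index.start, s.index.stop, s.index.step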
.. _indexing.float64index:
@@ -877,16 +875,6 @@ In non-float indexes, slicing using floats will raise a ``TypeError``.
In [1]: pd.Series(range(5))[3.5:4.5]
TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index)
-.. warning::
-
- Using a scalar float indexer for ``.iloc`` has been removed in 0.18.0, so the following will raise a ``TypeError``:
-
- .. code-block:: ipython
-
- In [3]: pd.Series(range(5)).iloc[3.0]
- TypeError: cannot do positional indexing on with these indexers [3.0] of
-
-
Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat
irregular timedelta-like indexing scheme, but the data is recorded as floats. This could, for
example, be millisecond offsets.
@@ -935,9 +923,8 @@ for interval notation.
The ``IntervalIndex`` allows some unique indexing and is also used as a
return type for the categories in :func:`cut` and :func:`qcut`.
-.. warning::
-
- These indexing behaviors are provisional and may change in a future version of pandas.
+Indexing with an ``IntervalIndex``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index.
@@ -962,7 +949,34 @@ If you select a label *contained* within an interval, this will also select the
df.loc[2.5]
df.loc[[2.5, 3.5]]
-``Interval`` and ``IntervalIndex`` are used by ``cut`` and ``qcut``:
+Selecting using an ``Interval`` will only return exact matches (starting from pandas 0.25.0).
+
+.. ipython:: python
+
+ df.loc[pd.Interval(1, 2)]
+
+Trying to select an ``Interval`` that is not exactly contained in the ``IntervalIndex`` will raise a ``KeyError``.
+
+.. code-block:: python
+
+ In [7]: df.loc[pd.Interval(0.5, 2.5)]
+ ---------------------------------------------------------------------------
+ KeyError: Interval(0.5, 2.5, closed='right')
+
+Selecting all ``Intervals`` that overlap a given ``Interval`` can be performed using the
+:meth:`~IntervalIndex.overlaps` method to create a boolean indexer.
+
+.. ipython:: python
+
+ idxr = df.index.overlaps(pd.Interval(0.5, 2.5))
+ idxr
+ df[idxr]
+
+Binning data with ``cut`` and ``qcut``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`cut` and :func:`qcut` both return a ``Categorical`` object, and the bins they
+create are stored as an ``IntervalIndex`` in its ``.categories`` attribute.
.. ipython:: python
@@ -970,15 +984,19 @@ If you select a label *contained* within an interval, this will also select the
c
c.categories
-Furthermore, ``IntervalIndex`` allows one to bin *other* data with these same
-bins, with ``NaN`` representing a missing value similar to other dtypes.
+:func:`cut` also accepts an ``IntervalIndex`` for its ``bins`` argument, which enables
+a useful pandas idiom. First, we call :func:`cut` with some data and ``bins`` set to a
+fixed number, to generate the bins. Then, we pass the values of ``.categories`` as the
+``bins`` argument in subsequent calls to :func:`cut`, supplying new data which will be
+binned into the same bins.
.. ipython:: python
pd.cut([0, 3, 5, 1], bins=c.categories)
+Any value which falls outside all bins will be assigned a ``NaN`` value.
-Generating Ranges of Intervals
+Generating ranges of intervals
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If we need intervals on a regular frequency, we can use the :func:`interval_range` function
@@ -1105,6 +1123,8 @@ the :meth:`~Index.is_unique` attribute.
weakly_monotonic.is_monotonic_increasing
weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique
+.. _advanced.endpoints_are_inclusive:
+
Endpoints are inclusive
~~~~~~~~~~~~~~~~~~~~~~~
@@ -1134,7 +1154,7 @@ index can be somewhat complicated. For example, the following does not work:
s.loc['c':'e' + 1]
A very common use case is to limit a time series to start and end at two
-specific dates. To enable this, we made the design to make label-based
+specific dates. To enable this, we made the design choice to make label-based
slicing include both endpoints:
.. ipython:: python
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
index a6315c548b382..8ca96ba0daa5e 100644
--- a/doc/source/user_guide/categorical.rst
+++ b/doc/source/user_guide/categorical.rst
@@ -3,7 +3,7 @@
{{ header }}
****************
-Categorical Data
+Categorical data
****************
This is an introduction to pandas categorical data type, including a short comparison
@@ -38,10 +38,10 @@ See also the :ref:`API docs on categoricals`.
.. _categorical.objectcreation:
-Object Creation
+Object creation
---------------
-Series Creation
+Series creation
~~~~~~~~~~~~~~~
Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways:
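One such way, sketched here for illustration (the data is made up), is to pass ``dtype="category"`` when constructing the ``Series``:

.. ipython:: python

   s = pd.Series(["a", "b", "c", "a"], dtype="category")
   s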
@@ -90,7 +90,7 @@ Categorical data has a specific ``category`` :ref:`dtype `:
df.dtypes
-DataFrame Creation
+DataFrame creation
~~~~~~~~~~~~~~~~~~
Similar to the previous section where a single column was converted to categorical, all columns in a
@@ -130,7 +130,7 @@ This conversion is likewise done column by column:
df_cat['B']
-Controlling Behavior
+Controlling behavior
~~~~~~~~~~~~~~~~~~~~
In the examples above where we passed ``dtype='category'``, we used the default
@@ -181,7 +181,7 @@ during normal constructor mode:
categories=["train", "test"]))
-Regaining Original Data
+Regaining original data
~~~~~~~~~~~~~~~~~~~~~~~
To get back to the original ``Series`` or NumPy array, use
@@ -243,7 +243,7 @@ expects a `dtype`. For example :func:`pandas.read_csv`,
array. In other words, ``dtype='category'`` is equivalent to
``dtype=CategoricalDtype()``.
-Equality Semantics
+Equality semantics
~~~~~~~~~~~~~~~~~~
Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal
@@ -438,7 +438,7 @@ use :meth:`~pandas.Categorical.set_categories`.
intentionally or because it is misspelled or (under Python3) due to a type difference (e.g.,
NumPy S1 dtype and Python strings). This can result in surprising behaviour!
-Sorting and Order
+Sorting and order
-----------------
.. _categorical.sort:
@@ -510,7 +510,7 @@ necessarily make the sort order the same as the categories order.
(e.g. :meth:`Series.median`, which would need to compute the mean between two values if the length
of an array is even) do not work and raise a ``TypeError``.
-Multi Column Sorting
+Multi column sorting
~~~~~~~~~~~~~~~~~~~~
A categorical dtyped column will participate in a multi-column sort in a similar manner to other columns.
@@ -834,8 +834,6 @@ See also the section on :ref:`merge dtypes` for notes about pres
Unioning
~~~~~~~~
-.. versionadded:: 0.19.0
-
If you want to combine categoricals that do not necessarily have the same
categories, the :func:`~pandas.api.types.union_categoricals` function will
combine a list-like of categoricals. The new categories will be the union of
@@ -963,7 +961,7 @@ Following table summarizes the results of ``Categoricals`` related concatenation
+----------+--------------------------------------------------------+----------------------------+
-Getting Data In/Out
+Getting data in/out
-------------------
You can write data that contains ``category`` dtypes to a ``HDFStore``.
@@ -1000,7 +998,7 @@ relevant columns back to `category` and assign the right categories and categori
The same holds for writing to a SQL database with ``to_sql``.
-Missing Data
+Missing data
------------
pandas primarily uses the value `np.nan` to represent missing data. It is by
@@ -1052,7 +1050,7 @@ Gotchas
.. _categorical.rfactor:
-Memory Usage
+Memory usage
~~~~~~~~~~~~
.. _categorical.memory:
@@ -1152,7 +1150,7 @@ You can use ``fillna`` to handle missing values before applying a function.
df.apply(lambda row: type(row["cats"]), axis=1)
df.apply(lambda col: col.dtype, axis=0)
-Categorical Index
+Categorical index
~~~~~~~~~~~~~~~~~
``CategoricalIndex`` is a type of index that is useful for supporting
@@ -1173,7 +1171,7 @@ Setting the index will create a ``CategoricalIndex``:
# This now sorts by the categories order
df.sort_index()
-Side Effects
+Side effects
~~~~~~~~~~~~
Constructing a ``Series`` from a ``Categorical`` will not copy the input
diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst
index 71cbf58dff871..cfce7c40c477f 100644
--- a/doc/source/user_guide/computation.rst
+++ b/doc/source/user_guide/computation.rst
@@ -5,12 +5,13 @@
Computational tools
===================
-Statistical Functions
+
+Statistical functions
---------------------
.. _computation.pct_change:
-Percent Change
+Percent change
~~~~~~~~~~~~~~
``Series`` and ``DataFrame`` have a method
@@ -294,7 +295,7 @@ sugar for applying the moving window operator to all of the DataFrame's columns:
.. _stats.summary:
-Method Summary
+Method summary
~~~~~~~~~~~~~~
We provide a number of common statistical functions:
@@ -335,7 +336,7 @@ compute the mean absolute deviation on a rolling basis:
.. _stats.rolling_window:
-Rolling Windows
+Rolling windows
~~~~~~~~~~~~~~~
Passing ``win_type`` to ``.rolling`` generates a generic rolling window computation that is weighted according to the ``win_type``.
@@ -404,12 +405,10 @@ For some windowing functions, additional parameters must be specified:
.. _stats.moments.ts:
-Time-aware Rolling
+Time-aware rolling
~~~~~~~~~~~~~~~~~~
-.. versionadded:: 0.19.0
-
-New in version 0.19.0 are the ability to pass an offset (or convertible) to a ``.rolling()`` method and have it produce
+It is possible to pass an offset (or convertible) to a ``.rolling()`` method and have it produce
variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring
within the indicated time delta.
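An illustrative sketch (the frame ``dft`` is hypothetical, assuming the usual ``import pandas as pd``): passing the offset ``'2s'`` gives each point a window covering the preceding two seconds:

.. ipython:: python

   dft = pd.DataFrame({'B': [0, 1, 2, 3, 4]},
                      index=pd.date_range('20130101 09:00:00',
                                          periods=5, freq='s'))
   dft.rolling('2s').sum()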
@@ -469,7 +468,7 @@ default of the index) in a DataFrame.
.. _stats.rolling_window.endpoints:
-Rolling Window Endpoints
+Rolling window endpoints
~~~~~~~~~~~~~~~~~~~~~~~~
.. versionadded:: 0.20.0
@@ -511,7 +510,7 @@ For fixed windows, the closed parameter cannot be set and the rolling window wil
.. _stats.moments.ts-versus-resampling:
-Time-aware Rolling vs. Resampling
+Time-aware rolling vs. resampling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They
@@ -529,7 +528,7 @@ will have the shape of a regular frequency between the min and the max of the or
To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation.
-Centering Windows
+Centering windows
~~~~~~~~~~~~~~~~~
By default the labels are set to the right edge of the window, but a
@@ -542,7 +541,7 @@ By default the labels are set to the right edge of the window, but a
.. _stats.moments.binary:
-Binary Window Functions
+Binary window functions
~~~~~~~~~~~~~~~~~~~~~~~
:meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics about
@@ -695,7 +694,7 @@ Furthermore you can pass a nested dict to indicate different aggregations on dif
.. _stats.moments.expanding:
-Expanding Windows
+Expanding windows
-----------------
A common alternative to rolling statistics is to use an *expanding* window,
@@ -716,7 +715,7 @@ they are implemented in pandas such that the following two calls are equivalent:
These have a similar set of methods to ``.rolling`` methods.
-Method Summary
+Method summary
~~~~~~~~~~~~~~
.. currentmodule:: pandas.core.window
@@ -798,7 +797,7 @@ relative impact of an individual data point. As an example, here is the
.. _stats.moments.exponentially_weighted:
-Exponentially Weighted Windows
+Exponentially weighted windows
------------------------------
.. currentmodule:: pandas.core.window
@@ -892,10 +891,9 @@ Therefore, there is an assumption that :math:`x_0` is not an ordinary value
but rather an exponentially weighted moment of the infinite series up to that
point.
-One must have :math:`0 < \alpha \leq 1`, and while since version 0.18.0
-it has been possible to pass :math:`\alpha` directly, it's often easier
-to think about either the **span**, **center of mass (com)** or **half-life**
-of an EW moment:
+One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass
+:math:`\alpha` directly, it's often easier to think about either the
+**span**, **center of mass (com)** or **half-life** of an EW moment:
.. math::
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 538acbd7d01fa..c9d3bc3a28c70 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -99,7 +99,7 @@ Splitting
df[df.AAA <= 5]
df[df.AAA > 5]
-Building Criteria
+Building criteria
*****************
`Select with multi-column criteria
@@ -245,7 +245,7 @@ Ambiguity arises when an index consists of integers with a non-zero start or non
df[~((df.AAA <= 6) & (df.index.isin([0, 2, 4])))]
-New Columns
+New columns
***********
`Efficiently and dynamically creating new columns using applymap
@@ -399,7 +399,7 @@ Sorting
df.sort_values(by=('Labs', 'II'), ascending=False)
-`Partial Selection, the need for sortedness;
+`Partial selection, the need for sortedness;
`__
Levels
@@ -413,7 +413,7 @@ Levels
.. _cookbook.missing_data:
-Missing Data
+Missing data
------------
The :ref:`missing data` docs.
@@ -485,7 +485,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
expected_df = gb.apply(GrowUp)
expected_df
-`Expanding Apply
+`Expanding apply
`__
.. ipython:: python
@@ -592,10 +592,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
.. ipython:: python
df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A'])
- df.A.groupby((df.A != df.A.shift()).cumsum()).groups
- df.A.groupby((df.A != df.A.shift()).cumsum()).cumsum()
+ df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).groups
+ df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).cumsum()
-Expanding Data
+Expanding data
**************
`Alignment and to-date
@@ -690,7 +690,7 @@ To create year and month cross tabulation:
Apply
*****
-`Rolling Apply to Organize - Turning embedded lists into a MultiIndex frame
+`Rolling apply to organize - Turning embedded lists into a MultiIndex frame
`__
.. ipython:: python
@@ -706,7 +706,7 @@ Apply
for ind, row in df.iterrows()})
df_orgz
-`Rolling Apply with a DataFrame returning a Series
+`Rolling apply with a DataFrame returning a Series
`__
Rolling Apply to multiple columns where function calculates a Series before a Scalar from the Series is returned
@@ -719,7 +719,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc
df
def gm(df, const):
- v = ((((df.A + df.B) + 1).cumprod()) - 1) * const
+ v = ((((df['A'] + df['B']) + 1).cumprod()) - 1) * const
return v.iloc[-1]
s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5)
@@ -1099,7 +1099,7 @@ HDFStore
The :ref:`HDFStores ` docs
-`Simple Queries with a Timestamp Index
+`Simple queries with a Timestamp Index
`__
`Managing heterogeneous data using a linked multiple table hierarchy
@@ -1169,7 +1169,7 @@ Storing Attributes to a group node
.. _cookbook.binary:
-Binary Files
+Binary files
************
pandas readily accepts NumPy record arrays, if you need to read in a binary
@@ -1260,24 +1260,19 @@ The `method` argument within `DataFrame.corr` can accept a callable in addition
n = len(x)
a = np.zeros(shape=(n, n))
b = np.zeros(shape=(n, n))
-
for i in range(n):
for j in range(i + 1, n):
a[i, j] = abs(x[i] - x[j])
b[i, j] = abs(y[i] - y[j])
-
a += a.T
b += b.T
-
a_bar = np.vstack([np.nanmean(a, axis=0)] * n)
b_bar = np.vstack([np.nanmean(b, axis=0)] * n)
-
A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean())
B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean())
cov_ab = np.sqrt(np.nansum(A * B)) / n
std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n)
std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n)
-
return cov_ab / std_a / std_b
df = pd.DataFrame(np.random.normal(size=(100, 3)))
@@ -1339,7 +1334,7 @@ Values can be set to NaT using np.nan, similar to datetime
y[1] = np.nan
y
-Aliasing Axis Names
+Aliasing axis names
-------------------
To globally provide aliases for axis names, one can define these 2 functions:
@@ -1366,7 +1361,7 @@ To globally provide aliases for axis names, one can define these 2 functions:
df2.sum(axis='myaxis2')
clear_axis_alias(pd.DataFrame, 'columns', 'myaxis2')
-Creating Example Data
+Creating example data
---------------------
To create a dataframe from every combination of some given values, like R's ``expand.grid()``
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index 525f9abb1d1ae..2df5b9d82dcc3 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -3,7 +3,7 @@
{{ header }}
*********************
-Enhancing Performance
+Enhancing performance
*********************
In this part of the tutorial, we will investigate how to speed up certain
@@ -15,7 +15,7 @@ when we use Cython and Numba on a test function operating row-wise on the
.. _enhancingperf.cython:
-Cython (Writing C extensions for pandas)
+Cython (writing C extensions for pandas)
----------------------------------------
For many use cases writing pandas in pure Python and NumPy is sufficient. In some
@@ -33,7 +33,7 @@ faster than the pure Python solution.
.. _enhancingperf.pure:
-Pure python
+Pure Python
~~~~~~~~~~~
We have a ``DataFrame`` to which we want to apply a function row-wise.
@@ -243,9 +243,9 @@ We've gotten another big improvement. Let's check again where the time is spent:
.. ipython:: python
- %prun -l 4 apply_integrate_f(df['a'].to_numpy(),
- df['b'].to_numpy(),
- df['N'].to_numpy())
+ %%prun -l 4 apply_integrate_f(df['a'].to_numpy(),
+ df['b'].to_numpy(),
+ df['N'].to_numpy())
As one might expect, the majority of the time is now spent in ``apply_integrate_f``,
so if we wanted to make anymore efficiencies we must continue to concentrate our
@@ -393,15 +393,15 @@ Consider the following toy example of doubling each observation:
.. code-block:: ipython
# Custom function without numba
- In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba) # noqa E501
+ In [5]: %timeit df['col1_doubled'] = df['a'].apply(double_every_value_nonumba) # noqa E501
1000 loops, best of 3: 797 us per loop
# Standard implementation (faster than a custom function)
- In [6]: %timeit df['col1_doubled'] = df.a * 2
+ In [6]: %timeit df['col1_doubled'] = df['a'] * 2
1000 loops, best of 3: 233 us per loop
# Custom function with numba
- In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df.a.to_numpy())
+   In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
1000 loops, best of 3: 145 us per loop
Caveats
@@ -429,7 +429,7 @@ Read more in the `Numba docs `__.
.. _enhancingperf.eval:
-Expression Evaluation via :func:`~pandas.eval`
+Expression evaluation via :func:`~pandas.eval`
-----------------------------------------------
The top-level function :func:`pandas.eval` implements expression evaluation of
@@ -465,7 +465,7 @@ engine in addition to some extensions available only in pandas.
The larger the frame and the larger the expression the more speedup you will
see from using :func:`~pandas.eval`.
-Supported Syntax
+Supported syntax
~~~~~~~~~~~~~~~~
These operations are supported by :func:`pandas.eval`:
@@ -505,7 +505,7 @@ This Python syntax is **not** allowed:
-:func:`~pandas.eval` Examples
+:func:`~pandas.eval` examples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:func:`pandas.eval` works well with expressions containing large arrays.
@@ -601,8 +601,6 @@ This allows for *formulaic evaluation*. The assignment target can be a
new column name or an existing column name, and it must be a valid Python
identifier.
-.. versionadded:: 0.18.0
-
The ``inplace`` keyword determines whether this assignment will performed
on the original ``DataFrame`` or return a copy with the new column.
@@ -630,8 +628,6 @@ new or modified columns is returned and the original frame is unchanged.
df.eval('e = a - c', inplace=False)
df
-.. versionadded:: 0.18.0
-
As a convenience, multiple assignments can be performed by using a
multi-line string.
@@ -647,14 +643,12 @@ The equivalent in standard Python would be
.. ipython:: python
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
- df['c'] = df.a + df.b
- df['d'] = df.a + df.b + df.c
+ df['c'] = df['a'] + df['b']
+ df['d'] = df['a'] + df['b'] + df['c']
df['a'] = 1
df
-.. versionadded:: 0.18.0
-
-The ``query`` method gained the ``inplace`` keyword which determines
+The ``query`` method has an ``inplace`` keyword which determines
whether the query modifies the original frame.
.. ipython:: python
@@ -669,7 +663,7 @@ whether the query modifies the original frame.
Unlike with ``eval``, the default value for ``inplace`` for ``query``
is ``False``. This is consistent with prior versions of pandas.
-Local Variables
+Local variables
~~~~~~~~~~~~~~~
You must *explicitly reference* any local variable that you want to use in an
@@ -694,7 +688,7 @@ name in an expression.
a = np.random.randn()
df.query('@a < a')
- df.loc[a < df.a] # same as the previous expression
+ df.loc[a < df['a']] # same as the previous expression
With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it
isn't defined in that context. ``pandas`` will let you know this if you try to
@@ -714,7 +708,7 @@ standard Python.
pd.eval('a + b')
-:func:`pandas.eval` Parsers
+:func:`pandas.eval` parsers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are two different parsers and two different engines you can use as
@@ -754,7 +748,7 @@ The ``and`` and ``or`` operators here have the same precedence that they would
in vanilla Python.
-:func:`pandas.eval` Backends
+:func:`pandas.eval` backends
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There's also the option to make :func:`~pandas.eval` operate identical to plain
@@ -779,7 +773,7 @@ is a bit slower (not by much) than evaluating the same expression in Python
%timeit pd.eval('df1 + df2 + df3 + df4', engine='python')
-:func:`pandas.eval` Performance
+:func:`pandas.eval` performance
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:func:`~pandas.eval` is intended to speed up certain kinds of operations. In
@@ -804,7 +798,7 @@ computation. The two lines are two different engines.
This plot was created using a ``DataFrame`` with 3 columns each containing
floating point values generated using ``numpy.random.randn()``.
-Technical Minutia Regarding Expression Evaluation
+Technical minutia regarding expression evaluation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expressions that would result in an object dtype or involve datetime operations
diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
index 3d89fe171a343..f9a72b87e58d8 100644
--- a/doc/source/user_guide/gotchas.rst
+++ b/doc/source/user_guide/gotchas.rst
@@ -75,7 +75,7 @@ See also :ref:`Categorical Memory Usage `.
.. _gotchas.truth:
-Using If/Truth Statements with pandas
+Using if/truth statements with pandas
-------------------------------------
pandas follows the NumPy convention of raising an error when you try to convert
@@ -317,7 +317,7 @@ See `this link ``
+ to each subsequent lambda.
+
+ .. ipython:: python
+
+ grouped['C'].agg([lambda x: x.max() - x.min(),
+ lambda x: x.median() - x.mean()])
+
+
+
.. _groupby.aggregate.named:
-Named Aggregation
+Named aggregation
~~~~~~~~~~~~~~~~~
.. versionadded:: 0.25.0
@@ -595,7 +618,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation",
animals.groupby("kind").agg(
min_height=pd.NamedAgg(column='height', aggfunc='min'),
max_height=pd.NamedAgg(column='height', aggfunc='max'),
- average_weight=pd.NamedAgg(column='height', aggfunc=np.mean),
+ average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean),
)
@@ -606,7 +629,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation",
animals.groupby("kind").agg(
min_height=('height', 'min'),
max_height=('height', 'max'),
- average_weight=('height', np.mean),
+ average_weight=('weight', np.mean),
)
@@ -630,6 +653,16 @@ requires additional arguments, partially apply them with :meth:`functools.partia
consistent. To ensure consistent ordering, the keys (and so output columns)
will always be sorted for Python 3.5.
+Named aggregation is also valid for Series groupby aggregations. In this case there's
+no column selection, so the values are just the functions.
+
+.. ipython:: python
+
+ animals.groupby("kind").height.agg(
+ min_height='min',
+ max_height='max',
+ )
+
Applying different functions to DataFrame columns
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -794,13 +827,10 @@ and that the transformed data contains no NAs.
.. _groupby.transform.window_resample:
-New syntax to window and resample operations
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. versionadded:: 0.18.1
+Window and resample operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Working with the resample, expanding or rolling operations on the groupby
-level used to require the application of helper functions. However,
-now it is possible to use ``resample()``, ``expanding()`` and
+It is possible to use ``resample()``, ``expanding()`` and
``rolling()`` as methods on groupbys.
The example below will apply the ``rolling()`` method on the samples of
@@ -1112,7 +1142,7 @@ can be used as group keys. If so, the order of the levels will be preserved:
.. _groupby.specify:
-Grouping with a Grouper specification
+Grouping with a grouper specification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You may need to specify a bit more data to properly group. You can
@@ -1394,7 +1424,7 @@ introduction ` and the
dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
-Groupby by Indexer to 'resample' data
+Groupby by indexer to 'resample' data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index 4ea7c656fd197..cf55ce0c9a6d4 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -3,7 +3,7 @@
{{ header }}
***************************
-Indexing and Selecting Data
+Indexing and selecting data
***************************
The axis labeling information in pandas objects serves many purposes:
@@ -36,17 +36,13 @@ this area.
should be avoided. See :ref:`Returning a View versus Copy
`.
-.. warning::
-
- Indexing on an integer-based Index with floats has been clarified in 0.18.0, for a summary of the changes, see :ref:`here `.
-
See the :ref:`MultiIndex / Advanced Indexing ` for ``MultiIndex`` and more advanced indexing documentation.
See the :ref:`cookbook` for some advanced strategies.
.. _indexing.choice:
-Different Choices for Indexing
+Different choices for indexing
------------------------------
Object selection has had a number of user-requested additions in order to
@@ -61,14 +57,12 @@ of multi-axis indexing.
* A list or array of labels ``['a', 'b', 'c']``.
* A slice object with labels ``'a':'f'`` (Note that contrary to usual python
slices, **both** the start and the stop are included, when present in the
- index! See :ref:`Slicing with labels
- `.).
+ index! See :ref:`Slicing with labels `
+ and :ref:`Endpoints are inclusive `.)
* A boolean array
* A ``callable`` function with one argument (the calling Series or DataFrame) and
that returns valid output for indexing (one of the above).
- .. versionadded:: 0.18.1
-
See more at :ref:`Selection by Label `.
* ``.iloc`` is primarily integer position based (from ``0`` to
@@ -85,8 +79,6 @@ of multi-axis indexing.
* A ``callable`` function with one argument (the calling Series or DataFrame) and
that returns valid output for indexing (one of the above).
- .. versionadded:: 0.18.1
-
See more at :ref:`Selection by Position `,
:ref:`Advanced Indexing ` and :ref:`Advanced
Hierarchical `.
@@ -181,7 +173,7 @@ columns.
df[['A', 'B']]
-Attribute Access
+Attribute access
----------------
.. _indexing.columns.multiple:
@@ -218,7 +210,7 @@ as an attribute:
See `here for an explanation of valid identifiers
`__.
- - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed.
+ - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed, but ``s['min']`` is possible.
- Similarly, the attribute will not be available if it conflicts with any of the following list: ``index``,
``major_axis``, ``minor_axis``, ``items``.
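To illustrate the note above about name conflicts (a minimal sketch, the Series is made up): a label named ``min`` is only reachable with ``[]``, since the attribute resolves to the method:

.. ipython:: python

   s = pd.Series([1, 2, 3], index=['min', 'a', 'b'])
   s.min       # resolves to the Series.min method
   s['min']    # selects the value stored under the 'min' label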
@@ -287,7 +279,7 @@ largely as a convenience since it is such a common operation.
.. _indexing.label:
-Selection By Label
+Selection by label
------------------
.. warning::
@@ -335,8 +327,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp
* A list or array of labels ``['a', 'b', 'c']``.
* A slice object with labels ``'a':'f'`` (Note that contrary to usual python
slices, **both** the start and the stop are included, when present in the
- index! See :ref:`Slicing with labels
- `.).
+     index! See :ref:`Slicing with labels `.).
* A boolean array.
* A ``callable``, see :ref:`Selection By Callable `.
@@ -418,9 +409,12 @@ error will be raised (since doing otherwise would be computationally expensive,
as well as potentially ambiguous for mixed type indexes). For instance, in the
above example, ``s.loc[1:6]`` would raise ``KeyError``.
+For the rationale behind this behavior, see
+:ref:`Endpoints are inclusive `.
+
.. _indexing.integer:
-Selection By Position
+Selection by position
---------------------
.. warning::
@@ -533,11 +527,9 @@ A list of indexers where any element is out of bounds will raise an
.. _indexing.callable:
-Selection By Callable
+Selection by callable
---------------------
-.. versionadded:: 0.18.1
-
``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer.
The ``callable`` must be a function with one argument (the calling Series or DataFrame) that returns valid output for indexing.
@@ -548,7 +540,7 @@ The ``callable`` must be a function with one argument (the calling Series or Dat
columns=list('ABCD'))
df1
- df1.loc[lambda df: df.A > 0, :]
+ df1.loc[lambda df: df['A'] > 0, :]
df1.loc[:, lambda df: ['A', 'B']]
df1.iloc[:, lambda df: [0, 1]]
@@ -560,7 +552,7 @@ You can use callable indexing in ``Series``.
.. ipython:: python
- df1.A.loc[lambda s: s > 0]
+ df1['A'].loc[lambda s: s > 0]
Using these methods / indexers, you can chain data selection operations
without using a temporary variable.
@@ -569,11 +561,11 @@ without using a temporary variable.
bb = pd.read_csv('data/baseball.csv', index_col='id')
(bb.groupby(['year', 'team']).sum()
- .loc[lambda df: df.r > 100])
+ .loc[lambda df: df['r'] > 100])
.. _indexing.deprecate_ix:
-IX Indexer is Deprecated
+IX indexer is deprecated
------------------------
.. warning::
@@ -631,7 +623,7 @@ For getting *multiple* indexers, using ``.get_indexer``:
.. _deprecate_loc_reindex_listlike:
.. _indexing.deprecate_loc_reindex_listlike:
-Indexing with list with missing labels is Deprecated
+Indexing with list with missing labels is deprecated
----------------------------------------------------
.. warning::
@@ -655,7 +647,7 @@ Selection with all keys found is unchanged.
s.loc[[1, 2]]
-Previous Behavior
+Previous behavior
.. code-block:: ipython
@@ -667,7 +659,7 @@ Previous Behavior
dtype: float64
-Current Behavior
+Current behavior
.. code-block:: ipython
@@ -732,7 +724,7 @@ However, this would *still* raise if your resulting index is duplicated.
.. _indexing.basics.partial_setting:
-Selecting Random Samples
+Selecting random samples
------------------------
A random selection of rows or columns from a Series or DataFrame with the :meth:`~DataFrame.sample` method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows.
@@ -807,7 +799,7 @@ Finally, one can also set a seed for ``sample``'s random number generator using
-Setting With Enlargement
+Setting with enlargement
------------------------
The ``.loc/[]`` operations can perform enlargement when setting a non-existent key for that axis.
@@ -879,9 +871,9 @@ Boolean indexing
Another common operation is the use of boolean vectors to filter the data.
The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``.
These **must** be grouped by using parentheses, since by default Python will
-evaluate an expression such as ``df.A > 2 & df.B < 3`` as
-``df.A > (2 & df.B) < 3``, while the desired evaluation order is
-``(df.A > 2) & (df.B < 3)``.
+evaluate an expression such as ``df['A'] > 2 & df['B'] < 3`` as
+``df['A'] > (2 & df['B']) < 3``, while the desired evaluation order is
+``(df['A'] > 2) & (df['B'] < 3)``.
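A brief sketch of the parenthesized form (the frame ``df`` here is illustrative):

.. ipython:: python

   df = pd.DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]})
   df[(df['A'] > 2) & (df['B'] < 6)]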
Using a boolean vector to index a Series works exactly as in a NumPy ndarray:
@@ -1076,7 +1068,7 @@ without creating a copy:
df.where(df < 0, -df) == np.where(df < 0, df, -df)
-**alignment**
+**Alignment**
Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame),
such that partial selection with setting is possible. This is analogous to
@@ -1103,9 +1095,7 @@ This is equivalent to (but faster than) the following.
df2 = df.copy()
df.apply(lambda x, y: x.where(x > 0, y), y=df['A'])
-.. versionadded:: 0.18.1
-
-Where can accept a callable as condition and ``other`` arguments. The function must
+``where`` can accept a callable as condition and ``other`` arguments. The function must
take one argument (the calling Series or DataFrame) and return valid output
as the condition or ``other`` argument.
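A minimal sketch of the callable form (``df3`` is an illustrative frame): elements where the condition is ``True`` are kept, the rest are replaced by the result of the ``other`` callable:

.. ipython:: python

   df3 = pd.DataFrame({'A': [1, 2, 3],
                       'B': [4, 5, 6],
                       'C': [7, 8, 9]})
   df3.where(lambda x: x > 4, lambda x: x + 10)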
@@ -1144,7 +1134,7 @@ between the values of columns ``a`` and ``c``. For example:
df
# pure python
- df[(df.a < df.b) & (df.b < df.c)]
+ df[(df['a'] < df['b']) & (df['b'] < df['c'])]
# query
df.query('(a < b) & (b < c)')
@@ -1251,7 +1241,7 @@ Full numpy-like syntax:
df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df
df.query('(a < b) & (b < c)')
- df[(df.a < df.b) & (df.b < df.c)]
+ df[(df['a'] < df['b']) & (df['b'] < df['c'])]
Slightly nicer by removing the parentheses (comparison operators bind
tighter than ``&`` and ``|``).
@@ -1289,12 +1279,12 @@ The ``in`` and ``not in`` operators
df.query('a in b')
# How you'd do it in pure Python
- df[df.a.isin(df.b)]
+ df[df['a'].isin(df['b'])]
df.query('a not in b')
# pure Python
- df[~df.a.isin(df.b)]
+ df[~df['a'].isin(df['b'])]
You can combine this with other expressions for very succinct queries:
@@ -1307,7 +1297,7 @@ You can combine this with other expressions for very succinct queries:
df.query('a in b and c < d')
# pure Python
- df[df.b.isin(df.a) & (df.c < df.d)]
+ df[df['b'].isin(df['a']) & (df['c'] < df['d'])]
.. note::
@@ -1336,7 +1326,7 @@ to ``in``/``not in``.
df.query('b == ["a", "b", "c"]')
# pure Python
- df[df.b.isin(["a", "b", "c"])]
+ df[df['b'].isin(["a", "b", "c"])]
df.query('c == [1, 2]')
@@ -1348,10 +1338,10 @@ to ``in``/``not in``.
df.query('[1, 2] not in c')
# pure Python
- df[df.c.isin([1, 2])]
+ df[df['c'].isin([1, 2])]
-Boolean Operators
+Boolean operators
~~~~~~~~~~~~~~~~~
You can negate boolean expressions with the word ``not`` or the ``~`` operator.
@@ -1362,7 +1352,7 @@ You can negate boolean expressions with the word ``not`` or the ``~`` operator.
df['bools'] = np.random.rand(len(df)) > 0.5
df.query('~bools')
df.query('not bools')
- df.query('not bools') == df[~df.bools]
+ df.query('not bools') == df[~df['bools']]
Of course, expressions can be arbitrarily complex too:
@@ -1372,7 +1362,10 @@ Of course, expressions can be arbitrarily complex too:
shorter = df.query('a < b < c and (not bools) or bools > 2')
# equivalent in pure Python
- longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)]
+ longer = df[(df['a'] < df['b'])
+ & (df['b'] < df['c'])
+ & (~df['bools'])
+ | (df['bools'] > 2)]
shorter
longer
@@ -1407,7 +1400,7 @@ floating point values generated using ``numpy.random.randn()``.
df2 = df.copy()
-Duplicate Data
+Duplicate data
--------------
.. _indexing.duplicate:
@@ -1474,7 +1467,7 @@ default value.
s.get('a') # equivalent to s['a']
s.get('x', default=-1)
-The :meth:`~pandas.DataFrame.lookup` Method
+The :meth:`~pandas.DataFrame.lookup` method
-------------------------------------------
Sometimes you want to extract a set of values given a sequence of row labels
@@ -1559,11 +1552,11 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes.
index.levels[1]
index.set_levels(["a", "b"], level=1)
+.. _indexing.set_ops:
+
Set operations on Index objects
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. _indexing.set_ops:
-
The two main operations are ``union (|)`` and ``intersection (&)``.
These can be directly called as instance methods or used via overloaded
operators. Difference is provided via the ``.difference()`` method.
@@ -1592,11 +1585,22 @@ with duplicates dropped.
The resulting index from a set operation will be sorted in ascending order.
-Missing values
-~~~~~~~~~~~~~~
+When performing :meth:`Index.union` between indexes with different dtypes, the indexes
+must be cast to a common dtype. Typically, though not always, this is object dtype. The
+exception is when performing a union between integer and float data. In this case, the
+integer values are converted to float.
+
+.. ipython:: python
+
+ idx1 = pd.Index([0, 1, 2])
+ idx2 = pd.Index([0.5, 1.5])
+ idx1 | idx2
.. _indexing.missing:
+Missing values
+~~~~~~~~~~~~~~
+
.. important::
Even though ``Index`` can hold missing values (``NaN``), it should be avoided
@@ -1617,18 +1621,18 @@ Missing values
idx2
idx2.fillna(pd.Timestamp('2011-01-02'))
-Set / Reset Index
+Set / reset index
-----------------
Occasionally you will load or create a data set into a DataFrame and want to
add an index after you've already done so. There are a couple of different
ways.
+.. _indexing.set_index:
+
Set an index
~~~~~~~~~~~~
-.. _indexing.set_index:
-
DataFrame has a :meth:`~DataFrame.set_index` method which takes a column name
(for a regular ``Index``) or a list of column names (for a ``MultiIndex``).
To create a new, re-indexed DataFrame:
@@ -1834,14 +1838,14 @@ chained indexing expression, you can set the :ref:`option `
# This will show the SettingWithCopyWarning
# but the frame values will be set
- dfb['c'][dfb.a.str.startswith('o')] = 42
+ dfb['c'][dfb['a'].str.startswith('o')] = 42
This however is operating on a copy and will not work.
::
>>> pd.set_option('mode.chained_assignment','warn')
- >>> dfb[dfb.a.str.startswith('o')]['c'] = 42
+ >>> dfb[dfb['a'].str.startswith('o')]['c'] = 42
Traceback (most recent call last)
...
SettingWithCopyWarning:
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
index c5667e9319ca6..97b9c2f95dc50 100644
--- a/doc/source/user_guide/integer_na.rst
+++ b/doc/source/user_guide/integer_na.rst
@@ -5,7 +5,7 @@
.. _integer_na:
**************************
-Nullable Integer Data Type
+Nullable integer data type
**************************
.. versionadded:: 0.24.0
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 838018eda0a34..5a90ae23dc84d 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -13,7 +13,7 @@
===============================
-IO Tools (Text, CSV, HDF5, ...)
+IO tools (text, CSV, HDF5, ...)
===============================
The pandas I/O API is a set of top level ``reader`` functions accessed like
@@ -28,16 +28,19 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
:delim: ;
text;`CSV `__;:ref:`read_csv`;:ref:`to_csv`
+ text;Fixed-Width Text File;:ref:`read_fwf`
text;`JSON `__;:ref:`read_json`;:ref:`to_json`
text;`HTML `__;:ref:`read_html`;:ref:`to_html`
text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard`
binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel`
+ binary;`OpenDocument `__;:ref:`read_excel`;
binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf`
binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather`
binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet`
binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack`
binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata`
binary;`SAS `__;:ref:`read_sas`;
+ binary;`SPSS `__;:ref:`read_spss`;
binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle`
SQL;`SQL `__;:ref:`read_sql`;:ref:`to_sql`
SQL;`Google Big Query `__;:ref:`read_gbq`;:ref:`to_gbq`
@@ -51,7 +54,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
.. _io.read_csv_table:
-CSV & Text files
+CSV & text files
----------------
The workhorse function for reading text files (a.k.a. flat files) is
@@ -86,9 +89,7 @@ delim_whitespace : boolean, default False
If this option is set to ``True``, nothing should be passed in for the
``delimiter`` parameter.
- .. versionadded:: 0.18.1 support for the Python parser.
-
-Column and Index Locations and Names
+Column and index locations and names
++++++++++++++++++++++++++++++++++++
header : int or list of ints, default ``'infer'``
@@ -108,8 +109,7 @@ header : int or list of ints, default ``'infer'``
line of data rather than the first line of the file.
names : array-like, default ``None``
List of column names to use. If file contains no header row, then you should
- explicitly pass ``header=None``. Duplicates in this list will cause
- a ``UserWarning`` to be issued.
+ explicitly pass ``header=None``. Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
Column(s) to use as the row labels of the ``DataFrame``, either given as
string name or column index. If a sequence of int / str is given, a
@@ -155,7 +155,7 @@ mangle_dupe_cols : boolean, default ``True``
Passing in ``False`` will cause data to be overwritten if there are duplicate
names in the columns.
-General Parsing Configuration
+General parsing configuration
+++++++++++++++++++++++++++++
dtype : Type name or dict of column -> type, default ``None``
@@ -211,7 +211,7 @@ memory_map : boolean, default False
directly onto memory and access the data directly from there. Using this
option can improve performance because there is no longer any I/O overhead.
-NA and Missing Data Handling
+NA and missing data handling
++++++++++++++++++++++++++++
na_values : scalar, str, list-like, or dict, default ``None``
@@ -243,7 +243,7 @@ verbose : boolean, default ``False``
skip_blank_lines : boolean, default ``True``
If ``True``, skip over blank lines rather than interpreting as NaN values.
-Datetime Handling
+Datetime handling
+++++++++++++++++
parse_dates : boolean or list of ints or names or list of lists or dict, default ``False``.
@@ -263,7 +263,7 @@ keep_date_col : boolean, default ``False``
date_parser : function, default ``None``
Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses ``dateutil.parser.parser`` to do the
- conversion. Pandas will try to call date_parser in three different ways,
+ conversion. pandas will try to call date_parser in three different ways,
advancing to the next if an exception occurs: 1) Pass one or more arrays (as
defined by parse_dates) as arguments; 2) concatenate (row-wise) the string
values from the columns defined by parse_dates into a single array and pass
@@ -288,7 +288,7 @@ chunksize : int, default ``None``
Return `TextFileReader` object for iteration. See :ref:`iterating and chunking
` below.
-Quoting, Compression, and File Format
+Quoting, compression, and file format
+++++++++++++++++++++++++++++++++++++
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
@@ -298,7 +298,6 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``
the ZIP file must contain only one data file to be read in.
Set to ``None`` for no decompression.
- .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
.. versionchanged:: 0.24.0 'infer' option added and set to default.
thousands : str, default ``None``
Thousands separator.
@@ -340,15 +339,8 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None``
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
override values, a ParserWarning will be issued. See :class:`python:csv.Dialect`
documentation for more details.
-tupleize_cols : boolean, default ``False``
- .. deprecated:: 0.21.0
-
- This argument will be removed and will always convert to MultiIndex
- Leave a list of tuples on columns as is (default is to convert to a MultiIndex
- on the columns).
-
-Error Handling
+Error handling
++++++++++++++
error_bad_lines : boolean, default ``True``
@@ -460,11 +452,9 @@ worth trying.
.. _io.categorical:
-Specifying Categorical dtype
+Specifying categorical dtype
''''''''''''''''''''''''''''
-.. versionadded:: 0.19.0
-
``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or
``dtype=CategoricalDtype(categories, ordered)``.
@@ -488,7 +478,7 @@ specification:
.. versionadded:: 0.21.0
-Specifying ``dtype='cateogry'`` will result in an unordered ``Categorical``
+Specifying ``dtype='category'`` will result in an unordered ``Categorical``
whose ``categories`` are the unique values observed in the data. For more
control on the categories and order, create a
:class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for
@@ -529,7 +519,7 @@ This matches the behavior of :meth:`Categorical.set_categories`.
df['col3']
-Naming and Using Columns
+Naming and using columns
''''''''''''''''''''''''
.. _io.headers:
@@ -646,7 +636,7 @@ use in the final result:
In this case, the callable is specifying that we exclude the "a" and "c"
columns from the output.
-Comments and Empty Lines
+Comments and empty lines
''''''''''''''''''''''''
.. _io.skiplines:
@@ -759,7 +749,7 @@ We can suppress the comments using the ``comment`` keyword:
.. _io.unicode:
-Dealing with Unicode Data
+Dealing with Unicode data
'''''''''''''''''''''''''
The ``encoding`` argument should be used for encoded unicode data, which will
@@ -834,7 +824,7 @@ If a subset of data is being parsed using the ``usecols`` option, the
Date Handling
'''''''''''''
-Specifying Date Columns
+Specifying date columns
+++++++++++++++++++++++
To better facilitate working with datetime data, :func:`read_csv`
@@ -947,7 +937,7 @@ data columns:
specify `index_col` as a column label rather than as an index on the resulting frame.
-Date Parsing Functions
+Date parsing functions
++++++++++++++++++++++
Finally, the parser allows you to specify a custom ``date_parser`` function to
@@ -1001,7 +991,7 @@ a single date rather than the entire array.
.. _io.csv.mixed_timezones:
-Parsing a CSV with mixed Timezones
+Parsing a CSV with mixed timezones
++++++++++++++++++++++++++++++++++
Pandas cannot natively represent a column or index with mixed timezones. If your CSV
@@ -1031,7 +1021,7 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie
.. _io.dayfirst:
-Inferring Datetime Format
+Inferring datetime format
+++++++++++++++++++++++++
If you have ``parse_dates`` enabled for some or all of your columns, and your
@@ -1070,7 +1060,7 @@ Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With
os.remove('foo.csv')
-International Date Formats
+International date formats
++++++++++++++++++++++++++
While US date formats tend to be MM/DD/YYYY, many international formats use
@@ -1118,7 +1108,7 @@ writing to a file). For example:
.. _io.thousands:
-Thousand Separators
+Thousand separators
'''''''''''''''''''
For large numbers that have been written with a thousands separator, you can
@@ -1163,7 +1153,7 @@ The ``thousands`` keyword allows integers to be parsed correctly:
.. _io.na_values:
-NA Values
+NA values
'''''''''
To control which values are parsed as missing values (which are signified by
@@ -1383,9 +1373,10 @@ should pass the ``escapechar`` option:
print(data)
pd.read_csv(StringIO(data), escapechar='\\')
+.. _io.fwf_reader:
.. _io.fwf:
-Files with Fixed Width Columns
+Files with fixed width columns
''''''''''''''''''''''''''''''
While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works
@@ -1679,14 +1670,14 @@ S3 URLs are handled as well but require installing the `S3Fs
df = pd.read_csv('s3://pandas-test/tips.csv')
-If your S3 bucket requires cedentials you will need to set them as environment
+If your S3 bucket requires credentials you will need to set them as environment
variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs
documentation on credentials
`_.
-Writing out Data
+Writing out data
''''''''''''''''
.. _io.store_in_csv:
@@ -1718,8 +1709,6 @@ function takes a number of arguments. Only the first is required.
* ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when
appropriate (default None)
* ``chunksize``: Number of rows to write at a time
-* ``tupleize_cols``: If False (default), write as a list of tuples, otherwise
- write in an expanded line format suitable for ``read_csv``
* ``date_format``: Format string for datetime objects
Writing a formatted string
@@ -1805,7 +1794,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet
json = dfj.to_json()
json
-Orient Options
+Orient options
++++++++++++++
There are a number of different options for the format of the resulting JSON
@@ -1869,7 +1858,7 @@ preservation of metadata including but not limited to dtypes and index names.
index and column labels during round-trip serialization. If you wish to preserve
label ordering use the `split` option as it uses ordered containers.
-Date Handling
+Date handling
+++++++++++++
Writing in ISO date format:
@@ -1910,7 +1899,7 @@ Writing to a file, with a date index and a date column:
with open('test.json') as fh:
print(fh.read())
-Fallback Behavior
+Fallback behavior
+++++++++++++++++
If the JSON serializer cannot handle the container contents directly it will
@@ -2003,7 +1992,7 @@ If a non-default ``orient`` was used when encoding to JSON be sure to pass the s
option here so that decoding produces sensible results, see `Orient Options`_ for an
overview.
-Data Conversion
+Data conversion
+++++++++++++++
The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True``
@@ -2078,7 +2067,7 @@ Dates written in nanoseconds need to be read back in nanoseconds:
json = dfj2.to_json(date_unit='ns')
- # Try to parse timestamps as millseconds -> Won't Work
+ # Try to parse timestamps as milliseconds -> Won't Work
dfju = pd.read_json(json, date_unit='ms')
dfju
@@ -2090,7 +2079,7 @@ Dates written in nanoseconds need to be read back in nanoseconds:
dfju = pd.read_json(json, date_unit='ns')
dfju
-The Numpy Parameter
+The NumPy parameter
+++++++++++++++++++
.. note::
@@ -2186,13 +2175,24 @@ into a flat table.
json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
+The ``max_level`` parameter provides more control over which level to end normalization at.
+With ``max_level=1`` the following snippet normalizes until the first nesting level of the provided dict.
+
+.. ipython:: python
+
+ data = [{'CreatedBy': {'Name': 'User001'},
+ 'Lookup': {'TextField': 'Some text',
+ 'UserField': {'Id': 'ID001',
+ 'Name': 'Name001'}},
+ 'Image': {'a': 'b'}
+ }]
+ json_normalize(data, max_level=1)
+
.. _io.jsonl:
Line delimited json
'''''''''''''''''''
-.. versionadded:: 0.19.0
-
pandas is able to read and write line-delimited json files that are common in data processing pipelines
using Hadoop or Spark.
@@ -2218,7 +2218,7 @@ For line-delimited json files, pandas can also return an iterator which reads in
.. _io.table_schema:
-Table Schema
+Table schema
''''''''''''
.. versionadded:: 0.20.0
@@ -2378,7 +2378,7 @@ HTML
.. _io.read_html:
-Reading HTML Content
+Reading HTML content
''''''''''''''''''''''
.. warning::
@@ -2490,16 +2490,12 @@ Specify values that should be converted to NaN:
dfs = pd.read_html(url, na_values=['No Acquirer'])
-.. versionadded:: 0.19
-
Specify whether to keep the default set of NaN values:
.. code-block:: python
dfs = pd.read_html(url, keep_default_na=False)
-.. versionadded:: 0.19
-
Specify converters for columns. This is useful for numerical text data that has
leading zeros. By default columns that are numerical are cast to numeric
types and the leading zeros are lost. To avoid this, we can convert these
@@ -2511,8 +2507,6 @@ columns to strings.
dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0,
converters={'MNC': str})
-.. versionadded:: 0.19
-
Use some combination of the above:
.. code-block:: python
@@ -2788,16 +2782,17 @@ parse HTML tables in the top-level pandas io function ``read_html``.
Excel files
-----------
-The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) and
-Excel 2007+ (``.xlsx``) files using the ``xlrd`` Python
-module. The :meth:`~DataFrame.to_excel` instance method is used for
+The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``)
+files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files
+can be read using either ``xlrd`` or ``openpyxl``.
+The :meth:`~DataFrame.to_excel` instance method is used for
saving a ``DataFrame`` to Excel. Generally the semantics are
similar to working with :ref:`csv` data.
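
The reader engine for ``.xlsx`` files can be selected explicitly with the
``engine`` argument; a minimal sketch (the file name is only a placeholder):

.. code-block:: python

 pd.read_excel('path_to_file.xlsx', engine='openpyxl')
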
See the :ref:`cookbook` for some advanced strategies.
.. _io.excel_reader:
-Reading Excel Files
+Reading Excel files
'''''''''''''''''''
In the most basic use-case, ``read_excel`` takes a path to an Excel
@@ -2879,7 +2874,7 @@ with ``on_demand=True``.
.. _io.excel.specifying_sheets:
-Specifying Sheets
+Specifying sheets
+++++++++++++++++
.. note :: The second argument is ``sheet_name``, not to be confused with ``ExcelFile.sheet_names``.
@@ -2980,7 +2975,7 @@ should be passed to ``index_col`` and ``header``:
os.remove('path_to_file.xlsx')
-Parsing Specific Columns
+Parsing specific columns
++++++++++++++++++++++++
It is often the case that users will insert columns to do temporary computations
@@ -3035,7 +3030,7 @@ the column names, returning names where the callable function evaluates to ``Tru
pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha())
-Parsing Dates
+Parsing dates
+++++++++++++
Datetime-like values are normally automatically converted to the appropriate
@@ -3048,7 +3043,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes:
pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings'])
-Cell Converters
+Cell converters
+++++++++++++++
It is possible to transform the contents of Excel cells via the ``converters``
@@ -3073,7 +3068,7 @@ missing data to recover integer dtype:
pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
-dtype Specifications
+Dtype specifications
++++++++++++++++++++
.. versionadded:: 0.20
@@ -3089,10 +3084,10 @@ no type inference, use the type ``str`` or ``object``.
.. _io.excel_writer:
-Writing Excel Files
+Writing Excel files
'''''''''''''''''''
-Writing Excel Files to Disk
+Writing Excel files to disk
+++++++++++++++++++++++++++
To write a ``DataFrame`` object to a sheet of an Excel file, you can use the
@@ -3138,7 +3133,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
.. _io.excel_writing_buffer:
-Writing Excel Files to Memory
+Writing Excel files to memory
+++++++++++++++++++++++++++++
Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or
@@ -3218,7 +3213,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are:
.. _io.excel.style:
-Style and Formatting
+Style and formatting
''''''''''''''''''''
The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method.
@@ -3226,7 +3221,31 @@ The look and feel of Excel worksheets created from pandas can be modified using
* ``float_format`` : Format string for floating point numbers (default ``None``).
* ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``).
+Using the `Xlsxwriter`_ engine provides many options for controlling the
+format of an Excel worksheet created with the ``to_excel`` method. Excellent examples can be found in the
+`Xlsxwriter`_ documentation here: https://xlsxwriter.readthedocs.io/working_with_pandas.html
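+
+A minimal sketch (the file name is only a placeholder) combining the
+``xlsxwriter`` engine with the formatting parameters listed above:
+
+.. code-block:: python
+
+ df = pd.DataFrame({'A': [1.23456, 2.34567], 'B': [3.45678, 4.56789]})
+ # requires the optional xlsxwriter package
+ df.to_excel('styled.xlsx', engine='xlsxwriter', float_format='%.2f')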
+
+.. _io.ods:
+
+OpenDocument Spreadsheets
+-------------------------
+
+.. versionadded:: 0.25
+
+The :func:`~pandas.read_excel` method can also read OpenDocument spreadsheets
+using the ``odfpy`` module. The semantics and features for reading
+OpenDocument spreadsheets match what can be done for `Excel files`_ using
+``engine='odf'``.
+
+.. code-block:: python
+
+ # Returns a DataFrame
+ pd.read_excel('path_to_file.ods', engine='odf')
+
+.. note::
+
+ Currently pandas only supports *reading* OpenDocument spreadsheets. Writing
+ is not implemented.
.. _io.clipboard:
@@ -3249,24 +3268,35 @@ And then import the data directly to a ``DataFrame`` by calling:
.. code-block:: python
- clipdf = pd.read_clipboard()
-
-.. ipython:: python
-
- clipdf
-
+ >>> clipdf = pd.read_clipboard()
+ >>> clipdf
+ A B C
+ x 1 4 p
+ y 2 5 q
+ z 3 6 r
The ``to_clipboard`` method can be used to write the contents of a ``DataFrame`` to
the clipboard. Following which you can paste the clipboard contents into other
applications (CTRL-V on many operating systems). Here we illustrate writing a
``DataFrame`` into clipboard and reading it back.
-.. ipython:: python
+.. code-block:: python
- df = pd.DataFrame(np.random.randn(5, 3))
- df
- df.to_clipboard()
- pd.read_clipboard()
+ >>> df = pd.DataFrame({'A': [1, 2, 3],
+ ... 'B': [4, 5, 6],
+ ... 'C': ['p', 'q', 'r']},
+ ... index=['x', 'y', 'z'])
+ >>> df
+ A B C
+ x 1 4 p
+ y 2 5 q
+ z 3 6 r
+ >>> df.to_clipboard()
+ >>> pd.read_clipboard()
+ A B C
+ x 1 4 p
+ y 2 5 q
+ z 3 6 r
We can see that we got the same content back, which we had earlier written to the clipboard.
@@ -3308,16 +3338,7 @@ any pickled pandas object (or any other pickled object) from file:
.. warning::
- Several internal refactoring have been done while still preserving
- compatibility with pickles created with older versions of pandas. However,
- for such cases, pickled ``DataFrames``, ``Series`` etc, must be read with
- ``pd.read_pickle``, rather than ``pickle.load``.
-
- See `here `__
- and `here `__
- for some examples of compatibility-breaking changes. See
- `this question `__
- for a detailed explanation.
+ :func:`read_pickle` is only guaranteed to be backwards compatible to pandas version 0.20.3.
.. _io.pickle.compression:
@@ -3391,11 +3412,15 @@ both on the writing (serialization), and reading (deserialization).
.. warning::
- This is a very new feature of pandas. We intend to provide certain
- optimizations in the io of the ``msgpack`` data. Since this is marked
- as an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release.
+ The msgpack format is deprecated as of 0.25 and will be removed in a future version.
+ It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
+
+.. warning::
+
+ :func:`read_msgpack` is only guaranteed to be backwards compatible to pandas version 0.20.3.
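+
+A minimal sketch of the recommended pyarrow route (assuming ``pyarrow`` is
+installed; the file name is only a placeholder):
+
+.. code-block:: python
+
+ import pyarrow as pa
+
+ df = pd.DataFrame({'A': [1, 2, 3]})
+
+ # convert the DataFrame to an Arrow table and write it in the IPC file format
+ table = pa.Table.from_pandas(df)
+ with pa.OSFile('frame.arrow', 'wb') as sink:
+     with pa.RecordBatchFileWriter(sink, table.schema) as writer:
+         writer.write_table(table)
+
+ # read it back into a DataFrame
+ with pa.memory_map('frame.arrow') as source:
+     restored = pa.RecordBatchFileReader(source).read_all().to_pandas()
+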
.. ipython:: python
+ :okwarning:
df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB'))
df.to_msgpack('foo.msg')
@@ -3405,6 +3430,7 @@ both on the writing (serialization), and reading (deserialization).
You can pass a list of objects and you will receive them back on deserialization.
.. ipython:: python
+ :okwarning:
pd.to_msgpack('foo.msg', df, 'foo', np.array([1, 2, 3]), s)
pd.read_msgpack('foo.msg')
@@ -3412,6 +3438,7 @@ You can pass a list of objects and you will receive them back on deserialization
You can pass ``iterator=True`` to iterate over the unpacked results:
.. ipython:: python
+ :okwarning:
for o in pd.read_msgpack('foo.msg', iterator=True):
print(o)
@@ -3419,6 +3446,7 @@ You can pass ``iterator=True`` to iterate over the unpacked results:
You can pass ``append=True`` to the writer to append to an existing pack:
.. ipython:: python
+ :okwarning:
df.to_msgpack('foo.msg', append=True)
pd.read_msgpack('foo.msg')
@@ -3429,6 +3457,7 @@ can pack arbitrary collections of Python lists, dicts, scalars, while intermixin
pandas objects.
.. ipython:: python
+ :okwarning:
pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'},
{'scalar': 1.}, {'s': s}]})
@@ -3441,20 +3470,22 @@ pandas objects.
os.remove('foo.msg')
os.remove('foo2.msg')
-Read/Write API
+Read/write API
''''''''''''''
Msgpacks can also be read from and written to strings.
.. ipython:: python
+ :okwarning:
df.to_msgpack()
Furthermore you can concatenate the strings to produce a list of the original objects.
.. ipython:: python
+ :okwarning:
- pd.read_msgpack(df.to_msgpack() + s.to_msgpack())
+ pd.read_msgpack(df.to_msgpack() + s.to_msgpack())
.. _io.hdf5:
@@ -3540,10 +3571,10 @@ Closing a Store and using a context manager:
-Read/Write API
+Read/write API
''''''''''''''
-``HDFStore`` supports an top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing,
+``HDFStore`` supports a top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing,
similar to how ``read_csv`` and ``to_csv`` work.
.. ipython:: python
@@ -3586,7 +3617,7 @@ HDFStore will by default not drop rows that are all missing. This behavior can b
.. _io.hdf5-fixed:
-Fixed Format
+Fixed format
''''''''''''
The examples above show storing using ``put``, which write the HDF5 to ``PyTables`` in a fixed array format, called
@@ -3610,7 +3641,7 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for
.. _io.hdf5-table:
-Table Format
+Table format
''''''''''''
``HDFStore`` supports another ``PyTables`` format on disk, the ``table``
@@ -3652,13 +3683,13 @@ enable ``put/append/to_hdf`` to by default store in the ``table`` format.
.. _io.hdf5-keys:
-Hierarchical Keys
+Hierarchical keys
'''''''''''''''''
Keys to a store can be specified as a string. These can be in a
hierarchical path-name like format (e.g. ``foo/bar/bah``), which will
generate a hierarchy of sub-stores (or ``Groups`` in PyTables
-parlance). Keys can be specified with out the leading '/' and are **always**
+parlance). Keys can be specified without the leading '/' and are **always**
absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove
everything in the sub-store and **below**, so be *careful*.
@@ -3719,10 +3750,10 @@ will yield a tuple for each group key along with the relative keys of its conten
.. _io.hdf5-types:
-Storing Types
+Storing types
'''''''''''''
-Storing Mixed Types in a Table
+Storing mixed types in a table
++++++++++++++++++++++++++++++
Storing mixed-dtype data is supported. Strings are stored as a
@@ -3752,7 +3783,7 @@ defaults to `nan`.
store.append('df_mixed', df_mixed, min_itemsize={'values': 50})
df_mixed1 = store.select('df_mixed')
df_mixed1
- df_mixed1.get_dtype_counts()
+ df_mixed1.dtypes.value_counts()
# we have provided a minimum string column size
store.root.df_mixed.table
@@ -3786,7 +3817,7 @@ storing/selecting from homogeneous index ``DataFrames``.
Querying
''''''''
-Querying a Table
+Querying a table
++++++++++++++++
``select`` and ``delete`` operations have an optional criterion that can
@@ -3796,7 +3827,7 @@ data.
A query is specified using the ``Term`` class under the hood, as a boolean expression.
-* ``index`` and ``columns`` are supported indexers of a ``DataFrames``.
+* ``index`` and ``columns`` are supported indexers of ``DataFrames``.
* if ``data_columns`` are specified, these can be used as additional indexers.
Valid comparison operators are:
@@ -3888,7 +3919,7 @@ Use boolean expressions, with in-line function evaluation.
store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']")
-Use and inline column reference
+Use inline column reference.
.. ipython:: python
@@ -3992,7 +4023,7 @@ See `here `` to the first ``append``,
- to set the TOTAL number of expected rows that ``PyTables`` will
- expected. This will optimize read/write performance.
+ to set the TOTAL number of rows that ``PyTables`` will expect.
+ This will optimize read/write performance.
* Duplicate rows can be written to tables, but are filtered out in
selection (with the last items being selected; thus a table is
unique on major, minor pairs)
@@ -4703,6 +4734,7 @@ See the documentation for `pyarrow `__ an
Write to a parquet file.
.. ipython:: python
+ :okwarning:
df.to_parquet('example_pa.parquet', engine='pyarrow')
df.to_parquet('example_fp.parquet', engine='fastparquet')
@@ -4720,6 +4752,7 @@ Read from a parquet file.
Read only certain columns of a parquet file.
.. ipython:: python
+ :okwarning:
result = pd.read_parquet('example_fp.parquet',
engine='fastparquet', columns=['a', 'b'])
@@ -4735,13 +4768,14 @@ Read only certain columns of a parquet file.
os.remove('example_fp.parquet')
-Handling Indexes
+Handling indexes
''''''''''''''''
Serializing a ``DataFrame`` to parquet may include the implicit index as one or
more columns in the output file. Thus, this code:
.. ipython:: python
+ :okwarning:
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df.to_parquet('test.parquet', engine='pyarrow')
@@ -4758,6 +4792,7 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to
:func:`~pandas.DataFrame.to_parquet`:
.. ipython:: python
+ :okwarning:
df.to_parquet('test.parquet', index=False)
@@ -4815,7 +4850,7 @@ The above example creates a partitioned dataset that may look like:
.. _io.sql:
-SQL Queries
+SQL queries
-----------
The :mod:`pandas.io.sql` module provides a collection of query wrappers to both
@@ -4984,7 +5019,7 @@ will convert the data to UTC.
.. _io.sql.method:
-Insertion Method
+Insertion method
++++++++++++++++
.. versionadded:: 0.24.0
@@ -5041,7 +5076,7 @@ Example of a callable using PostgreSQL `COPY clause
table_name, columns)
cur.copy_expert(sql=sql, file=s_buf)
-Reading Tables
+Reading tables
''''''''''''''
:func:`~pandas.read_sql_table` will read a database table given the
@@ -5261,12 +5296,12 @@ Full documentation can be found `here `__.
.. _io.stata:
-Stata Format
+Stata format
------------
.. _io.stata_writer:
-Writing to Stata format
+Writing to Stata format
'''''''''''''''''''''''
The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame
@@ -5380,7 +5415,7 @@ values will have ``object`` data type.
.. _io.stata-categorical:
-Categorical Data
+Categorical data
++++++++++++++++
``Categorical`` data can be exported to *Stata* data files as value labeled data.
@@ -5426,7 +5461,7 @@ whether imported ``Categorical`` variables are ordered.
.. _io.sas_reader:
-SAS Formats
+SAS formats
-----------
The top-level function :func:`read_sas` can read (but not write) SAS
@@ -5468,6 +5503,43 @@ web site.
No official documentation is available for the SAS7BDAT format.
+.. _io.spss:
+
+.. _io.spss_reader:
+
+SPSS formats
+------------
+
+.. versionadded:: 0.25.0
+
+The top-level function :func:`read_spss` can read (but not write) SPSS
+`sav` (.sav) and `zsav` (.zsav) format files.
+
+SPSS files contain column names. By default the
+whole file is read, categorical columns are converted into ``pd.Categorical``,
+and a ``DataFrame`` with all columns is returned.
+
+Specify the ``usecols`` parameter to obtain a subset of columns. Specify ``convert_categoricals=False``
+to avoid converting categorical columns into ``pd.Categorical``.
+
+Read an SPSS file:
+
+.. code-block:: python
+
+ df = pd.read_spss('spss_data.sav')
+
+Extract a subset of columns contained in ``usecols`` from an SPSS file and
+avoid converting categorical columns into ``pd.Categorical``:
+
+.. code-block:: python
+
+ df = pd.read_spss('spss_data.sav', usecols=['foo', 'bar'],
+ convert_categoricals=False)
+
+More information about the `sav` and `zsav` file formats is available here_.
+
+.. _here: https://www.ibm.com/support/knowledgecenter/en/SSLVMB_22.0.0/com.ibm.spss.statistics.help/spss/base/savedatatypes.htm
+
.. _io.other:
Other file formats
@@ -5488,7 +5560,7 @@ easy conversion to and from pandas.
.. _io.perf:
-Performance Considerations
+Performance considerations
--------------------------
This is an informal comparison of various IO methods, using pandas
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index 25c486c839b7f..4c0d3b75a4f79 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -70,9 +70,8 @@ some configurable handling of "what to do with the other axes":
::
- pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
- keys=None, levels=None, names=None, verify_integrity=False,
- copy=True)
+ pd.concat(objs, axis=0, join='outer', ignore_index=False, keys=None,
+ levels=None, names=None, verify_integrity=False, copy=True)
* ``objs`` : a sequence or mapping of Series or DataFrame objects. If a
dict is passed, the sorted keys will be used as the `keys` argument, unless
@@ -87,8 +86,6 @@ some configurable handling of "what to do with the other axes":
n - 1. This is useful if you are concatenating objects where the
concatenation axis does not have meaningful indexing information. Note
the index values on the other axes are still respected in the join.
-* ``join_axes`` : list of Index objects. Specific indexes to use for the other
- n - 1 axes instead of performing inner/outer set logic.
* ``keys`` : sequence, default None. Construct hierarchical index using the
passed keys as the outermost level. If multiple levels passed, should
contain tuples.
@@ -147,12 +144,11 @@ Set logic on the other axes
When gluing together multiple DataFrames, you have a choice of how to handle
the other axes (other than the one being concatenated). This can be done in
-the following three ways:
+the following two ways:
* Take the union of them all, ``join='outer'``. This is the default
option as it results in zero information loss.
* Take the intersection, ``join='inner'``.
-* Use a specific index, as passed to the ``join_axes`` argument.
Here is an example of each of these methods. First, the default ``join='outer'``
behavior:
@@ -202,7 +198,13 @@ DataFrame:
.. ipython:: python
- result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])
+ result = pd.concat([df1, df4], axis=1).reindex(df1.index)
+
+Similarly, we could reindex ``df4`` before the concatenation:
+
+.. ipython:: python
+
+ pd.concat([df1, df4.reindex(df1.index)], axis=1)
.. ipython:: python
:suppress:
@@ -814,11 +816,9 @@ The ``indicator`` argument will also accept string arguments, in which case the
.. _merging.dtypes:
-Merge Dtypes
+Merge dtypes
~~~~~~~~~~~~
-.. versionadded:: 0.19.0
-
Merging will preserve the dtype of the join keys.
.. ipython:: python
@@ -1361,7 +1361,7 @@ Timeseries friendly merging
.. _merging.merge_ordered:
-Merging Ordered Data
+Merging ordered data
~~~~~~~~~~~~~~~~~~~~
A :func:`merge_ordered` function allows combining time series and other
@@ -1381,11 +1381,9 @@ fill/interpolate missing data:
.. _merging.merge_asof:
-Merging AsOf
+Merging asof
~~~~~~~~~~~~
-.. versionadded:: 0.19.0
-
A :func:`merge_asof` is similar to an ordered left-join except that we match on
nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``,
we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index 7883814e91c94..6c36a6470f841 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -74,7 +74,7 @@ Series and DataFrame objects:
df2['one'] == np.nan
-Integer Dtypes and Missing Data
+Integer dtypes and missing data
-------------------------------
Because ``NaN`` is a float, a column of integers with even one missing values
@@ -105,7 +105,7 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``.
df2
df2.loc[['a', 'c', 'h'], ['one', 'timestamp']] = np.nan
df2
- df2.get_dtype_counts()
+ df2.dtypes.value_counts()
.. _missing.inserting:
@@ -175,7 +175,7 @@ account for missing data. For example:
.. _missing_data.numeric_sum:
-Sum/Prod of Empties/Nans
+Sum/prod of empties/nans
~~~~~~~~~~~~~~~~~~~~~~~~
.. warning::
@@ -348,7 +348,8 @@ that, by default, performs linear interpolation at missing data points.
np.random.seed(123456)
idx = pd.date_range('1/1/2000', periods=100, freq='BM')
ts = pd.Series(np.random.randn(100), index=idx)
- ts[1:20] = np.nan
+ ts[1:5] = np.nan
+ ts[20:30] = np.nan
ts[60:80] = np.nan
ts = ts.cumsum()
@@ -356,6 +357,12 @@ that, by default, performs linear interpolation at missing data points.
ts
ts.count()
+ @savefig series_before_interpolate.png
+ ts.plot()
+
+.. ipython:: python
+
+ ts.interpolate()
ts.interpolate().count()
@savefig series_interpolate.png
@@ -435,9 +442,9 @@ Compare several methods:
np.random.seed(2)
- ser = pd.Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37))
- bad = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29])
- ser[bad] = np.nan
+ ser = pd.Series(np.arange(1, 10.1, .25) ** 2 + np.random.randn(37))
+ missing = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29])
+ ser[missing] = np.nan
methods = ['linear', 'quadratic', 'cubic']
df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods})
@@ -465,8 +472,8 @@ at the new values.
.. _missing_data.interp_limits:
-Interpolation Limits
-^^^^^^^^^^^^^^^^^^^^
+Interpolation limits
+--------------------
Like other pandas fill methods, :meth:`~DataFrame.interpolate` accepts a ``limit`` keyword
argument. Use this argument to limit the number of consecutive ``NaN`` values
@@ -476,6 +483,7 @@ filled since the last valid observation:
ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan,
np.nan, 13, np.nan, np.nan])
+ ser
# fill all consecutive values in a forward direction
ser.interpolate()
@@ -514,7 +522,7 @@ the ``limit_area`` parameter restricts filling to either inside or outside value
.. _missing_data.replace:
-Replacing Generic Values
+Replacing generic values
~~~~~~~~~~~~~~~~~~~~~~~~
Often times we want to replace arbitrary values with other values.
@@ -559,7 +567,7 @@ missing and interpolate over them:
.. _missing_data.replace_expression:
-String/Regular Expression Replacement
+String/regular expression replacement
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. note::
@@ -597,7 +605,7 @@ list of regex -> list of regex:
.. ipython:: python
- df.replace([r'\.', r'(a)'], ['dot', '\1stuff'], regex=True)
+ df.replace([r'\.', r'(a)'], ['dot', r'\1stuff'], regex=True)
Only search in column ``'b'`` (dict -> dict):
@@ -655,7 +663,7 @@ want to use a regular expression.
Anywhere in the above ``replace`` examples that you see a regular expression
a compiled regular expression is valid as well.
-Numeric Replacement
+Numeric replacement
~~~~~~~~~~~~~~~~~~~
:meth:`~DataFrame.replace` is similar to :meth:`~DataFrame.fillna`.
diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst
index 4b466c2c44d49..f32a8adfd4d33 100644
--- a/doc/source/user_guide/options.rst
+++ b/doc/source/user_guide/options.rst
@@ -3,7 +3,7 @@
{{ header }}
********************
-Options and Settings
+Options and settings
********************
Overview
@@ -68,7 +68,7 @@ with no argument ``describe_option`` will print out the descriptions for all ava
pd.reset_option("all")
-Getting and Setting Options
+Getting and setting options
---------------------------
As described above, :func:`~pandas.get_option` and :func:`~pandas.set_option`
@@ -120,10 +120,10 @@ are restored automatically when you exit the `with` block:
print(pd.get_option("display.max_columns"))
-Setting Startup Options in python/ipython Environment
+Setting startup options in Python/IPython environment
-----------------------------------------------------
-Using startup scripts for the python/ipython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default ipython profile can be found at:
+Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default IPython profile can be found at:
.. code-block:: none
@@ -157,6 +157,22 @@ lines are replaced by an ellipsis.
df
pd.reset_option('max_rows')
+Once ``display.max_rows`` is exceeded, the ``display.min_rows`` option
+determines how many rows are shown in the truncated repr.
+
+.. ipython:: python
+
+ pd.set_option('max_rows', 8)
+ pd.set_option('min_rows', 4)
+ # below max_rows -> all rows shown
+ df = pd.DataFrame(np.random.randn(7, 2))
+ df
+ # above max_rows -> only min_rows (4) rows shown
+ df = pd.DataFrame(np.random.randn(9, 2))
+ df
+ pd.reset_option('max_rows')
+ pd.reset_option('min_rows')
+
``display.expand_frame_repr`` allows for the representation of
dataframes to stretch across pages, wrapped over the full column vs row-wise.
@@ -266,7 +282,7 @@ The options are 'right', and 'left'.
.. _options.available:
-Available Options
+Available options
-----------------
======================================= ============ ==================================
@@ -352,8 +368,12 @@ display.max_rows 60 This sets the maximum numbe
out various output. For example,
this value determines whether the
repr() for a dataframe prints out
- fully or just a summary repr.
+ fully or just a truncated or summary repr.
'None' value means unlimited.
+display.min_rows 10 The number of rows to show in a truncated
+ repr (when `max_rows` is exceeded). Ignored
+ when `max_rows` is set to None or 0. When set
+ to None, follows the value of `max_rows`.
display.max_seq_items 100 when pretty-printing a long sequence,
no more than `max_seq_items` will
be printed. If items are omitted,
@@ -431,6 +451,12 @@ compute.use_bottleneck True Use the bottleneck library
computation if it is installed.
compute.use_numexpr True Use the numexpr library to accelerate
computation if it is installed.
+plotting.backend matplotlib Change the plotting backend to a different
+ backend than the current matplotlib one.
+ Backends can be implemented as third-party
+ libraries implementing the pandas plotting
+ API. They can use other plotting libraries
+ like Bokeh, Altair, etc.
plotting.matplotlib.register_converters True Register custom converters with
matplotlib. Set to False to de-register.
======================================= ============ ==================================
@@ -438,7 +464,7 @@ plotting.matplotlib.register_converters True Register custom converters
.. _basics.console_output:
-Number Formatting
+Number formatting
------------------
pandas also allows you to set how numbers are displayed in the console.
@@ -469,7 +495,7 @@ To round floats on a case-by-case basis, you can also use :meth:`~pandas.Series.
.. _options.east_asian_width:
-Unicode Formatting
+Unicode formatting
------------------
.. warning::
@@ -532,7 +558,7 @@ However, setting this option incorrectly for your terminal will cause these char
.. _options.table_schema:
-Table Schema Display
+Table schema display
--------------------
.. versionadded:: 0.20.0
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
index 28bf46cd4c583..dd6d3062a8f0a 100644
--- a/doc/source/user_guide/reshaping.rst
+++ b/doc/source/user_guide/reshaping.rst
@@ -3,7 +3,7 @@
{{ header }}
**************************
-Reshaping and Pivot Tables
+Reshaping and pivot tables
**************************
Reshaping by pivoting DataFrame objects
@@ -186,7 +186,7 @@ removed.
.. _reshaping.stack_multiple:
-Multiple Levels
+Multiple levels
~~~~~~~~~~~~~~~
You may also stack or unstack more than one level at a time by passing a list
@@ -214,7 +214,7 @@ not a mixture of the two).
# from above is equivalent to:
df.stack(level=[1, 2])
-Missing Data
+Missing data
~~~~~~~~~~~~
These functions are intelligent about handling missing data and do not expect
@@ -254,8 +254,6 @@ values will be set to ``NaN``.
df3
df3.unstack()
-.. versionadded:: 0.18.0
-
Alternatively, unstack takes an optional ``fill_value`` argument, for specifying
the value of missing data.
@@ -471,7 +469,7 @@ If ``crosstab`` receives only two Series, it will provide a frequency table.
'C': [1, 1, np.nan, 1, 1]})
df
- pd.crosstab(df.A, df.B)
+ pd.crosstab(df['A'], df['B'])
Any input passed containing ``Categorical`` data will have **all** of its
categories included in the cross-tabulation, even if the actual data does
@@ -486,20 +484,18 @@ not contain any instances of a particular category.
Normalization
~~~~~~~~~~~~~
-.. versionadded:: 0.18.1
-
Frequency tables can also be normalized to show percentages rather than counts
using the ``normalize`` argument:
.. ipython:: python
- pd.crosstab(df.A, df.B, normalize=True)
+ pd.crosstab(df['A'], df['B'], normalize=True)
``normalize`` can also normalize values within each row or within each column:
.. ipython:: python
- pd.crosstab(df.A, df.B, normalize='columns')
+ pd.crosstab(df['A'], df['B'], normalize='columns')
``crosstab`` can also be passed a third ``Series`` and an aggregation function
(``aggfunc``) that will be applied to the values of the third ``Series`` within
@@ -507,16 +503,16 @@ each group defined by the first two ``Series``:
.. ipython:: python
- pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum)
+ pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum)
-Adding Margins
+Adding margins
~~~~~~~~~~~~~~
Finally, one can also add margins or normalize this output.
.. ipython:: python
- pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum, normalize=True,
+ pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True,
margins=True)
.. _reshaping.tile:
@@ -630,8 +626,6 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways:
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
from_dict
-.. versionadded:: 0.18.0
-
Sometimes it will be useful to only keep k-1 levels of a categorical
variable to avoid collinearity when feeding the result to statistical models.
You can switch to this mode by turn on ``drop_first``.
@@ -705,7 +699,7 @@ handling of NaN:
you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or
``df["cat_col"] = df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`,
see the :ref:`Categorical introduction ` and the
- :ref:`API documentation `.
+ :ref:`API documentation `.
Examples
--------
@@ -727,7 +721,7 @@ DataFrame will be pivoted in the answers below.
df
-Pivoting with Single Aggregations
+Pivoting with single aggregations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Suppose we wanted to pivot ``df`` such that the ``col`` values are columns,
@@ -775,7 +769,7 @@ and rows occur together a.k.a. "cross tabulation". To do this, we can pass
df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size')
-Pivoting with Multiple Aggregations
+Pivoting with multiple aggregations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We can also perform multiple aggregations. For example, to perform both a
@@ -801,3 +795,53 @@ Note to subdivide over multiple columns we can pass in a list to the
df.pivot_table(
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
+
+.. _reshaping.explode:
+
+Exploding a list-like column
+----------------------------
+
+.. versionadded:: 0.25.0
+
+Sometimes the values in a column are list-like.
+
+.. ipython:: python
+
+ keys = ['panda1', 'panda2', 'panda3']
+ values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
+ df = pd.DataFrame({'keys': keys, 'values': values})
+ df
+
+We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row:
+
+.. ipython:: python
+
+ df['values'].explode()
+
+You can also explode the column in the ``DataFrame``.
+
+.. ipython:: python
+
+ df.explode('values')
+
+:meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``.
+
+.. ipython:: python
+
+ s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']])
+ s
+ s.explode()
+
+Here is a typical use case. You have comma-separated strings in a column and want to expand this.
+
+.. ipython:: python
+
+ df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
+ {'var1': 'd,e,f', 'var2': 2}])
+ df
+
+Creating a long-form ``DataFrame`` is now straightforward using ``explode`` and chained operations:
+
+.. ipython:: python
+
+ df.assign(var1=df.var1.str.split(',')).explode('var1')
diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst
index 09ed895a847ff..98fd30f67d05b 100644
--- a/doc/source/user_guide/sparse.rst
+++ b/doc/source/user_guide/sparse.rst
@@ -77,6 +77,8 @@ A sparse array can be converted to a regular (dense) ndarray with :meth:`numpy.a
np.asarray(sparr)
+.. _sparse.dtype:
+
SparseDtype
-----------
@@ -114,7 +116,7 @@ in many places
.. _sparse.accessor:
-Sparse Accessor
+Sparse accessor
---------------
.. versionadded:: 0.24.0
@@ -140,7 +142,7 @@ See :ref:`api.frame.sparse` for more.
.. _sparse.calculation:
-Sparse Calculation
+Sparse calculation
------------------
You can apply NumPy `ufuncs `_
@@ -237,7 +239,7 @@ Sparse-specific properties, like ``density``, are available on the ``.sparse`` a
df.sparse.density
-**General Differences**
+**General differences**
In a ``SparseDataFrame``, *all* columns were sparse. A :class:`DataFrame` can have a mixture of
sparse and dense columns. As a consequence, assigning new columns to a ``DataFrame`` with sparse
@@ -368,7 +370,7 @@ row and columns coordinates of the matrix. Note that this will consume a signifi
.. _sparse.subclasses:
-Sparse Subclasses
+Sparse subclasses
-----------------
The :class:`SparseSeries` and :class:`SparseDataFrame` classes are deprecated. Visit their
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
index 79a9848704eec..006f928c037bd 100644
--- a/doc/source/user_guide/style.ipynb
+++ b/doc/source/user_guide/style.ipynb
@@ -6,10 +6,6 @@
"source": [
"# Styling\n",
"\n",
- "*New in version 0.17.1*\n",
- "\n",
- "*Provisional: This is a new feature and still under development. We'll be adding features and possibly making breaking changes in future releases. We'd love to hear your feedback.*\n",
- "\n",
"This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/style.ipynb).\n",
"\n",
"You can apply **conditional formatting**, the visual styling of a DataFrame\n",
@@ -26,7 +22,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Building Styles\n",
+ "## Building styles\n",
"\n",
"Pass your style functions into one of the following methods:\n",
"\n",
@@ -297,7 +293,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Finer Control: Slicing"
+ "## Finer control: slicing"
]
},
{
@@ -410,7 +406,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Builtin Styles"
+ "## Builtin styles"
]
},
{
@@ -612,7 +608,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Sharing Styles"
+ "## Sharing styles"
]
},
{
@@ -754,7 +750,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Table Styles"
+ "### Table styles"
]
},
{
@@ -840,7 +836,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### CSS Classes\n",
+ "### CSS classes\n",
"\n",
"Certain CSS classes are attached to cells.\n",
"\n",
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index 87c75e8bcd91f..acb5810e5252a 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -3,7 +3,7 @@
{{ header }}
======================
-Working with Text Data
+Working with text data
======================
.. _text.string_methods:
@@ -81,7 +81,7 @@ and replacing any remaining whitespaces with underscores:
exceptions, other uses are not supported, and may be disabled at a later point.
-Splitting and Replacing Strings
+Splitting and replacing strings
-------------------------------
.. _text.split:
@@ -356,7 +356,7 @@ of the string, the result will be a ``NaN``.
s.str[0]
s.str[1]
-Extracting Substrings
+Extracting substrings
---------------------
.. _text.extract:
@@ -366,13 +366,12 @@ Extract first match in each subject (extract)
.. warning::
- In version 0.18.0, ``extract`` gained the ``expand`` argument. When
- ``expand=False`` it returns a ``Series``, ``Index``, or
+ Before version 0.23, argument ``expand`` of the ``extract`` method defaulted to
+ ``False``. When ``expand=False``, it returns a ``Series``, ``Index``, or
``DataFrame``, depending on the subject and regular expression
- pattern (same behavior as pre-0.18.0). When ``expand=True`` it
- always returns a ``DataFrame``, which is more consistent and less
- confusing from the perspective of a user. ``expand=True`` is the
- default since version 0.23.0.
+ pattern. When ``expand=True``, it always returns a ``DataFrame``,
+ which is more consistent and less confusing from the perspective of a user.
+ ``expand=True`` has been the default since version 0.23.0.
The ``extract`` method accepts a `regular expression
`__ with at least one
@@ -468,8 +467,6 @@ Extract all matches in each subject (extractall)
.. _text.extractall:
-.. versionadded:: 0.18.0
-
Unlike ``extract`` (which returns only the first match),
.. ipython:: python
@@ -509,8 +506,6 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as
``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` which has the
same result as a ``Series.str.extractall`` with a default index (starts from 0).
-.. versionadded:: 0.19.0
-
.. ipython:: python
pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
@@ -518,7 +513,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0).
pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups)
-Testing for Strings that Match or Contain a Pattern
+Testing for strings that match or contain a pattern
---------------------------------------------------
You can check whether elements contain a pattern:
@@ -547,7 +542,7 @@ an extra ``na`` argument so missing values can be considered True or False:
.. _text.indicator:
-Creating Indicator Variables
+Creating indicator variables
----------------------------
You can extract dummy variables from string columns.
@@ -560,8 +555,6 @@ For example if they are separated by a ``'|'``:
String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``.
-.. versionadded:: 0.18.1
-
.. ipython:: python
idx = pd.Index(['a', 'a|b', np.nan, 'a|c'])
@@ -569,7 +562,7 @@ String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``.
See also :func:`~pandas.get_dummies`.
-Method Summary
+Method summary
--------------
.. _text.summary:
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
index 40a8fd3101409..3e46140d79b8e 100644
--- a/doc/source/user_guide/timedeltas.rst
+++ b/doc/source/user_guide/timedeltas.rst
@@ -5,7 +5,7 @@
.. _timedeltas.timedeltas:
***********
-Time Deltas
+Time deltas
***********
Timedeltas are differences in times, expressed in difference units, e.g. days, hours, minutes,
@@ -229,7 +229,7 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob
.. _timedeltas.timedeltas_convert:
-Frequency Conversion
+Frequency conversion
--------------------
Timedelta Series, ``TimedeltaIndex``, and ``Timedelta`` scalars can be converted to other 'frequencies' by dividing by another timedelta,
@@ -360,7 +360,7 @@ inferred frequency upon creation:
pd.TimedeltaIndex(['0 days', '10 days', '20 days'], freq='infer')
-Generating Ranges of Time Deltas
+Generating ranges of time deltas
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Similar to :func:`date_range`, you can construct regular ranges of a ``TimedeltaIndex``
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index f559b0d073320..0894edd69c2ae 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -3,7 +3,7 @@
{{ header }}
********************************
-Time Series / Date functionality
+Time series / date functionality
********************************
pandas contains extensive capabilities and features for working with time series data for all domains.
@@ -183,7 +183,7 @@ future releases.
.. _timeseries.converting:
-Converting to Timestamps
+Converting to timestamps
------------------------
To convert a :class:`Series` or list-like object of date-like objects e.g. strings,
@@ -235,7 +235,7 @@ inferred frequency upon creation:
pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer')
-Providing a Format Argument
+Providing a format argument
~~~~~~~~~~~~~~~~~~~~~~~~~~~
In addition to the required datetime string, a ``format`` argument can be passed to ensure specific parsing.
@@ -252,11 +252,9 @@ option, see the Python `datetime documentation`_.
.. _datetime documentation: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
-Assembling Datetime from Multiple DataFrame Columns
+Assembling datetime from multiple DataFrame columns
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. versionadded:: 0.18.1
-
You can also pass a ``DataFrame`` of integer or string columns to assemble into a ``Series`` of ``Timestamps``.
.. ipython:: python
@@ -279,7 +277,7 @@ You can pass only the columns that you need to assemble.
* required: ``year``, ``month``, ``day``
* optional: ``hour``, ``minute``, ``second``, ``millisecond``, ``microsecond``, ``nanosecond``
-Invalid Data
+Invalid data
~~~~~~~~~~~~
The default behavior, ``errors='raise'``, is to raise when unparseable:
@@ -304,7 +302,7 @@ Pass ``errors='coerce'`` to convert unparseable data to ``NaT`` (not a time):
.. _timeseries.converting.epoch:
-Epoch Timestamps
+Epoch timestamps
~~~~~~~~~~~~~~~~
pandas supports converting integer or float epoch times to ``Timestamp`` and
@@ -356,7 +354,7 @@ as timezone-naive timestamps and then localize to the appropriate timezone:
.. _timeseries.converting.epoch_inverse:
-From Timestamps to Epoch
+From timestamps to epoch
~~~~~~~~~~~~~~~~~~~~~~~~
To invert the operation from above, namely, to convert from a ``Timestamp`` to a 'unix' epoch:
@@ -396,7 +394,7 @@ Commonly called 'unix epoch' or POSIX time.
.. _timeseries.daterange:
-Generating Ranges of Timestamps
+Generating ranges of timestamps
-------------------------------
To generate an index with timestamps, you can use either the ``DatetimeIndex`` or
@@ -471,19 +469,9 @@ resulting ``DatetimeIndex``:
.. _timeseries.custom-freq-ranges:
-Custom Frequency Ranges
+Custom frequency ranges
~~~~~~~~~~~~~~~~~~~~~~~
-.. warning::
-
- This functionality was originally exclusive to ``cdate_range``, which is
- deprecated as of version 0.21.0 in favor of ``bdate_range``. Note that
- ``cdate_range`` only utilizes the ``weekmask`` and ``holidays`` parameters
- when custom business day, 'C', is passed as the frequency string. Support has
- been expanded with ``bdate_range`` to work with any custom frequency string.
-
-.. versionadded:: 0.21.0
-
``bdate_range`` can also generate a range of custom frequency dates by using
the ``weekmask`` and ``holidays`` parameters. These parameters will only be
used if a custom frequency string is passed.
@@ -504,7 +492,7 @@ used if a custom frequency string is passed.
.. _timeseries.timestamp-limits:
-Timestamp Limitations
+Timestamp limitations
---------------------
Since pandas represents timestamps in nanosecond resolution, the time span that
@@ -561,7 +549,7 @@ intelligent functionality like selection, slicing, etc.
.. _timeseries.partialindexing:
-Partial String Indexing
+Partial string indexing
~~~~~~~~~~~~~~~~~~~~~~~
Dates and strings that parse to timestamps can be passed as indexing parameters:
@@ -619,8 +607,6 @@ We are stopping on the included end-point as it is part of the index:
dft['2013-1-15':'2013-1-15 12:30:00']
-.. versionadded:: 0.18.0
-
``DatetimeIndex`` partial string indexing also works on a ``DataFrame`` with a ``MultiIndex``:
.. ipython:: python
@@ -648,7 +634,7 @@ Slicing with string indexing also honors UTC offset.
.. _timeseries.slice_vs_exact_match:
-Slice vs. Exact Match
+Slice vs. exact match
~~~~~~~~~~~~~~~~~~~~~
.. versionchanged:: 0.20.0
@@ -719,7 +705,7 @@ Note also that ``DatetimeIndex`` resolution cannot be less precise than day.
series_monthly['2011-12'] # returns Series
-Exact Indexing
+Exact indexing
~~~~~~~~~~~~~~
As discussed in previous section, indexing a ``DatetimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the resolution of the index. In contrast, indexing with ``Timestamp`` or ``datetime`` objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*.
@@ -738,7 +724,7 @@ With no defaults.
datetime.datetime(2013, 2, 28, 10, 12, 0)]
-Truncating & Fancy Indexing
+Truncating & fancy indexing
~~~~~~~~~~~~~~~~~~~~~~~~~~~
A :meth:`~DataFrame.truncate` convenience function is provided that is similar
@@ -761,37 +747,9 @@ regularity will result in a ``DatetimeIndex``, although frequency is lost:
ts2[[0, 2, 6]].index
-.. _timeseries.iterating-label:
-
-Iterating through groups
-------------------------
-
-With the ``Resampler`` object in hand, iterating through the grouped data is very
-natural and functions similarly to :py:func:`itertools.groupby`:
-
-.. ipython:: python
-
- small = pd.Series(
- range(6),
- index=pd.to_datetime(['2017-01-01T00:00:00',
- '2017-01-01T00:30:00',
- '2017-01-01T00:31:00',
- '2017-01-01T01:00:00',
- '2017-01-01T03:00:00',
- '2017-01-01T03:05:00'])
- )
- resampled = small.resample('H')
-
- for name, group in resampled:
- print("Group: ", name)
- print("-" * 27)
- print(group, end="\n\n")
-
-See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more.
-
.. _timeseries.components:
-Time/Date Components
+Time/date components
--------------------
There are several time/date properties that one can access from ``Timestamp`` or a collection of timestamps like a ``DatetimeIndex``.
@@ -833,7 +791,7 @@ on :ref:`.dt accessors`.
.. _timeseries.offsets:
-DateOffset Objects
+DateOffset objects
------------------
In the preceding examples, frequency strings (e.g. ``'D'``) were used to specify
@@ -950,7 +908,7 @@ in the operation).
.. _relativedelta documentation: https://dateutil.readthedocs.io/en/stable/relativedelta.html
-Parametric Offsets
+Parametric offsets
~~~~~~~~~~~~~~~~~~
Some of the offsets can be "parameterized" when created to result in different
@@ -986,7 +944,7 @@ Another example is parameterizing ``YearEnd`` with the specific ending month:
.. _timeseries.offsetseries:
-Using Offsets with ``Series`` / ``DatetimeIndex``
+Using offsets with ``Series`` / ``DatetimeIndex``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Offsets can be used with either a ``Series`` or ``DatetimeIndex`` to
@@ -1025,7 +983,7 @@ calculate significantly slower and will show a ``PerformanceWarning``
.. _timeseries.custombusinessdays:
-Custom Business Days
+Custom business days
~~~~~~~~~~~~~~~~~~~~
The ``CDay`` or ``CustomBusinessDay`` class provides a parametric
@@ -1099,7 +1057,7 @@ in the usual way.
.. _timeseries.businesshour:
-Business Hour
+Business hour
~~~~~~~~~~~~~
The ``BusinessHour`` class provides a business hour representation on ``BusinessDay``,
@@ -1161,7 +1119,7 @@ Valid business hours are distinguished by whether it started from valid ``Busine
pd.Timestamp('2014-08-01 17:00') + bh
pd.Timestamp('2014-08-01 23:00') + bh
- # Although 2014-08-02 is Satuaday,
+ # Although 2014-08-02 is Saturday,
# it is valid because it starts from 08-01 (Friday).
pd.Timestamp('2014-08-02 04:00') + bh
@@ -1200,11 +1158,9 @@ following subsection.
.. _timeseries.custombusinesshour:
-Custom Business Hour
+Custom business hour
~~~~~~~~~~~~~~~~~~~~
-.. versionadded:: 0.18.1
-
The ``CustomBusinessHour`` is a mixture of ``BusinessHour`` and ``CustomBusinessDay`` which
allows you to specify arbitrary holidays. ``CustomBusinessHour`` works as the same
as ``BusinessHour`` except that it skips specified custom holidays.
@@ -1233,7 +1189,7 @@ You can use keyword arguments supported by either ``BusinessHour`` and ``CustomB
.. _timeseries.offset_aliases:
-Offset Aliases
+Offset aliases
~~~~~~~~~~~~~~
A number of string aliases are given to useful common time series
@@ -1271,7 +1227,7 @@ frequencies. We will refer to these aliases as *offset aliases*.
"U, us", "microseconds"
"N", "nanoseconds"
-Combining Aliases
+Combining aliases
~~~~~~~~~~~~~~~~~
As we have seen previously, the alias and the offset instance are fungible in
@@ -1291,7 +1247,7 @@ You can combine together day and intraday offsets:
pd.date_range(start, periods=10, freq='1D10U')
-Anchored Offsets
+Anchored offsets
~~~~~~~~~~~~~~~~
For some frequencies you can specify an anchoring suffix:
@@ -1336,7 +1292,7 @@ These can be used as arguments to ``date_range``, ``bdate_range``, constructors
for ``DatetimeIndex``, as well as various other timeseries-related functions
in pandas.
-Anchored Offset Semantics
+Anchored offset semantics
~~~~~~~~~~~~~~~~~~~~~~~~~
For those offsets that are anchored to the start or end of specific
@@ -1384,7 +1340,7 @@ it is rolled forward to the next anchor point.
.. _timeseries.holiday:
-Holidays / Holiday Calendars
+Holidays / holiday calendars
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Holidays and calendars provide a simple way to define holiday rules to be used
@@ -1484,7 +1440,7 @@ or calendars with additional rules.
Time Series-Related Instance Methods
------------------------------------
-Shifting / Lagging
+Shifting / lagging
~~~~~~~~~~~~~~~~~~
One may want to *shift* or *lag* the values in a time series back and forward in
@@ -1517,7 +1473,7 @@ changes all the dates in the index by a specified number of offsets:
Note that with ``tshift``, the leading entry is no longer NaN because the data
is not being realigned.
-Frequency Conversion
+Frequency conversion
~~~~~~~~~~~~~~~~~~~~
The primary function for changing frequencies is the :meth:`~Series.asfreq`
@@ -1539,13 +1495,13 @@ method for any gaps that may appear after the frequency conversion.
ts.asfreq(pd.offsets.BDay(), method='pad')
-Filling Forward / Backward
+Filling forward / backward
~~~~~~~~~~~~~~~~~~~~~~~~~~
Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is
documented in the :ref:`missing data section `.
-Converting to Python Datetimes
+Converting to Python datetimes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``DatetimeIndex`` can be converted to an array of Python native
@@ -1556,11 +1512,6 @@ Converting to Python Datetimes
Resampling
----------
-.. warning::
-
- The interface to ``.resample`` has changed in 0.18.0 to be more groupby-like and hence more flexible.
- See the :ref:`whatsnew docs ` for a comparison with prior versions.
-
Pandas has a simple, powerful, and efficient functionality for performing
resampling operations during frequency conversion (e.g., converting secondly
data into 5-minutely data). This is extremely common in, but not limited to,
@@ -1570,8 +1521,8 @@ financial applications.
on each of its groups. See some :ref:`cookbook examples ` for
some advanced strategies.
-Starting in version 0.18.1, the ``resample()`` function can be used directly from
-``DataFrameGroupBy`` objects, see the :ref:`groupby docs `.
+The ``resample()`` method can be used directly from ``DataFrameGroupBy`` objects;
+see the :ref:`groupby docs `.
.. note::
@@ -1628,24 +1579,32 @@ labels.
ts.resample('5Min', label='left', loffset='1s').mean()
-.. note::
+.. warning::
- The default values for ``label`` and ``closed`` is 'left' for all
+ The default values for ``label`` and ``closed`` are '**left**' for all
frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W'
which all have a default of 'right'.
+ This might unintentionally lead to looking ahead, where the value for a later
+ time is pulled back to a previous time as in the following example with
+ the :class:`~pandas.tseries.offsets.BusinessDay` frequency:
+
.. ipython:: python
- rng2 = pd.date_range('1/1/2012', end='3/31/2012', freq='D')
- ts2 = pd.Series(range(len(rng2)), index=rng2)
+ s = pd.date_range('2000-01-01', '2000-01-05').to_series()
+ s.iloc[2] = pd.NaT
+ s.dt.weekday_name
- # default: label='right', closed='right'
- ts2.resample('M').max()
+ # default: label='left', closed='left'
+ s.resample('B').last().dt.weekday_name
- # default: label='left', closed='left'
- ts2.resample('SM').max()
+ Notice how the value for Sunday got pulled back to the previous Friday.
+ To get the behavior where the value for Sunday is pushed to Monday, use
+ instead
- ts2.resample('SM', label='right', closed='right').max()
+ .. ipython:: python
+
+ s.resample('B', label='right', closed='right').last().dt.weekday_name
The ``axis`` parameter can be set to 0 or 1 and allows you to resample the
specified axis for a ``DataFrame``.
@@ -1674,7 +1633,7 @@ For upsampling, you can specify a way to upsample and the ``limit`` parameter to
ts[:2].resample('250L').ffill(limit=2)
-Sparse Resampling
+Sparse resampling
~~~~~~~~~~~~~~~~~
Sparse timeseries are the ones where you have a lot fewer points relative
@@ -1796,10 +1755,38 @@ level of ``MultiIndex``, its name or location can be passed to the
df.resample('M', level='d').sum()
+.. _timeseries.iterating-label:
+
+Iterating through groups
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+With the ``Resampler`` object in hand, iterating through the grouped data is very
+natural and functions similarly to :py:func:`itertools.groupby`:
+
+.. ipython:: python
+
+ small = pd.Series(
+ range(6),
+ index=pd.to_datetime(['2017-01-01T00:00:00',
+ '2017-01-01T00:30:00',
+ '2017-01-01T00:31:00',
+ '2017-01-01T01:00:00',
+ '2017-01-01T03:00:00',
+ '2017-01-01T03:05:00'])
+ )
+ resampled = small.resample('H')
+
+ for name, group in resampled:
+ print("Group: ", name)
+ print("-" * 27)
+ print(group, end="\n\n")
+
+See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more.
+
.. _timeseries.periods:
-Time Span Representation
+Time span representation
------------------------
Regular intervals of time are represented by ``Period`` objects in pandas while
@@ -1931,11 +1918,9 @@ objects:
.. _timeseries.period_dtype:
-Period Dtypes
+Period dtypes
~~~~~~~~~~~~~
-.. versionadded:: 0.19.0
-
``PeriodIndex`` has a custom ``period`` dtype. This is a pandas extension
dtype similar to the :ref:`timezone aware dtype ` (``datetime64[ns, tz]``).
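A small sketch of where the dtype shows up (the range below is arbitrary):

.. code-block:: python

    pi = pd.period_range('2016-01-01', periods=3, freq='M')
    pi.dtype             # period[M]
    pd.Series(pi).dtype  # a Series holding these values carries the same dtype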
@@ -1966,7 +1951,7 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th
dti.astype('period[M]')
-PeriodIndex Partial String Indexing
+PeriodIndex partial string indexing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodIndex``, in the same manner as ``DatetimeIndex``. For details, refer to :ref:`DatetimeIndex Partial String Indexing `.
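For instance, a brief sketch with monthly periods (the values are arbitrary):

.. code-block:: python

    ps = pd.Series(range(12), index=pd.period_range('2013-01', periods=12, freq='M'))

    ps['2013-03':'2013-05']  # slice with partial period strings
    ps['2013']               # everything that falls within 2013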
@@ -1999,7 +1984,7 @@ As with ``DatetimeIndex``, the endpoints will be included in the result. The exa
dfp['2013-01-01 10H':'2013-01-01 11H']
-Frequency Conversion and Resampling with PeriodIndex
+Frequency conversion and resampling with PeriodIndex
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The frequency of ``Period`` and ``PeriodIndex`` can be converted via the ``asfreq``
method. Let's start with the fiscal year 2011, ending in December:
@@ -2070,7 +2055,7 @@ frequencies ``Q-JAN`` through ``Q-DEC``.
.. _timeseries.interchange:
-Converting Between Representations
+Converting between representations
----------------------------------
Timestamped data can be converted to PeriodIndex-ed data using ``to_period``
@@ -2114,7 +2099,7 @@ the quarter end:
.. _timeseries.oob:
-Representing Out-of-Bounds Spans
+Representing out-of-bounds spans
--------------------------------
If you have data that is outside of the ``Timestamp`` bounds, see :ref:`Timestamp limitations `,
@@ -2148,7 +2133,7 @@ These can easily be converted to a ``PeriodIndex``:
.. _timeseries.timezone:
-Time Zone Handling
+Time zone handling
------------------
pandas provides rich support for working with timestamps in different time
@@ -2156,7 +2141,7 @@ zones using the ``pytz`` and ``dateutil`` libraries or class:`datetime.timezone`
objects from the standard library.
-Working with Time Zones
+Working with time zones
~~~~~~~~~~~~~~~~~~~~~~~
By default, pandas objects are time zone unaware:
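For illustration (the dates and zone names below are arbitrary), a naive range
can be localized and then converted:

.. code-block:: python

    rng = pd.date_range('2012-03-06', periods=3, freq='D')
    rng.tz is None                     # True: no time zone attached

    rng_utc = rng.tz_localize('UTC')   # attach a zone to naive timestamps
    rng_utc.tz_convert('US/Eastern')   # convert between zones afterwards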
@@ -2312,7 +2297,7 @@ To remove time zone information, use ``tz_localize(None)`` or ``tz_convert(None)
.. _timeseries.timezone_ambiguous:
-Ambiguous Times when Localizing
+Ambiguous times when localizing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``tz_localize`` may not be able to determine the UTC offset of a timestamp
@@ -2346,7 +2331,7 @@ Handle these ambiguous times by specifying the following.
.. _timeseries.timezone_nonexistent:
-Nonexistent Times when Localizing
+Nonexistent times when localizing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A DST transition may also shift the local time ahead by 1 hour creating nonexistent
@@ -2384,7 +2369,7 @@ Transform nonexistent times to ``NaT`` or shift the times.
.. _timeseries.timezone_series:
-Time Zone Series Operations
+Time zone series operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~
A :class:`Series` with time zone **naive** values is
diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst
index 2448d0e5d9930..fa16b2f216610 100644
--- a/doc/source/user_guide/visualization.rst
+++ b/doc/source/user_guide/visualization.rst
@@ -23,7 +23,7 @@ libraries that go beyond the basics documented here.
.. _visualization.basic:
-Basic Plotting: ``plot``
+Basic plotting: ``plot``
------------------------
We will demonstrate the basics, see the :ref:`cookbook` for
@@ -97,7 +97,7 @@ You can plot one column versus another using the `x` and `y` keywords in
.. _visualization.other:
-Other Plots
+Other plots
-----------
Plotting methods allow for a handful of plot styles other than the
@@ -311,7 +311,7 @@ The ``by`` keyword can be specified to plot grouped histograms:
.. _visualization.box:
-Box Plots
+Box plots
~~~~~~~~~
Boxplot can be drawn calling :meth:`Series.plot.box` and :meth:`DataFrame.plot.box`,
@@ -438,10 +438,6 @@ columns:
.. _visualization.box.return:
-.. warning::
-
- The default changed from ``'dict'`` to ``'axes'`` in version 0.19.0.
-
In ``boxplot``, the return type can be controlled by the ``return_type`` keyword. The valid choices are ``{"axes", "dict", "both", None}``.
Faceting, created by ``DataFrame.boxplot`` with the ``by``
keyword, will affect the output type as well:
@@ -495,7 +491,7 @@ then by the numeric columns.
.. _visualization.area_plot:
-Area Plot
+Area plot
~~~~~~~~~
You can create area plots with :meth:`Series.plot.area` and :meth:`DataFrame.plot.area`.
@@ -531,7 +527,7 @@ To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5
.. _visualization.scatter:
-Scatter Plot
+Scatter plot
~~~~~~~~~~~~
Scatter plot can be drawn by using the :meth:`DataFrame.plot.scatter` method.
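As a minimal sketch (random data, column names chosen arbitrarily):

.. code-block:: python

    import numpy as np

    df = pd.DataFrame(np.random.rand(50, 2), columns=['a', 'b'])
    df.plot.scatter(x='a', y='b')  # each row becomes one point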
@@ -599,7 +595,7 @@ See the :meth:`scatter ` method and the
.. _visualization.hexbin:
-Hexagonal Bin Plot
+Hexagonal bin plot
~~~~~~~~~~~~~~~~~~
You can create hexagonal bin plots with :meth:`DataFrame.plot.hexbin`.
@@ -762,7 +758,7 @@ See the `matplotlib pie documentation `). These can be used
to control additional styling, beyond what pandas provides.
-Controlling the Legend
+Controlling the legend
~~~~~~~~~~~~~~~~~~~~~~
You may set the ``legend`` argument to ``False`` to hide the legend, which is
@@ -1140,7 +1136,7 @@ You may pass ``logy`` to get a log-scale Y axis.
See also the ``logx`` and ``loglog`` keyword arguments.
-Plotting on a Secondary Y-axis
+Plotting on a secondary y-axis
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To plot data on a secondary y-axis, use the ``secondary_y`` keyword:
@@ -1152,10 +1148,10 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword:
.. ipython:: python
- df.A.plot()
+ df['A'].plot()
@savefig series_plot_secondary_y.png
- df.B.plot(secondary_y=True, style='g')
+ df['B'].plot(secondary_y=True, style='g')
.. ipython:: python
:suppress:
@@ -1194,7 +1190,7 @@ with "(right)" in the legend. To turn off the automatic marking, use the
plt.close('all')
-Suppressing Tick Resolution Adjustment
+Suppressing tick resolution adjustment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pandas includes automatic tick resolution adjustment for regular frequency
@@ -1209,7 +1205,7 @@ Here is the default behavior, notice how the x-axis tick labeling is performed:
plt.figure()
@savefig ser_plot_suppress.png
- df.A.plot()
+ df['A'].plot()
.. ipython:: python
:suppress:
@@ -1223,7 +1219,7 @@ Using the ``x_compat`` parameter, you can suppress this behavior:
plt.figure()
@savefig ser_plot_suppress_parm.png
- df.A.plot(x_compat=True)
+ df['A'].plot(x_compat=True)
.. ipython:: python
:suppress:
@@ -1239,16 +1235,16 @@ in ``pandas.plotting.plot_params`` can be used in a `with statement`:
@savefig ser_plot_suppress_context.png
with pd.plotting.plot_params.use('x_compat', True):
- df.A.plot(color='r')
- df.B.plot(color='g')
- df.C.plot(color='b')
+ df['A'].plot(color='r')
+ df['B'].plot(color='g')
+ df['C'].plot(color='b')
.. ipython:: python
:suppress:
plt.close('all')
-Automatic Date Tick Adjustment
+Automatic date tick adjustment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. versionadded:: 0.20.0
@@ -1276,7 +1272,7 @@ with the ``subplots`` keyword:
plt.close('all')
-Using Layout and Targeting Multiple Axes
+Using layout and targeting multiple axes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The layout of subplots can be specified by the ``layout`` keyword. It can accept
@@ -1377,7 +1373,7 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a
.. _visualization.errorbars:
-Plotting With Error Bars
+Plotting with error bars
~~~~~~~~~~~~~~~~~~~~~~~~
Plotting with error bars is supported in :meth:`DataFrame.plot` and :meth:`Series.plot`.
@@ -1423,7 +1419,7 @@ Here is an example of one way to easily plot group means with standard deviation
.. _visualization.table:
-Plotting Tables
+Plotting tables
~~~~~~~~~~~~~~~
Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and :meth:`Series.plot` with a ``table`` keyword. The ``table`` keyword can accept ``bool``, :class:`DataFrame` or :class:`Series`. The simple way to draw a table is to specify ``table=True``. Data will be transposed to meet matplotlib's default layout.
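A sketch of the simple case, assuming a matplotlib ``Axes`` is prepared first
(the data is random and only illustrative):

.. code-block:: python

    import matplotlib.pyplot as plt
    import numpy as np

    fig, ax = plt.subplots(1, 1)
    df = pd.DataFrame(np.random.rand(5, 3), columns=['a', 'b', 'c'])
    ax.get_xaxis().set_visible(False)  # leave room for the table under the plot
    df.plot(table=True, ax=ax)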
@@ -1632,18 +1628,3 @@ when plotting a large number of points.
:suppress:
plt.close('all')
-
-
-.. _rplot:
-
-
-Trellis plotting interface
---------------------------
-
-.. warning::
-
- The ``rplot`` trellis plotting interface has been **removed**. Please use
- external packages like `seaborn `_ for
- similar but more refined functionality and refer to our 0.18.1 documentation
- `here `__
- for how to convert to using it.
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 6c529d2e2e5f3..fe80cc8bb959a 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -10,6 +10,24 @@ This is the list of changes to pandas between each release. For full details,
see the commit logs at http://github.com/pandas-dev/pandas. For install and
upgrade instructions, see :ref:`install`.
+Version 1.0
+-----------
+
+.. toctree::
+ :maxdepth: 2
+
+ v1.0.0
+
+Version 0.25
+------------
+
+.. toctree::
+ :maxdepth: 2
+
+ v0.25.2
+ v0.25.1
+ v0.25.0
+
Version 0.24
------------
diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst
index 9d497f2fc658d..2e0442364b2f3 100644
--- a/doc/source/whatsnew/v0.10.0.rst
+++ b/doc/source/whatsnew/v0.10.0.rst
@@ -255,7 +255,7 @@ Convenience methods ``ffill`` and ``bfill`` have been added:
New features
~~~~~~~~~~~~
-Wide DataFrame Printing
+Wide DataFrame printing
~~~~~~~~~~~~~~~~~~~~~~~
Instead of printing the summary information, pandas now splits the string
@@ -290,7 +290,7 @@ The width of each line can be changed via 'line_width' (80 by default):
wide_frame
-Updated PyTables Support
+Updated PyTables support
~~~~~~~~~~~~~~~~~~~~~~~~
:ref:`Docs ` for PyTables ``Table`` format & several enhancements to the api. Here is a taste of what to expect.
@@ -490,7 +490,7 @@ Updated PyTables Support
however, query terms using the prior (undocumented) methodology are unsupported. You must read in the entire
file and write it out using the new format to take advantage of the updates.
-N Dimensional Panels (Experimental)
+N dimensional Panels (experimental)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Adding experimental support for Panel4D and factory functions to create n-dimensional named panels.
@@ -498,7 +498,7 @@ Here is a taste of what to expect.
.. code-block:: ipython
- In [58]: p4d = Panel4D(randn(2, 2, 5, 4),
+ In [58]: p4d = Panel4D(np.random.randn(2, 2, 5, 4),
....: labels=['Label1','Label2'],
....: items=['Item1', 'Item2'],
....: major_axis=date_range('1/1/2000', periods=5),
diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst
index b5b2b889732cd..c4251f70d85b6 100644
--- a/doc/source/whatsnew/v0.10.1.rst
+++ b/doc/source/whatsnew/v0.10.1.rst
@@ -89,7 +89,7 @@ You can now store ``datetime64`` in data columns
store.append('df_mixed', df_mixed)
df_mixed1 = store.select('df_mixed')
df_mixed1
- df_mixed1.get_dtype_counts()
+ df_mixed1.dtypes.value_counts()
You can pass ``columns`` keyword to select to filter a list of the return
columns, this is equivalent to passing a
@@ -170,7 +170,7 @@ combined result, by using ``where`` on a selector table.
df_mt, selector='df1_mt')
store
- # indiviual tables were created
+ # individual tables were created
store.select('df1_mt')
store.select('df2_mt')
diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst
index c919698d15689..148ee349b049c 100644
--- a/doc/source/whatsnew/v0.11.0.rst
+++ b/doc/source/whatsnew/v0.11.0.rst
@@ -20,7 +20,7 @@ of useful recipes in pandas (and that we want contributions!).
There are several libraries that are now :ref:`Recommended Dependencies `
-Selection Choices
+Selection choices
~~~~~~~~~~~~~~~~~
Starting in 0.11.0, object selection has had a number of user-requested additions in
@@ -56,7 +56,7 @@ three types of multi-axis indexing.
See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical `.
-Selection Deprecations
+Selection deprecations
~~~~~~~~~~~~~~~~~~~~~~
Starting in version 0.11.0, these methods *may* be deprecated in future versions.
@@ -88,7 +88,7 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe
df3
df3.dtypes
-Dtype Conversion
+Dtype conversion
~~~~~~~~~~~~~~~~
This is lower-common-denominator upcasting, meaning you get the dtype which can accommodate all of the types
@@ -103,34 +103,61 @@ Conversion
df3.astype('float32').dtypes
-Mixed Conversion
+Mixed conversion
-.. ipython:: python
- :okwarning:
+.. code-block:: ipython
- df3['D'] = '1.'
- df3['E'] = '1'
- df3.convert_objects(convert_numeric=True).dtypes
+ In [12]: df3['D'] = '1.'
- # same, but specific dtype conversion
- df3['D'] = df3['D'].astype('float16')
- df3['E'] = df3['E'].astype('int32')
- df3.dtypes
+ In [13]: df3['E'] = '1'
-Forcing Date coercion (and setting ``NaT`` when not datelike)
+ In [14]: df3.convert_objects(convert_numeric=True).dtypes
+ Out[14]:
+ A float32
+ B float64
+ C float64
+ D float64
+ E int64
+ dtype: object
-.. ipython:: python
- :okwarning:
+ # same, but specific dtype conversion
+ In [15]: df3['D'] = df3['D'].astype('float16')
+
+ In [16]: df3['E'] = df3['E'].astype('int32')
+
+ In [17]: df3.dtypes
+ Out[17]:
+ A float32
+ B float64
+ C float64
+ D float16
+ E int32
+ dtype: object
- import datetime
- s = pd.Series([datetime.datetime(2001, 1, 1, 0, 0), 'foo', 1.0, 1,
- pd.Timestamp('20010104'), '20010105'], dtype='O')
- s.convert_objects(convert_dates='coerce')
+Forcing date coercion (and setting ``NaT`` when not datelike)
-Dtype Gotchas
+.. code-block:: ipython
+
+ In [18]: import datetime
+
+ In [19]: s = pd.Series([datetime.datetime(2001, 1, 1, 0, 0), 'foo', 1.0, 1,
+ ....: pd.Timestamp('20010104'), '20010105'], dtype='O')
+ ....:
+
+ In [20]: s.convert_objects(convert_dates='coerce')
+ Out[20]:
+ 0 2001-01-01
+ 1 NaT
+ 2 NaT
+ 3 NaT
+ 4 2001-01-04
+ 5 2001-01-05
+ dtype: datetime64[ns]
+
+Dtype gotchas
~~~~~~~~~~~~~
-**Platform Gotchas**
+**Platform gotchas**
Starting in 0.11.0, construction of DataFrame/Series will use default dtypes of ``int64`` and ``float64``,
*regardless of platform*. This is not an apparent change from earlier versions of pandas. If you specify
@@ -138,44 +165,122 @@ dtypes, they *WILL* be respected, however (:issue:`2837`)
The following will all result in ``int64`` dtypes
-.. ipython:: python
+.. code-block:: ipython
- pd.DataFrame([1, 2], columns=['a']).dtypes
- pd.DataFrame({'a': [1, 2]}).dtypes
- pd.DataFrame({'a': 1}, index=range(2)).dtypes
+ In [21]: pd.DataFrame([1, 2], columns=['a']).dtypes
+ Out[21]:
+ a int64
+ dtype: object
+
+ In [22]: pd.DataFrame({'a': [1, 2]}).dtypes
+ Out[22]:
+ a int64
+ dtype: object
+
+ In [23]: pd.DataFrame({'a': 1}, index=range(2)).dtypes
+ Out[23]:
+ a int64
+ dtype: object
Keep in mind that ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platforms!
-**Upcasting Gotchas**
+**Upcasting gotchas**
Performing indexing operations on integer type data can easily upcast the data.
The dtype of the input data will be preserved in cases where ``nans`` are not introduced.
-.. ipython:: python
-
- dfi = df3.astype('int32')
- dfi['D'] = dfi['D'].astype('int64')
- dfi
- dfi.dtypes
-
- casted = dfi[dfi > 0]
- casted
- casted.dtypes
+.. code-block:: ipython
+
+ In [24]: dfi = df3.astype('int32')
+
+ In [25]: dfi['D'] = dfi['D'].astype('int64')
+
+ In [26]: dfi
+ Out[26]:
+ A B C D E
+ 0 0 0 0 1 1
+ 1 -2 0 1 1 1
+ 2 -2 0 2 1 1
+ 3 0 -1 3 1 1
+ 4 1 0 4 1 1
+ 5 0 0 5 1 1
+ 6 0 -1 6 1 1
+ 7 0 0 7 1 1
+
+ In [27]: dfi.dtypes
+ Out[27]:
+ A int32
+ B int32
+ C int32
+ D int64
+ E int32
+ dtype: object
+
+ In [28]: casted = dfi[dfi > 0]
+
+ In [29]: casted
+ Out[29]:
+ A B C D E
+ 0 NaN NaN NaN 1 1
+ 1 NaN NaN 1.0 1 1
+ 2 NaN NaN 2.0 1 1
+ 3 NaN NaN 3.0 1 1
+ 4 1.0 NaN 4.0 1 1
+ 5 NaN NaN 5.0 1 1
+ 6 NaN NaN 6.0 1 1
+ 7 NaN NaN 7.0 1 1
+
+ In [30]: casted.dtypes
+ Out[30]:
+ A float64
+ B float64
+ C float64
+ D int64
+ E int32
+ dtype: object
While float dtypes are unchanged.
-.. ipython:: python
-
- df4 = df3.copy()
- df4['A'] = df4['A'].astype('float32')
- df4.dtypes
-
- casted = df4[df4 > 0]
- casted
- casted.dtypes
-
-Datetimes Conversion
+.. code-block:: ipython
+
+ In [31]: df4 = df3.copy()
+
+ In [32]: df4['A'] = df4['A'].astype('float32')
+
+ In [33]: df4.dtypes
+ Out[33]:
+ A float32
+ B float64
+ C float64
+ D float16
+ E int32
+ dtype: object
+
+ In [34]: casted = df4[df4 > 0]
+
+ In [35]: casted
+ Out[35]:
+ A B C D E
+ 0 NaN NaN NaN 1.0 1
+ 1 NaN 0.567020 1.0 1.0 1
+ 2 NaN 0.276232 2.0 1.0 1
+ 3 NaN NaN 3.0 1.0 1
+ 4 1.933792 NaN 4.0 1.0 1
+ 5 NaN 0.113648 5.0 1.0 1
+ 6 NaN NaN 6.0 1.0 1
+ 7 NaN 0.524988 7.0 1.0 1
+
+ In [36]: casted.dtypes
+ Out[36]:
+ A float32
+ B float64
+ C float64
+ D float16
+ E int32
+ dtype: object
+
+Datetimes conversion
~~~~~~~~~~~~~~~~~~~~
Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value,
@@ -191,7 +296,7 @@ Furthermore ``datetime64[ns]`` columns are created by default, when passed datet
df
# datetime64[ns] out of the box
- df.get_dtype_counts()
+ df.dtypes.value_counts()
# use the traditional nan, which is mapped to NaT internally
df.loc[df.index[2:4], ['A', 'timestamp']] = np.nan
@@ -272,15 +377,31 @@ Enhancements
- ``Squeeze`` to possibly remove length 1 dimensions from an object.
- .. ipython:: python
- :okwarning:
-
- p = pd.Panel(np.random.randn(3, 4, 4), items=['ItemA', 'ItemB', 'ItemC'],
- major_axis=pd.date_range('20010102', periods=4),
- minor_axis=['A', 'B', 'C', 'D'])
- p
- p.reindex(items=['ItemA']).squeeze()
- p.reindex(items=['ItemA'], minor=['B']).squeeze()
+ .. code-block:: python
+
+ >>> p = pd.Panel(np.random.randn(3, 4, 4), items=['ItemA', 'ItemB', 'ItemC'],
+ ... major_axis=pd.date_range('20010102', periods=4),
+ ... minor_axis=['A', 'B', 'C', 'D'])
+ >>> p
+
+ Dimensions: 3 (items) x 4 (major_axis) x 4 (minor_axis)
+ Items axis: ItemA to ItemC
+ Major_axis axis: 2001-01-02 00:00:00 to 2001-01-05 00:00:00
+ Minor_axis axis: A to D
+
+ >>> p.reindex(items=['ItemA']).squeeze()
+ A B C D
+ 2001-01-02 0.926089 -2.026458 0.501277 -0.204683
+ 2001-01-03 -0.076524 1.081161 1.141361 0.479243
+ 2001-01-04 0.641817 -0.185352 1.824568 0.809152
+ 2001-01-05 0.575237 0.669934 1.398014 -0.399338
+
+ >>> p.reindex(items=['ItemA'], minor=['B']).squeeze()
+ 2001-01-02 -2.026458
+ 2001-01-03 1.081161
+ 2001-01-04 -0.185352
+ 2001-01-05 0.669934
+ Freq: D, Name: B, dtype: float64
- In ``pd.io.data.Options``,
diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
index ff549f10a97c3..0a74d67486715 100644
--- a/doc/source/whatsnew/v0.12.0.rst
+++ b/doc/source/whatsnew/v0.12.0.rst
@@ -177,7 +177,7 @@ API changes
``__repr__``). Plus string safety throughout. Now employed in many places
throughout the pandas library. (:issue:`4090`, :issue:`4092`)
-I/O Enhancements
+I/O enhancements
~~~~~~~~~~~~~~~~
- ``pd.read_html()`` can now parse HTML strings, files or urls and return
@@ -282,7 +282,7 @@ I/O Enhancements
- ``read_csv`` will now throw a more informative error message when a file
contains no columns, e.g., all newline characters
-Other Enhancements
+Other enhancements
~~~~~~~~~~~~~~~~~~
- ``DataFrame.replace()`` now allows regular expressions on contained
@@ -371,7 +371,7 @@ Other Enhancements
is detected (:issue:`4214`)
-Experimental Features
+Experimental features
~~~~~~~~~~~~~~~~~~~~~
- Added experimental ``CustomBusinessDay`` class to support ``DateOffsets``
@@ -398,7 +398,7 @@ Experimental Features
dts = pd.date_range(dt, periods=5, freq=bday_egypt)
print(pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())))
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Plotting functions now raise a ``TypeError`` before trying to plot anything
diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst
index 13a2f879211b3..ab48594ddadab 100644
--- a/doc/source/whatsnew/v0.13.0.rst
+++ b/doc/source/whatsnew/v0.13.0.rst
@@ -203,7 +203,7 @@ API changes
- ``Series.argmin`` and ``Series.argmax`` are now aliased to ``Series.idxmin`` and ``Series.idxmax``. These return the *index* of the
min or max element respectively. Prior to 0.13.0 these would return the position of the min / max element. (:issue:`6214`)
-Prior Version Deprecations/Changes
+Prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These were announced changes in 0.12 or prior that are taking effect as of 0.13.0
@@ -234,7 +234,7 @@ Deprecated in 0.13.0
behavior is the default, but the new behavior is available through the
keyword argument ``as_indexer=True``.
-Indexing API Changes
+Indexing API changes
~~~~~~~~~~~~~~~~~~~~
Prior to 0.13, it was impossible to use a label indexer (``.loc/.ix``) to set a value that
@@ -271,19 +271,41 @@ This is like an ``append`` operation.
A Panel setting operation on an arbitrary axis aligns the input to the Panel
-.. ipython:: python
- :okwarning:
-
- p = pd.Panel(np.arange(16).reshape(2, 4, 2),
- items=['Item1', 'Item2'],
- major_axis=pd.date_range('2001/1/12', periods=4),
- minor_axis=['A', 'B'], dtype='float64')
- p
- p.loc[:, :, 'C'] = pd.Series([30, 32], index=p.items)
- p
- p.loc[:, :, 'C']
-
-Float64Index API Change
+.. code-block:: ipython
+
+ In [20]: p = pd.Panel(np.arange(16).reshape(2, 4, 2),
+ ....: items=['Item1', 'Item2'],
+ ....: major_axis=pd.date_range('2001/1/12', periods=4),
+ ....: minor_axis=['A', 'B'], dtype='float64')
+ ....:
+
+ In [21]: p
+ Out[21]:
+
+ Dimensions: 2 (items) x 4 (major_axis) x 2 (minor_axis)
+ Items axis: Item1 to Item2
+ Major_axis axis: 2001-01-12 00:00:00 to 2001-01-15 00:00:00
+ Minor_axis axis: A to B
+
+ In [22]: p.loc[:, :, 'C'] = pd.Series([30, 32], index=p.items)
+
+ In [23]: p
+ Out[23]:
+
+ Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
+ Items axis: Item1 to Item2
+ Major_axis axis: 2001-01-12 00:00:00 to 2001-01-15 00:00:00
+ Minor_axis axis: A to C
+
+ In [24]: p.loc[:, :, 'C']
+ Out[24]:
+ Item1 Item2
+ 2001-01-12 30.0 32.0
+ 2001-01-13 30.0 32.0
+ 2001-01-14 30.0 32.0
+ 2001-01-15 30.0 32.0
+
+Float64Index API change
~~~~~~~~~~~~~~~~~~~~~~~
- Added a new index type, ``Float64Index``. This will be automatically created when passing floating values in index creation.
@@ -347,7 +369,7 @@ Float64Index API Change
In [3]: pd.Series(range(5))[3.0]
Out[3]: 3
-HDFStore API Changes
+HDFStore API changes
~~~~~~~~~~~~~~~~~~~~
- Query Format Changes. A much more string-like query format is now supported. See :ref:`the docs`.
@@ -446,7 +468,7 @@ HDFStore API Changes
via the option ``io.hdf.dropna_table`` (:issue:`4625`)
- pass through store creation arguments; can be used to support in-memory stores
-DataFrame repr Changes
+DataFrame repr changes
~~~~~~~~~~~~~~~~~~~~~~
The HTML and plain text representations of :class:`DataFrame` now show
@@ -807,6 +829,7 @@ Experimental
Since this is an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release.
.. ipython:: python
+ :okwarning:
df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB'))
df.to_msgpack('foo.msg')
@@ -819,6 +842,7 @@ Experimental
You can pass ``iterator=True`` to iterate over the unpacked results
.. ipython:: python
+ :okwarning:
for o in pd.read_msgpack('foo.msg', iterator=True):
print(o)
@@ -893,7 +917,7 @@ Experimental
.. _whatsnew_0130.refactoring:
-Internal Refactoring
+Internal refactoring
~~~~~~~~~~~~~~~~~~~~
In 0.13.0 there is a major refactor primarily to subclass ``Series`` from
@@ -1008,7 +1032,7 @@ to unify methods and behaviors. Series formerly subclassed directly from
.. _release.bug_fixes-0.13.0:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- ``HDFStore``
diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst
index 161b0ef395f05..6242c40d44bf8 100644
--- a/doc/source/whatsnew/v0.13.1.rst
+++ b/doc/source/whatsnew/v0.13.1.rst
@@ -43,7 +43,7 @@ Highlights include:
df.loc[0, 'A'] = np.nan
df
-Output Formatting Enhancements
+Output formatting enhancements
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- df.info() view now displays dtype info per column (:issue:`5682`)
@@ -179,7 +179,7 @@ API changes
[0 rows x 2 columns]
-Prior Version Deprecations/Changes
+Prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are no announced changes in 0.13 or prior that are taking effect as of 0.13.1
@@ -394,7 +394,7 @@ There are no experimental changes in 0.13.1
.. _release.bug_fixes-0.13.1:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in ``io.wb.get_countries`` not including all countries (:issue:`6008`)
diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst
index d61b9a40438f8..25a75492d78fb 100644
--- a/doc/source/whatsnew/v0.14.0.rst
+++ b/doc/source/whatsnew/v0.14.0.rst
@@ -245,7 +245,7 @@ API changes
.. _whatsnew_0140.display:
-Display Changes
+Display changes
~~~~~~~~~~~~~~~
- The default way of printing large DataFrames has changed. DataFrames
@@ -301,7 +301,7 @@ Display Changes
.. _whatsnew_0140.parsing:
-Text Parsing API Changes
+Text parsing API changes
~~~~~~~~~~~~~~~~~~~~~~~~
:func:`read_csv`/:func:`read_table` will now be noisier w.r.t invalid options rather than falling back to the ``PythonParser``.
@@ -321,10 +321,10 @@ Text Parsing API Changes
.. _whatsnew_0140.groupby:
-Groupby API Changes
+Groupby API changes
~~~~~~~~~~~~~~~~~~~
-More consistent behaviour for some groupby methods:
+More consistent behavior for some groupby methods:
- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:
@@ -473,7 +473,7 @@ Some other enhancements to the sql functions include:
.. _whatsnew_0140.slicers:
-MultiIndexing Using Slicers
+MultiIndexing using slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~
In 0.14.0 we added a new way to slice MultiIndexed objects.
@@ -625,7 +625,7 @@ Plotting
.. _whatsnew_0140.prior_deprecations:
-Prior Version Deprecations/Changes
+Prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are prior version deprecations that are taking effect as of 0.14.0.
@@ -731,7 +731,7 @@ Deprecations
.. _whatsnew_0140.knownissues:
-Known Issues
+Known issues
~~~~~~~~~~~~
- OpenPyXL 2.0.0 breaks backwards compatibility (:issue:`7169`)
@@ -816,7 +816,7 @@ Enhancements
- Implemented ``Panel.pct_change`` (:issue:`6904`)
- Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:`rolling_max` defaults to max,
:func:`rolling_min` defaults to min, and all others default to mean (:issue:`6297`)
-- ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
+- ``CustomBusinessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
- :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of
quantiles.
- :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`)
diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst
index 98ebbd6a52344..26018c5745a11 100644
--- a/doc/source/whatsnew/v0.14.1.rst
+++ b/doc/source/whatsnew/v0.14.1.rst
@@ -169,7 +169,7 @@ Experimental
.. _whatsnew_0141.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in ``DataFrame.where`` with a symmetric shaped frame and a passed other of a DataFrame (:issue:`7506`)
- Bug in Panel indexing with a MultiIndex axis (:issue:`7516`)
@@ -247,7 +247,7 @@ Bug Fixes
- Bug in ``DatetimeIndex`` comparison doesn't handle ``NaT`` properly (:issue:`7529`)
- Bug in passing input with ``tzinfo`` to some offsets ``apply``, ``rollforward`` or ``rollback`` resets ``tzinfo`` or raises ``ValueError`` (:issue:`7465`)
- Bug in ``DatetimeIndex.to_period``, ``PeriodIndex.asobject``, ``PeriodIndex.to_timestamp`` doesn't preserve ``name`` (:issue:`7485`)
-- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestanp`` handle ``NaT`` incorrectly (:issue:`7228`)
+- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestamp`` handle ``NaT`` incorrectly (:issue:`7228`)
- Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may return normal ``datetime`` (:issue:`7502`)
- Bug in ``resample`` raises ``ValueError`` when target contains ``NaT`` (:issue:`7227`)
- Bug in ``Timestamp.tz_localize`` resets ``nanosecond`` info (:issue:`7534`)
diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst
index f9e47b45f498d..c27ada6ef3b58 100644
--- a/doc/source/whatsnew/v0.15.0.rst
+++ b/doc/source/whatsnew/v0.15.0.rst
@@ -220,7 +220,7 @@ Finally, the combination of ``TimedeltaIndex`` with ``DatetimeIndex`` allow cert
.. _whatsnew_0150.memory:
-Memory Usage
+Memory usage
^^^^^^^^^^^^
Implemented methods to find memory usage of a DataFrame. See the :ref:`FAQ ` for more. (:issue:`6852`).
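For example, a small sketch of the new methods (the frame below is made up):

.. code-block:: python

    df = pd.DataFrame({'a': range(1000), 'b': ['x'] * 1000})

    df.memory_usage()        # bytes per column (plus the index)
    df.memory_usage().sum()  # a rough total footprint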
@@ -339,7 +339,7 @@ Timezone handling improvements
.. _whatsnew_0150.roll:
-Rolling/Expanding Moments improvements
+Rolling/expanding moments improvements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- :func:`rolling_min`, :func:`rolling_max`, :func:`rolling_cov`, and :func:`rolling_corr`
@@ -701,14 +701,19 @@ Other notable API changes:
This can also be seen in multi-axis indexing with a ``Panel``.
- .. ipython:: python
- :okwarning:
+ .. code-block:: python
+
+ >>> p = pd.Panel(np.arange(2 * 3 * 4).reshape(2, 3, 4),
+ ... items=['ItemA', 'ItemB'],
+ ... major_axis=[1, 2, 3],
+ ... minor_axis=['A', 'B', 'C', 'D'])
+ >>> p
+
+ Dimensions: 2 (items) x 3 (major_axis) x 4 (minor_axis)
+ Items axis: ItemA to ItemB
+ Major_axis axis: 1 to 3
+ Minor_axis axis: A to D
- p = pd.Panel(np.arange(2 * 3 * 4).reshape(2, 3, 4),
- items=['ItemA', 'ItemB'],
- major_axis=[1, 2, 3],
- minor_axis=['A', 'B', 'C', 'D'])
- p
The following would raise ``KeyError`` prior to 0.15.0:
@@ -879,7 +884,7 @@ Other notable API changes:
.. _whatsnew_0150.refactoring:
-Internal Refactoring
+Internal refactoring
^^^^^^^^^^^^^^^^^^^^
In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray``
@@ -1109,7 +1114,7 @@ Performance
.. _whatsnew_0150.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in pivot_table, when using margins and a dict aggfunc (:issue:`8349`)
diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst
index 1091944cb056f..2e036267b5804 100644
--- a/doc/source/whatsnew/v0.15.1.rst
+++ b/doc/source/whatsnew/v0.15.1.rst
@@ -275,7 +275,7 @@ Enhancements
.. _whatsnew_0151.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in unpickling of a ``CustomBusinessDay`` object (:issue:`8591`)
diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst
index 9f0449d6a1754..b58eabaed6127 100644
--- a/doc/source/whatsnew/v0.15.2.rst
+++ b/doc/source/whatsnew/v0.15.2.rst
@@ -160,11 +160,16 @@ Other enhancements:
- ``Panel`` now supports the ``all`` and ``any`` aggregation functions. (:issue:`8302`):
- .. ipython:: python
- :okwarning:
+ .. code-block:: python
- p = pd.Panel(np.random.rand(2, 5, 4) > 0.1)
- p.all()
+ >>> p = pd.Panel(np.random.rand(2, 5, 4) > 0.1)
+ >>> p.all()
+ 0 1 2 3
+ 0 True True True True
+ 1 True False True True
+ 2 True True True True
+ 3 False True False True
+ 4 True True True True
- Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`).
- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__.
@@ -191,7 +196,7 @@ Performance
.. _whatsnew_0152.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in concat of Series with ``category`` dtype which were coercing to ``object``. (:issue:`8641`)
diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst
index 2cb09325c9466..42b3b9332ca98 100644
--- a/doc/source/whatsnew/v0.16.0.rst
+++ b/doc/source/whatsnew/v0.16.0.rst
@@ -39,7 +39,7 @@ New features
.. _whatsnew_0160.enhancements.assign:
-DataFrame Assign
+DataFrame assign
^^^^^^^^^^^^^^^^
Inspired by `dplyr's
@@ -135,7 +135,7 @@ from a ``scipy.sparse.coo_matrix``:
.. _whatsnew_0160.enhancements.string:
-String Methods Enhancements
+String methods enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^
- The following new methods are accessible via the ``.str`` accessor to apply the function to each value. This is intended to make it more consistent with standard methods on strings. (:issue:`9282`, :issue:`9352`, :issue:`9386`, :issue:`9387`, :issue:`9439`)
@@ -228,7 +228,7 @@ sub-class of ``datetime.timedelta``. Mentioned :ref:`here `_ for similar
but more refined functionality (:issue:`3445`).
The documentation includes some examples how to convert your existing code
- using ``rplot`` to seaborn: :ref:`rplot docs `.
+ from ``rplot`` to seaborn `here `__.
- The ``pandas.sandbox.qtpandas`` interface is deprecated and will be removed in a future version.
We refer users to the external package `pandas-qt `_. (:issue:`9615`)
@@ -555,7 +555,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0160.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Fixed a performance regression for ``.loc`` indexing with an array or list-like (:issue:`9126`:).
@@ -573,7 +573,7 @@ Performance Improvements
.. _whatsnew_0160.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Changed ``.to_html`` to remove leading/trailing spaces in table body (:issue:`4987`)
@@ -638,7 +638,7 @@ Bug Fixes
- ``Series`` number formatting inconsistent when truncated (:issue:`8532`).
- Previous Behavior
+ Previous behavior
.. code-block:: python
@@ -655,7 +655,7 @@ Bug Fixes
129 1.0000
Length: 130, dtype: float64
- New Behavior
+ New behavior
.. code-block:: python
diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst
index cbcb23e356577..502c1287efdbe 100644
--- a/doc/source/whatsnew/v0.16.1.rst
+++ b/doc/source/whatsnew/v0.16.1.rst
@@ -216,7 +216,7 @@ when sampling from rows.
.. _whatsnew_0161.enhancements.string:
-String Methods Enhancements
+String methods enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^
:ref:`Continuing from v0.16.0 `, the following
@@ -279,7 +279,7 @@ enhancements make string operations easier and more consistent with standard pyt
.. _whatsnew_0161.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- ``BusinessHour`` offset is now supported, which represents business hours starting from 09:00 - 17:00 on ``BusinessDay`` by default. See :ref:`Here ` for details. (:issue:`7905`)
@@ -351,12 +351,12 @@ Deprecations
.. _whatsnew_0161.index_repr:
-Index Representation
+Index representation
~~~~~~~~~~~~~~~~~~~~
The string representation of ``Index`` and its sub-classes has now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for many values (but fewer than ``display.max_seq_items``); and a truncated display (the head and tail of the data) if there are more than ``display.max_seq_items`` items. The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`)
-Previous Behavior
+Previous behavior
.. code-block:: ipython
@@ -378,7 +378,7 @@ Previous Behavior
[2013-01-01 00:00:00-05:00, ..., 2013-04-14 00:00:00-04:00]
Length: 104, Freq: D, Timezone: US/Eastern
-New Behavior
+New behavior
.. ipython:: python
@@ -399,7 +399,7 @@ New Behavior
.. _whatsnew_0161.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`)
@@ -409,7 +409,7 @@ Performance Improvements
.. _whatsnew_0161.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug where labels did not appear properly in the legend of ``DataFrame.plot()``, passing ``label=`` arguments works, and Series indices are no longer mutated. (:issue:`9542`)
diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst
index ca0ad8d3ae7f9..543f9c6bbf300 100644
--- a/doc/source/whatsnew/v0.16.2.rst
+++ b/doc/source/whatsnew/v0.16.2.rst
@@ -86,7 +86,7 @@ See the :ref:`documentation ` for more. (:issue:`10129`)
.. _whatsnew_0162.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- Added `rsplit` to Index/Series StringMethods (:issue:`10303`)
@@ -105,7 +105,7 @@ Other Enhancements
.. _whatsnew_0162.api:
-API Changes
+API changes
~~~~~~~~~~~
- ``Holiday`` now raises ``NotImplementedError`` if both ``offset`` and ``observance`` are used in the constructor instead of returning an incorrect result (:issue:`10217`).
@@ -113,7 +113,7 @@ API Changes
.. _whatsnew_0162.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved ``Series.resample`` performance with ``dtype=datetime64[ns]`` (:issue:`7754`)
@@ -121,7 +121,7 @@ Performance Improvements
.. _whatsnew_0162.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in ``Series.hist`` raises an error when a one row ``Series`` was given (:issue:`10214`)
diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst
index c53fee42548e9..67abad659dc8d 100644
--- a/doc/source/whatsnew/v0.17.0.rst
+++ b/doc/source/whatsnew/v0.17.0.rst
@@ -5,10 +5,6 @@ v0.17.0 (October 9, 2015)
{{ header }}
-.. ipython:: python
- :suppress:
-
-
This is a major release from 0.16.2 and includes a small number of API changes, several new features,
enhancements, and performance improvements along with a large number of bug fixes. We recommend that all
@@ -107,7 +103,7 @@ This uses a new-dtype representation as well, that is very similar in look-and-f
There is a slightly different string repr for the underlying ``DatetimeIndex`` as a result of the dtype changes, but
functionally these are the same.
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
@@ -119,7 +115,7 @@ This uses a new-dtype representation as well, that is very similar in look-and-f
In [2]: pd.date_range('20130101', periods=3, tz='US/Eastern').dtype
Out[2]: dtype('<M8[ns]')
.. _whatsnew_0170.matheval:
-Support for Math Functions in .eval()
+Support for math functions in .eval()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:meth:`~pandas.eval` now supports calling math functions (:issue:`4893`)
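For example, a sketch of calling a math function inside an expression (the
column name is arbitrary):

.. code-block:: python

    import numpy as np

    df = pd.DataFrame({'a': np.random.randn(10)})
    df.eval('b = sin(a) + 1')  # 'sin' is applied element-wise to column 'a'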
@@ -333,7 +329,7 @@ has been changed to make this keyword unnecessary - the change is shown below.
.. _whatsnew_0170.gbq:
-Google BigQuery Enhancements
+Google BigQuery enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Added ability to automatically create a table/dataset using the :func:`pandas.io.gbq.to_gbq` function if the destination table/dataset does not exist. (:issue:`8325`, :issue:`11121`).
- Added ability to replace an existing table and schema when calling the :func:`pandas.io.gbq.to_gbq` function via the ``if_exists`` argument. See the `docs `__ for more details (:issue:`8325`).
@@ -343,7 +339,7 @@ Google BigQuery Enhancements
.. _whatsnew_0170.east_asian_width:
-Display Alignment with Unicode East Asian Width
+Display alignment with Unicode East Asian width
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. warning::
@@ -415,7 +411,7 @@ Other enhancements
bar = pd.Series([1, 2])
baz = pd.Series([4, 5])
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
@@ -425,7 +421,7 @@ Other enhancements
0 1 1 4
1 2 2 5
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -613,14 +609,14 @@ In prior versions it was ``errors='ignore'``. Furthermore, the ``coerce`` argume
has been deprecated in favor of ``errors='coerce'``. This means that invalid parsing
will raise rather than return the original input as in previous versions. (:issue:`10636`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
In [2]: pd.to_datetime(['2009-07-31', 'asd'])
Out[2]: array(['2009-07-31', 'asd'], dtype=object)
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -642,7 +638,7 @@ To keep the previous behavior, you can use ``errors='ignore'``:
Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword
has been deprecated in favor of ``errors='coerce'``.
-Consistent Parsing
+Consistent parsing
""""""""""""""""""
The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has
@@ -652,7 +648,7 @@ Prior to v0.17.0, ``Timestamp`` and ``to_datetime`` may parse year-only datetime
uses the beginning of the year. ``Timestamp`` and ``to_datetime`` may raise ``ValueError`` in some types of datetime-string which ``DatetimeIndex``
can parse, such as a quarterly string.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -667,7 +663,7 @@ Previous Behavior:
v0.17.0 can parse them as below. It works on ``DatetimeIndex`` also.
-New Behavior:
+New behavior:
.. ipython:: python
@@ -685,7 +681,7 @@ New Behavior:
pd.Timestamp.now()
pd.Timestamp.now() + offsets.DateOffset(years=1)
-Changes to Index Comparisons
+Changes to Index comparisons
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Operator equal on ``Index`` should behave similarly to ``Series`` (:issue:`9947`, :issue:`10637`)
@@ -693,7 +689,7 @@ Operator equal on ``Index`` should behavior similarly to ``Series`` (:issue:`994
Starting in v0.17.0, comparing ``Index`` objects of different lengths will raise
a ``ValueError``. This is to be consistent with the behavior of ``Series``.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -706,7 +702,7 @@ Previous Behavior:
In [4]: pd.Index([1, 2, 3]) == pd.Index([1, 2])
Out[4]: False
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -733,7 +729,7 @@ or it can return False if broadcasting can not be done:
np.array([1, 2, 3]) == np.array([1, 2])
-Changes to Boolean Comparisons vs. None
+Changes to boolean comparisons vs. None
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to comparing with ``np.nan``, rather than raise ``TypeError``. (:issue:`1079`).
@@ -744,14 +740,14 @@ Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to compar
s.iloc[1] = None
s
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
In [5]: s == None
TypeError: Could not compare type with Series
-New Behavior:
+New behavior:
.. ipython:: python
@@ -780,7 +776,7 @@ HDFStore dropna behavior
The default behavior for HDFStore write functions with ``format='table'`` is now to keep rows that are all missing. Previously, the behavior was to drop rows that were all missing save the index. The previous behavior can be replicated using the ``dropna=True`` option. (:issue:`9382`)
-Previous Behavior:
+Previous behavior:
.. ipython:: python
@@ -806,7 +802,7 @@ Previous Behavior:
2 2 NaN
-New Behavior:
+New behavior:
.. ipython:: python
@@ -886,7 +882,7 @@ Changes to ``Categorical.unique``
cat
cat.unique()
-Changes to ``bool`` passed as ``header`` in Parsers
+Changes to ``bool`` passed as ``header`` in parsers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In earlier versions of pandas, if a bool was passed the ``header`` argument of
@@ -905,7 +901,7 @@ A ``bool`` input to ``header`` will now raise a ``TypeError``
.. _whatsnew_0170.api_breaking.other:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
@@ -1020,7 +1016,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0170.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Development support for benchmarking with the `Air Speed Velocity library `_ (:issue:`8361`)
@@ -1043,7 +1039,7 @@ Performance Improvements
.. _whatsnew_0170.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst
index 233414dae957d..55080240f2a55 100644
--- a/doc/source/whatsnew/v0.17.1.rst
+++ b/doc/source/whatsnew/v0.17.1.rst
@@ -5,10 +5,6 @@ v0.17.1 (November 21, 2015)
{{ header }}
-.. ipython:: python
- :suppress:
-
-
.. note::
@@ -35,7 +31,7 @@ New features
.. _whatsnew_0171.style:
-Conditional HTML Formatting
+Conditional HTML formatting
^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. warning::
@@ -65,7 +61,7 @@ We can render the HTML to get the following table.
:file: whatsnew_0171_html_table.html
:class:`~pandas.core.style.Styler` interacts nicely with the Jupyter Notebook.
-See the :ref:`documentation ` for more.
+See the :ref:`documentation ` for more.
.. _whatsnew_0171.enhancements:
@@ -139,7 +135,7 @@ Deprecations
.. _whatsnew_0171.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Checking monotonic-ness before sorting on an index (:issue:`11080`)
@@ -156,7 +152,7 @@ Performance Improvements
.. _whatsnew_0171.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- ``SparseArray.__iter__()`` now does not cause ``PendingDeprecationWarning`` in Python 3.5 (:issue:`11622`)
diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst
index 9ff6ad7188f5a..a7174c6325f86 100644
--- a/doc/source/whatsnew/v0.18.0.rst
+++ b/doc/source/whatsnew/v0.18.0.rst
@@ -62,7 +62,7 @@ Window functions have been refactored to be methods on ``Series/DataFrame`` obje
df = pd.DataFrame({'A': range(10), 'B': np.random.randn(10)})
df
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -82,7 +82,7 @@ Previous Behavior:
8 7 0.079587
9 8 -0.954504
-New Behavior:
+New behavior:
.. ipython:: python
@@ -145,14 +145,14 @@ This continues to work as before for function or dict-like values.
.. _whatsnew_0180.enhancements.rangeindex:
-Range Index
+Range index
^^^^^^^^^^^
A ``RangeIndex`` has been added to the ``Int64Index`` sub-classes to support a memory saving alternative for common use cases. This has a similar implementation to the python ``range`` object (``xrange`` in python 2), in that it only stores the start, stop, and step values for the index. It will transparently interact with the user API, converting to ``Int64Index`` if needed.
This will now be the default constructed index for ``NDFrame`` objects, rather than previously an ``Int64Index``. (:issue:`939`, :issue:`12070`, :issue:`12071`, :issue:`12109`, :issue:`12888`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -168,7 +168,7 @@ Previous Behavior:
Out[6]: 8000
-New Behavior:
+New behavior:
.. ipython:: python
@@ -341,13 +341,13 @@ In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available throug
s
s.dt.round('D')
-Formatting of Integers in FloatIndex
+Formatting of integers in FloatIndex
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Integers in ``FloatIndex``, e.g. 1., are now formatted with a decimal point and a ``0`` digit, e.g. ``1.0`` (:issue:`11713`)
This change not only affects the display to the console, but also the output of IO methods like ``.to_csv`` or ``.to_html``.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -369,7 +369,7 @@ Previous Behavior:
2,3
-New Behavior:
+New behavior:
.. ipython:: python
@@ -383,7 +383,7 @@ Changes to dtype assignment behaviors
When a DataFrame's slice is updated with a new slice of the same dtype, the dtype of the DataFrame will now remain the same. (:issue:`10503`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -406,7 +406,7 @@ Previous Behavior:
b int64
dtype: object
-New Behavior:
+New behavior:
.. ipython:: python
@@ -419,7 +419,7 @@ New Behavior:
When a DataFrame's integer slice is partially updated with a new slice of floats that could potentially be down-casted to integer without losing precision, the dtype of the slice will be set to float instead of integer.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -443,7 +443,7 @@ Previous Behavior:
10 4 5 1
8 12 7 8 9
-New Behavior:
+New behavior:
.. ipython:: python
@@ -484,7 +484,7 @@ See the `xarray full-documentation here `__
* major_axis (major_axis) int64 0 1 2
* minor_axis (minor_axis) int64 0 1 2 3
-Latex Representation
+Latex representation
^^^^^^^^^^^^^^^^^^^^
``DataFrame`` has gained a ``._repr_latex_()`` method in order to allow for conversion to LaTeX in an IPython/Jupyter notebook using nbconvert. (:issue:`11778`)
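A sketch of opting in via the ``display.latex.repr`` option (it is off by default):

.. code-block:: python

    pd.set_option('display.latex.repr', True)

    df = pd.DataFrame({'a': [1, 2]})
    df._repr_latex_()  # a LaTeX tabular string once the option is enabled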
@@ -981,7 +981,7 @@ assignments are valid for multi-line expressions.
.. _whatsnew_0180.api:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`)
@@ -1074,7 +1074,7 @@ In 0.18.0, this deprecation warning is removed and these will now raise a ``Type
s2 = pd.Series([1, 2, 3], index=list('abc'))
s2
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -1104,7 +1104,7 @@ Previous Behavior:
c 3
dtype: int64
-New Behavior:
+New behavior:
For iloc, getting & setting via a float scalar will always raise.
@@ -1180,7 +1180,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0180.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of ``andrews_curves`` (:issue:`11534`)
diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst
index 069395c2e0f36..7e06e5050c5f0 100644
--- a/doc/source/whatsnew/v0.18.1.rst
+++ b/doc/source/whatsnew/v0.18.1.rst
@@ -31,7 +31,7 @@ New features
.. _whatsnew_0181.enhancements.custombusinesshour:
-Custom Business Hour
+Custom business hour
^^^^^^^^^^^^^^^^^^^^
The ``CustomBusinessHour`` is a mixture of ``BusinessHour`` and ``CustomBusinessDay`` which
@@ -199,7 +199,7 @@ On other levels
.. _whatsnew_0181.enhancements.assembling:
-Assembling Datetimes
+Assembling datetimes
^^^^^^^^^^^^^^^^^^^^
``pd.to_datetime()`` has gained the ability to assemble datetimes from a passed in ``DataFrame`` or a dict. (:issue:`8158`).
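For example, a minimal sketch of assembling datetimes from component columns:

.. code-block:: python

    df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]})
    pd.to_datetime(df)  # one datetime64[ns] value per row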
@@ -226,7 +226,7 @@ You can pass only the columns that you need to assemble.
.. _whatsnew_0181.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- ``pd.read_csv()`` now supports ``delim_whitespace=True`` for the Python engine (:issue:`12958`)
@@ -317,7 +317,7 @@ The index in ``.groupby(..).nth()`` output is now more consistent when the ``as_
'B': [1, 2, 3]})
df
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -333,7 +333,7 @@ Previous Behavior:
1 2
Name: B, dtype: int64
-New Behavior:
+New behavior:
.. ipython:: python
@@ -348,7 +348,7 @@ Furthermore, previously, a ``.groupby`` would always sort, regardless if ``sort=
df = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'b'])
df['c'] = np.random.randint(0, 4, 100)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -370,7 +370,7 @@ Previous Behavior:
2 -0.720589 0.887163
3 0.859588 -0.636524
-New Behavior:
+New behavior:
.. ipython:: python
@@ -446,7 +446,7 @@ Previous behavior:
2000-11-30 value 13
dtype: int64
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -580,7 +580,7 @@ Deprecations
.. _whatsnew_0181.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved speed of SAS reader (:issue:`12656`, :issue:`12961`)
@@ -601,7 +601,7 @@ Performance Improvements
.. _whatsnew_0181.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file are not even (:issue:`12203`)
- Bug in ``groupby.transform(..)`` when ``axis=1`` is specified with a non-monotonic ordered index (:issue:`12713`)
diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index de29a1eb93709..1dad8769a6b39 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -218,7 +218,7 @@ contained the values ``[0, 3]``.
**New behavior**:
.. ipython:: python
- :okwarning:
+ :okexcept:
pd.read_csv(StringIO(data), names=names)
@@ -264,7 +264,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
.. _whatsnew_0190.enhancements.union_categoricals:
-Categorical Concatenation
+Categorical concatenation
^^^^^^^^^^^^^^^^^^^^^^^^^
- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`13763`, :issue:`13846`, :issue:`14173`)
@@ -298,7 +298,7 @@ Categorical Concatenation
.. _whatsnew_0190.enhancements.semi_month_offsets:
-Semi-Month Offsets
+Semi-month offsets
^^^^^^^^^^^^^^^^^^
Pandas has gained new frequency offsets, ``SemiMonthEnd`` ('SM') and ``SemiMonthBegin`` ('SMS').
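A brief sketch of the anchoring (the start date is arbitrary):

.. code-block:: python

    pd.date_range('2015-01-01', freq='SM', periods=4)   # 15th and month end
    pd.date_range('2015-01-01', freq='SMS', periods=4)  # 1st and 15th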
@@ -596,7 +596,7 @@ Comparison operators
Comparison operators raise ``ValueError`` when ``.index`` are different.
-**Previous Behavior** (``Series``):
+**Previous behavior** (``Series``):
``Series`` compared values ignoring the ``.index`` as long as both had the same length:
@@ -631,7 +631,7 @@ Comparison operators raise ``ValueError`` when ``.index`` are different.
s1.eq(s2)
-**Current Behavior** (``DataFrame``, no change):
+**Current behavior** (``DataFrame``, no change):
.. code-block:: ipython
@@ -675,7 +675,7 @@ Logical operators align both ``.index`` of left and right hand side.
s1 & s2.reindex_like(s1)
-**Current Behavior** (``DataFrame``, no change):
+**Current behavior** (``DataFrame``, no change):
.. ipython:: python
@@ -1324,7 +1324,7 @@ operations on that platform.
.. _whatsnew_0190.api.other:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds, previously this would print a message to stdout (:issue:`14101`).
@@ -1406,7 +1406,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0190.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`)
@@ -1426,7 +1426,7 @@ Performance Improvements
.. _whatsnew_0190.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
@@ -1513,7 +1513,7 @@ Bug Fixes
- Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`)
- Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`)
- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)
-- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`)
+- Bug in ``Period`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`)
- Bug in ``pd.set_eng_float_format()`` that would prevent NaN and Inf from formatting (:issue:`11981`)
- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`)
- Clean some compile time warnings in datetime parsing (:issue:`13607`)
diff --git a/doc/source/whatsnew/v0.19.1.rst b/doc/source/whatsnew/v0.19.1.rst
index 12f3e985565e0..a89d1461073bd 100644
--- a/doc/source/whatsnew/v0.19.1.rst
+++ b/doc/source/whatsnew/v0.19.1.rst
@@ -22,7 +22,7 @@ We recommend that all users upgrade to this version.
.. _whatsnew_0191.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
@@ -34,7 +34,7 @@ Performance Improvements
.. _whatsnew_0191.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Source installs from PyPI will now again work without ``cython`` installed, as in previous versions (:issue:`14204`)
diff --git a/doc/source/whatsnew/v0.19.2.rst b/doc/source/whatsnew/v0.19.2.rst
index 14310ceb45b4a..023bc78081ec9 100644
--- a/doc/source/whatsnew/v0.19.2.rst
+++ b/doc/source/whatsnew/v0.19.2.rst
@@ -39,7 +39,7 @@ The ``pd.merge_asof()``, added in 0.19.0, gained some improvements:
.. _whatsnew_0192.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance regression with ``PeriodIndex`` (:issue:`14822`)
@@ -50,7 +50,7 @@ Performance Improvements
.. _whatsnew_0192.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Compat with python 3.6 for pickling of some offsets (:issue:`14685`)
- Compat with python 3.6 for some indexing exception types (:issue:`14684`, :issue:`14689`)
diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst
index 6a88a5810eca4..ef6108ae3ec90 100644
--- a/doc/source/whatsnew/v0.20.0.rst
+++ b/doc/source/whatsnew/v0.20.0.rst
@@ -151,7 +151,7 @@ commonly called 'unix epoch' or POSIX time. This was the previous default, so th
.. _whatsnew_0200.enhancements.groupby_access:
-Groupby Enhancements
+Groupby enhancements
^^^^^^^^^^^^^^^^^^^^
Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names. Previously, only column names could be referenced. This allows to easily group by a column and index level at the same time. (:issue:`5677`)
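For readers scanning this hunk, a minimal sketch of the enhancement described above (the frame, the index level name ``store`` and the column ``product`` are made up for illustration; assuming pandas >= 0.20):

.. code-block:: python

   import pandas as pd

   # Hypothetical data: "store" lives on the index, "product" is an ordinary column.
   df = pd.DataFrame(
       {"product": ["apple", "pear", "apple", "pear"], "sales": [10, 20, 30, 40]},
       index=pd.Index(["s1", "s1", "s2", "s2"], name="store"),
   )

   # Strings passed to ``by`` may now name either columns or index levels.
   df.groupby(["store", "product"])["sales"].sum()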
@@ -240,7 +240,7 @@ The default is to infer the compression type from the extension (``compression='
.. _whatsnew_0200.enhancements.uint64_support:
-UInt64 Support Improved
+UInt64 support improved
^^^^^^^^^^^^^^^^^^^^^^^
Pandas has significantly improved support for operations involving unsigned,
@@ -263,7 +263,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
.. _whatsnew_0200.enhancements.groupy_categorical:
-GroupBy on Categoricals
+GroupBy on categoricals
^^^^^^^^^^^^^^^^^^^^^^^
In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)
@@ -280,7 +280,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr
ordered=True)})
df
-**Previous Behavior**:
+**Previous behavior**:
.. code-block:: ipython
@@ -288,7 +288,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr
---------------------------------------------------------------------------
ValueError: items in new_categories are not the same as in old categories
-**New Behavior**:
+**New behavior**:
.. ipython:: python
@@ -296,7 +296,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr
.. _whatsnew_0200.enhancements.table_schema:
-Table Schema Output
+Table schema output
^^^^^^^^^^^^^^^^^^^
The new orient ``'table'`` for :meth:`DataFrame.to_json`
@@ -387,7 +387,7 @@ For example, after running the following, ``styled.xlsx`` renders as below:
import os
os.remove('styled.xlsx')
-See the :ref:`Style documentation ` for more detail.
+See the :ref:`Style documentation ` for more detail.
.. _whatsnew_0200.enhancements.intervalindex:
@@ -396,7 +396,7 @@ IntervalIndex
pandas has gained an ``IntervalIndex`` with its own dtype, ``interval`` as well as the ``Interval`` scalar type. These allow first-class support for interval
notation, specifically as a return type for the categories in :func:`cut` and :func:`qcut`. The ``IntervalIndex`` allows some unique indexing, see the
-:ref:`docs `. (:issue:`7640`, :issue:`8625`)
+:ref:`docs `. (:issue:`7640`, :issue:`8625`)
.. warning::
@@ -457,7 +457,7 @@ Selecting via a scalar value that is contained *in* the intervals.
.. _whatsnew_0200.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`)
@@ -497,7 +497,7 @@ Other Enhancements
- ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`)
- ``pd.read_html()`` will parse multiple header rows, creating a MutliIndex header. (:issue:`13434`).
- HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`)
-- :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, see the :ref:`example notebook ` (:issue:`15649`)
+- :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, see the :ref:`example notebook ` (:issue:`15649`)
- :meth:`Styler.render() ` now accepts ``**kwargs`` to allow user-defined variables in the template (:issue:`15649`)
- Compatibility with Jupyter notebook 5.0; MultiIndex column labels are left-aligned and MultiIndex row-labels are top-aligned (:issue:`15379`)
- ``TimedeltaIndex`` now has a custom date-tick formatter specifically designed for nanosecond level precision (:issue:`8711`)
@@ -580,7 +580,7 @@ Map on Index types now return other Index types
mi = pd.MultiIndex.from_tuples([(1, 2), (2, 4)])
mi
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -596,7 +596,7 @@ Previous Behavior:
In [8]: mi.map(lambda x: x[0])
Out[8]: array([1, 2])
-New Behavior:
+New behavior:
.. ipython:: python
@@ -616,7 +616,7 @@ New Behavior:
.tz_localize('Asia/Tokyo'))
s
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -627,7 +627,7 @@ Previous Behavior:
2 2
dtype: int32
-New Behavior:
+New behavior:
.. ipython:: python
@@ -653,7 +653,7 @@ Previous behaviour:
In [2]: idx.hour
Out[2]: array([ 0, 10, 20, 6, 16], dtype=int32)
-New Behavior:
+New behavior:
.. ipython:: python
@@ -697,7 +697,7 @@ data-types would yield different return types. These are now made consistent. (:
...: pd.Timestamp('20160101', tz='US/Eastern')])
Out[8]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]')
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -727,7 +727,7 @@ data-types would yield different return types. These are now made consistent. (:
In [2]: pd.unique(pd.Series(list('baabc'), dtype='category'))
Out[2]: array(['b', 'a', 'c'], dtype=object)
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -737,7 +737,7 @@ data-types would yield different return types. These are now made consistent. (:
.. _whatsnew_0200.api_breaking.s3:
-S3 File Handling
+S3 file handling
^^^^^^^^^^^^^^^^
pandas now uses `s3fs `_ for handling S3 connections. This shouldn't break
@@ -746,7 +746,7 @@ in prior versions of pandas. (:issue:`11915`).
.. _whatsnew_0200.api_breaking.partial_string_indexing:
-Partial String Indexing Changes
+Partial string indexing changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:ref:`DatetimeIndex Partial String Indexing ` now works as an exact match, provided that string resolution coincides with index resolution, including a case when both are seconds (:issue:`14826`). See :ref:`Slice vs. Exact Match ` for details.
@@ -756,7 +756,7 @@ Partial String Indexing Changes
df = pd.DataFrame({'a': [1, 2, 3]}, pd.DatetimeIndex(['2011-12-31 23:59:59',
'2012-01-01 00:00:00',
'2012-01-01 00:00:01']))
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -771,7 +771,7 @@ Previous Behavior:
Name: a, dtype: int64
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -797,7 +797,7 @@ Now the smallest acceptable dtype will be used (:issue:`13247`)
df2 = pd.DataFrame(np.array([np.nan], dtype=np.float32, ndmin=2))
df2.dtypes
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -806,7 +806,7 @@ Previous Behavior:
0 float64
dtype: object
-New Behavior:
+New behavior:
.. ipython:: python
@@ -823,12 +823,12 @@ currently released version of ``pandas-gbq=0.1.4``. Documentation is now hosted
.. _whatsnew_0200.api_breaking.memory_usage:
-Memory Usage for Index is more Accurate
+Memory usage for Index is more accurate
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In previous versions, showing ``.memory_usage()`` on a pandas structure that has an index, would only include actual index values and not include structures that facilitated fast indexing. This will generally be different for ``Index`` and ``MultiIndex`` and less-so for other index types. (:issue:`15237`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -843,7 +843,7 @@ Previous Behavior:
In [11]: index.memory_usage(deep=True)
Out[11]: 180
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -900,7 +900,7 @@ doesn't behave as desired.
[[0, 0, 1, 1], [0, 1, 0, 1]]))
df
-Previous Behavior:
+Previous behavior:
.. code-block:: python
@@ -918,7 +918,7 @@ Previous Behavior:
In [15]: df.sort_index().index.is_monotonic
Out[15]: False
-New Behavior:
+New behavior:
.. ipython:: python
@@ -929,13 +929,13 @@ New Behavior:
.. _whatsnew_0200.api_breaking.groupby_describe:
-Groupby Describe Formatting
+Groupby describe formatting
^^^^^^^^^^^^^^^^^^^^^^^^^^^
The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -970,7 +970,7 @@ Previous Behavior:
1 1.5 0.707107 1 2
2 3.5 0.707107 3 4
-New Behavior:
+New behavior:
.. ipython:: python
@@ -982,7 +982,7 @@ New Behavior:
.. _whatsnew_0200.api_breaking.rolling_pairwise:
-Window Binary Corr/Cov operations return a MultiIndex DataFrame
+Window binary corr/cov operations return a MultiIndex DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ``.rolling(..)``, ``.expanding(..)``, or ``.ewm(..)`` object,
@@ -1000,7 +1000,7 @@ See the section on :ref:`Windowed Binary Operations ` for
periods=100, freq='D', name='foo'))
df.tail()
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -1012,7 +1012,7 @@ Previous Behavior:
Major_axis axis: A to B
Minor_axis axis: A to B
-New Behavior:
+New behavior:
.. ipython:: python
@@ -1040,7 +1040,7 @@ usually resulting in an invalid comparison, returning an empty result frame. The
df.to_hdf('store.h5', 'key', format='table', data_columns=True)
df.dtypes
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -1050,7 +1050,7 @@ Previous Behavior:
^
SyntaxError: invalid token
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -1084,14 +1084,14 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method.
right = pd.Index([1, 2, 3])
right
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
In [4]: left.intersection(right)
Out[4]: Int64Index([1, 2], dtype='int64')
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -1106,7 +1106,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method.
right = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3])
right
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
@@ -1116,7 +1116,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method.
1 10 100
2 20 200
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -1124,7 +1124,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method.
.. _whatsnew_0200.api_breaking.pivot_table:
-Pivot Table always returns a DataFrame
+Pivot table always returns a DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The documentation for :meth:`pivot_table` states that a ``DataFrame`` is *always* returned. Here a bug
@@ -1137,7 +1137,7 @@ is fixed that allowed this to return a ``Series`` under certain circumstance. (:
'col3': [1, 3, 9]})
df
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -1149,7 +1149,7 @@ Previous Behavior:
9 E 5
Name: col1, dtype: int64
-New Behavior:
+New behavior:
.. ipython:: python
@@ -1157,7 +1157,7 @@ New Behavior:
.. _whatsnew_0200.api:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`).
@@ -1192,12 +1192,12 @@ Other API Changes
.. _whatsnew_0200.privacy:
-Reorganization of the library: Privacy Changes
+Reorganization of the library: privacy changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. _whatsnew_0200.privacy.extensions:
-Modules Privacy Has Changed
+Modules privacy has changed
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Some formerly public python/c/c++/cython extension modules have been moved and/or renamed. These are all removed from the public API.
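A minimal sketch of the public entry points that replace the relocated modules (the specific names below are only illustrative members of the stable API; assuming pandas >= 0.20):

.. code-block:: python

   # Stable, public locations instead of the moved/renamed private modules:
   from pandas.api.types import is_integer_dtype
   from pandas.errors import EmptyDataError  # noqa: F401

   is_integer_dtype("int64")  # True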
@@ -1327,7 +1327,7 @@ Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some example
df
-Previous Behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column.
+Previous behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column.
.. code-block:: ipython
@@ -1532,7 +1532,7 @@ Should be changed to:
.. _whatsnew_0200.deprecations.other:
-Other Deprecations
+Other deprecations
^^^^^^^^^^^^^^^^^^
- ``SparseArray.to_dense()`` has deprecated the ``fill`` parameter, as that parameter was not being respected (:issue:`14647`)
@@ -1564,7 +1564,7 @@ Removal of prior version deprecations/changes
- The ``pandas.rpy`` module is removed. Similar functionality can be accessed
through the `rpy2 `__ project.
- See the :ref:`R interfacing docs ` for more details.
+ See the `R interfacing docs `__ for more details.
- The ``pandas.io.ga`` module with a ``google-analytics`` interface is removed (:issue:`11308`).
Similar functionality can be found in the `Google2Pandas `__ package.
- ``pd.to_datetime`` and ``pd.to_timedelta`` have dropped the ``coerce`` parameter in favor of ``errors`` (:issue:`13602`)
@@ -1584,7 +1584,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0200.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
@@ -1606,7 +1606,7 @@ Performance Improvements
.. _whatsnew_0200.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
Conversion
@@ -1713,7 +1713,7 @@ Plotting
- Bug in the date and time converters pandas registers with matplotlib not handling multiple dimensions (:issue:`16026`)
- Bug in ``pd.scatter_matrix()`` could accept either ``color`` or ``c``, but not both (:issue:`14855`)
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
diff --git a/doc/source/whatsnew/v0.20.2.rst b/doc/source/whatsnew/v0.20.2.rst
index b2592579eb03f..232d1d283d9bd 100644
--- a/doc/source/whatsnew/v0.20.2.rst
+++ b/doc/source/whatsnew/v0.20.2.rst
@@ -35,7 +35,7 @@ Enhancements
.. _whatsnew_0202.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance regression fix when indexing with a list-like (:issue:`16285`)
@@ -46,7 +46,7 @@ Performance Improvements
.. _whatsnew_0202.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Silenced a warning on some Windows environments about "tput: terminal attributes: No such device or address" when
@@ -97,7 +97,7 @@ Plotting
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in creating a time-based rolling window on an empty ``DataFrame`` (:issue:`15819`)
diff --git a/doc/source/whatsnew/v0.20.3.rst b/doc/source/whatsnew/v0.20.3.rst
index 8dc6acc2074bd..72faabd95bf1f 100644
--- a/doc/source/whatsnew/v0.20.3.rst
+++ b/doc/source/whatsnew/v0.20.3.rst
@@ -20,7 +20,7 @@ and bug fixes. We recommend that all users upgrade to this version.
.. _whatsnew_0203.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Fixed a bug in failing to compute rolling computations of a column-MultiIndexed ``DataFrame`` (:issue:`16789`, :issue:`16825`)
diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst
index 5c6f1d1af6b54..34b610e8af0b3 100644
--- a/doc/source/whatsnew/v0.21.0.rst
+++ b/doc/source/whatsnew/v0.21.0.rst
@@ -263,7 +263,7 @@ Now, to find prices per store/product, we can simply do:
See the :ref:`documentation ` for more.
-.. _whatsnew_0210.enhancements.reanme_categories:
+.. _whatsnew_0210.enhancements.rename_categories:
``Categorical.rename_categories`` accepts a dict-like
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
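A minimal sketch of the dict-like usage announced in this section (the category names are illustrative; assuming pandas >= 0.21):

.. code-block:: python

   import pandas as pd

   cat = pd.Categorical(["a", "b", "a"])
   # The dict maps old category names to new ones, as in DataFrame.rename.
   cat.rename_categories({"a": "alpha", "b": "beta"})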
@@ -300,7 +300,7 @@ as in :meth:`DataFrame.rename`.
.. _whatsnew_0210.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
New functions or methods
@@ -412,13 +412,13 @@ Previously WITH ``bottleneck``:
In [2]: s.sum()
Out[2]: 0.0
-New Behavior, without regard to the bottleneck installation:
+New behavior, without regard to the bottleneck installation:
.. ipython:: python
s.sum()
-Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottlenck`` installation:
+Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottleneck`` installation:
.. code-block:: ipython
@@ -434,7 +434,7 @@ but for consistency with the all-NaN case, this was changed to return NaN as wel
.. _whatsnew_0210.api_breaking.loc:
-Indexing with a list with missing labels is Deprecated
+Indexing with a list with missing labels is deprecated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Previously, selecting with a list of labels, where one or more labels were missing would always succeed, returning ``NaN`` for missing labels.
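A minimal sketch of the ``.reindex()`` idiom that this deprecation points users toward (the labels are illustrative):

.. code-block:: python

   import pandas as pd

   s = pd.Series([1, 2, 3])
   # Explicitly reindexing states the intent and fills missing labels with NaN,
   # instead of relying on list-based selection with labels that may be absent.
   s.reindex([1, 2, 4])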
@@ -448,7 +448,7 @@ See the :ref:`deprecation docs `.
s = pd.Series([1, 2, 3])
s
-Previous Behavior
+Previous behavior
.. code-block:: ipython
@@ -460,7 +460,7 @@ Previous Behavior
dtype: float64
-Current Behavior
+Current behavior
.. code-block:: ipython
@@ -492,7 +492,7 @@ Selection with all keys found is unchanged.
.. _whatsnew_0210.api.na_changes:
-NA naming Changes
+NA naming changes
^^^^^^^^^^^^^^^^^
In order to promote more consistency among the pandas API, we have added additional top-level
@@ -524,7 +524,7 @@ Previously:
In [2]: type(list(s)[0])
Out[2]: numpy.int64
-New Behaviour:
+New behavior:
.. ipython:: python
@@ -544,7 +544,7 @@ Previously:
In [8]: type(df.to_dict()['a'][0])
Out[8]: numpy.int64
-New Behaviour:
+New behavior:
.. ipython:: python
@@ -561,7 +561,7 @@ you would get a label based selection, potentially duplicating result labels, ra
(where ``True`` selects elements), this was inconsistent how a boolean numpy array indexed. The new behavior is to
act like a boolean numpy array indexer. (:issue:`17738`)
-Previous Behavior:
+Previous behavior:
.. ipython:: python
@@ -578,7 +578,7 @@ Previous Behavior:
True 2
dtype: int64
-Current Behavior
+Current behavior
.. ipython:: python
@@ -588,7 +588,7 @@ Current Behavior
Furthermore, previously if you had an index that was non-numeric (e.g. strings), then a boolean Index would raise a ``KeyError``.
This will now be treated as a boolean indexer.
-Previously Behavior:
+Previous behavior:
.. ipython:: python
@@ -600,7 +600,7 @@ Previously Behavior:
In [39]: s.loc[pd.Index([True, False, True])]
KeyError: "None of [Index([True, False, True], dtype='object')] are in the [index]"
-Current Behavior
+Current behavior
.. ipython:: python
@@ -614,7 +614,7 @@ Current Behavior
In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -634,7 +634,7 @@ Previous Behavior:
In [5]: resampled.index
Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC')
-New Behavior:
+New behavior:
.. ipython:: python
@@ -650,7 +650,7 @@ New Behavior:
Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -670,7 +670,7 @@ Previous Behavior:
open high low close
2000-01 0 9 0 9
-New Behavior:
+New behavior:
.. ipython:: python
@@ -732,7 +732,7 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in
.. _whatsnew_0210.api_breaking.dtype_conversions:
-Dtype Conversions
+Dtype conversions
^^^^^^^^^^^^^^^^^
Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to same the type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes. (:issue:`16821`).
@@ -752,7 +752,7 @@ Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignmen
2 3
dtype: int64
-New Behavior
+New behavior
.. ipython:: python
@@ -789,7 +789,7 @@ These now coerce to ``object`` dtype.
.. _whatsnew_210.api.multiindex_single:
-MultiIndex Constructor with a Single Level
+MultiIndex constructor with a single level
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``MultiIndex`` constructors no longer squeezes a MultiIndex with all
@@ -818,7 +818,7 @@ UTC Localization with Series
Previously, :func:`to_datetime` did not localize datetime ``Series`` data when ``utc=True`` was passed. Now, :func:`to_datetime` will correctly localize ``Series`` with a ``datetime64[ns, UTC]`` dtype to be consistent with how list-like and ``Index`` data are handled. (:issue:`6415`).
-Previous Behavior
+Previous behavior
.. ipython:: python
@@ -833,7 +833,7 @@ Previous Behavior
2 2013-01-01
dtype: datetime64[ns]
-New Behavior
+New behavior
.. ipython:: python
@@ -843,14 +843,14 @@ Additionally, DataFrames with datetime columns that were parsed by :func:`read_s
.. _whatsnew_0210.api.consistency_of_range_functions:
-Consistency of Range Functions
+Consistency of range functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In previous versions, there were some inconsistencies between the various range functions: :func:`date_range`, :func:`bdate_range`, :func:`period_range`, :func:`timedelta_range`, and :func:`interval_range`. (:issue:`17471`).
One of the inconsistent behaviors occurred when the ``start``, ``end`` and ``period`` parameters were all specified, potentially leading to ambiguous ranges. When all three parameters were passed, ``interval_range`` ignored the ``period`` parameter, ``period_range`` ignored the ``end`` parameter, and the other range functions raised. To promote consistency among the range functions, and avoid potentially ambiguous ranges, ``interval_range`` and ``period_range`` will now raise when all three parameters are passed.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -863,7 +863,7 @@ Previous Behavior:
In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q')
Out[3]: PeriodIndex(['2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2'], dtype='period[Q-DEC]', freq='Q-DEC')
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -877,7 +877,7 @@ New Behavior:
Additionally, the endpoint parameter ``end`` was not included in the intervals produced by ``interval_range``. However, all other range functions include ``end`` in their output. To promote consistency among the range functions, ``interval_range`` will now include ``end`` as the right endpoint of the final interval, except if ``freq`` is specified in a way which skips ``end``.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -888,7 +888,7 @@ Previous Behavior:
dtype='interval[int64]')
-New Behavior:
+New behavior:
.. ipython:: python
@@ -896,7 +896,7 @@ New Behavior:
.. _whatsnew_0210.api.mpl_converters:
-No Automatic Matplotlib Converters
+No automatic Matplotlib converters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Pandas no longer registers our ``date``, ``time``, ``datetime``,
@@ -915,7 +915,7 @@ converters on first-use (:issue:`17710`).
.. _whatsnew_0210.api:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
@@ -1024,7 +1024,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0210.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)
@@ -1036,7 +1036,7 @@ Performance Improvements
.. _whatsnew_0210.docs:
-Documentation Changes
+Documentation changes
~~~~~~~~~~~~~~~~~~~~~
- Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`)
@@ -1044,7 +1044,7 @@ Documentation Changes
.. _whatsnew_0210.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
Conversion
@@ -1114,7 +1114,7 @@ Plotting
- Bug causing ``plotting.parallel_coordinates`` to reset the random seed when using random colors (:issue:`17525`)
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`)
diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst
index c8897ca86e8cf..64f3339834b38 100644
--- a/doc/source/whatsnew/v0.21.1.rst
+++ b/doc/source/whatsnew/v0.21.1.rst
@@ -31,7 +31,7 @@ Highlights include:
.. _whatsnew_0211.converters:
-Restore Matplotlib datetime Converter Registration
+Restore Matplotlib datetime converter registration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Pandas implements some matplotlib converters for nicely formatting the axis
@@ -77,7 +77,7 @@ Improvements to the Parquet IO functionality
.. _whatsnew_0211.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`)
@@ -93,14 +93,14 @@ Deprecations
.. _whatsnew_0211.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of plotting large series/dataframes (:issue:`18236`).
.. _whatsnew_0211.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
Conversion
@@ -143,7 +143,7 @@ Plotting
- Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not pickleable in Python 3 (:issue:`18439`)
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst
index b38fcd9d62af4..ea36b35d61740 100644
--- a/doc/source/whatsnew/v0.22.0.rst
+++ b/doc/source/whatsnew/v0.22.0.rst
@@ -37,7 +37,7 @@ time, we changed the sum and prod of an empty ``Series`` to also be ``NaN``.
Based on feedback, we've partially reverted those changes.
-Arithmetic Operations
+Arithmetic operations
^^^^^^^^^^^^^^^^^^^^^
The default sum for empty or all-*NA* ``Series`` is now ``0``.
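A minimal sketch of the new defaults and of the ``min_count`` escape hatch discussed further down (assuming pandas >= 0.22):

.. code-block:: python

   import numpy as np
   import pandas as pd

   pd.Series([], dtype="float64").sum()   # 0.0 under the new default
   pd.Series([np.nan]).sum()              # also 0.0: an all-NA sum is zero
   pd.Series([np.nan]).sum(min_count=1)   # NaN, restoring the 0.21 result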
@@ -93,7 +93,7 @@ returning ``1`` instead.
These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well.
Finally, a few less obvious places in pandas are affected by this change.
-Grouping by a Categorical
+Grouping by a categorical
^^^^^^^^^^^^^^^^^^^^^^^^^
Grouping by a ``Categorical`` and summing now returns ``0`` instead of
@@ -196,7 +196,7 @@ Once again, the ``min_count`` keyword is available to restore the 0.21 behavior.
pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1)
-Rolling and Expanding
+Rolling and expanding
^^^^^^^^^^^^^^^^^^^^^
Rolling and expanding already have a ``min_periods`` keyword that behaves
diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst
index 98479fa30eb15..f4c283ea742f7 100644
--- a/doc/source/whatsnew/v0.23.0.rst
+++ b/doc/source/whatsnew/v0.23.0.rst
@@ -22,7 +22,7 @@ Highlights include:
- :ref:`Instantiation from dicts respects order for Python 3.6+ `.
- :ref:`Dependent column arguments for assign `.
- :ref:`Merging / sorting on a combination of columns and index levels `.
-- :ref:`Extending Pandas with custom types `.
+- :ref:`Extending pandas with custom types `.
- :ref:`Excluding unobserved categories from groupby `.
- :ref:`Changes to make output shape of DataFrame.apply consistent