diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index f7d0083b86a01..3303483c50e20 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -124,6 +124,25 @@ def time_dropna(self, dtype): self.s.dropna() +class SearchSorted(object): + + goal_time = 0.2 + params = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float16', 'float32', 'float64', + 'str'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10**5 + data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) + self.s = Series(data) + + def time_searchsorted(self, dtype): + key = '2' if dtype == 'str' else 2 + self.s.searchsorted(key) + + class Map(object): params = ['dict', 'Series'] diff --git a/ci/deps/azure-27-compat.yaml b/ci/deps/azure-27-compat.yaml index 986855c464852..c68b51fbd6644 100644 --- a/ci/deps/azure-27-compat.yaml +++ b/ci/deps/azure-27-compat.yaml @@ -21,6 +21,7 @@ dependencies: - pytest - pytest-xdist - pytest-mock + - isort - pip: - html5lib==1.0b2 - beautifulsoup4==4.2.1 diff --git a/ci/deps/azure-27-locale.yaml b/ci/deps/azure-27-locale.yaml index f73079ecbe3d2..5679c503caddc 100644 --- a/ci/deps/azure-27-locale.yaml +++ b/ci/deps/azure-27-locale.yaml @@ -24,6 +24,7 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 + - isort - pip: - html5lib==1.0b2 - beautifulsoup4==4.2.1 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 6b8d38fd25082..de1f4ad0e9a76 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -30,5 +30,6 @@ dependencies: - pytest-xdist - pytest-mock - moto + - isort - pip: - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 569b71dae003b..a89e63a2b7d3a 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -28,6 +28,7 @@ dependencies: - pytest - pytest-xdist - pytest-mock + - isort - pip: - hypothesis>=3.58.0 - moto # latest moto in conda-forge fails with 3.7, move to conda dependencies when this is fixed diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index a37be124cc546..3132de891299c 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -10,6 +10,7 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 + - isort - pip: - "git+git://github.com/dateutil/dateutil.git" - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com" diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index d1fe926744ecd..9710bcb5bf43d 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -25,6 +25,7 @@ dependencies: - pytest - pytest-xdist - pytest-mock + - isort - pip: - python-dateutil==2.5.3 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-27.yaml b/ci/deps/azure-windows-27.yaml index 74faeed83c387..093c055e69553 100644 --- a/ci/deps/azure-windows-27.yaml +++ b/ci/deps/azure-windows-27.yaml @@ -30,3 +30,4 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 + - isort diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 94d67b3d37788..e9db271a75d9d 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -27,3 +27,4 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 + - isort diff --git a/ci/deps/travis-27.yaml b/ci/deps/travis-27.yaml index 4915c003bce4e..71b224b2c68c2 100644 --- a/ci/deps/travis-27.yaml +++ 
b/ci/deps/travis-27.yaml @@ -44,6 +44,7 @@ dependencies: - pytest-mock - moto==1.3.4 - hypothesis>=3.58.0 + - isort - pip: - backports.lzma - pandas-gbq diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml index 26f3a17432ab2..1a65d292ef085 100644 --- a/ci/deps/travis-36-doc.yaml +++ b/ci/deps/travis-36-doc.yaml @@ -43,3 +43,4 @@ dependencies: # universal - pytest - pytest-xdist + - isort diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 2a7692f10752c..36dbb8013104a 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -32,5 +32,6 @@ dependencies: - pytest-xdist - pytest-mock - moto + - isort - pip: - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index 7934d179c8618..f4b9091c4300b 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -30,3 +30,4 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 + - isort diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml index 857c3fadfdaeb..e22529784b5ec 100644 --- a/ci/deps/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -38,6 +38,7 @@ dependencies: - pytest-cov - pytest-mock - hypothesis>=3.58.0 + - isort - pip: - brotlipy - coverage diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 125750191de7d..a8a5df5894ba5 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -17,5 +17,6 @@ dependencies: - pytest-mock - hypothesis>=3.58.0 - s3fs + - isort - pip: - moto diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index e6928d9efde06..9e5034f6d3db0 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -33,8 +33,9 @@ decorate a class, providing the name of attribute to add. The class's @staticmethod def _validate(obj): - if 'lat' not in obj.columns or 'lon' not in obj.columns: - raise AttributeError("Must have 'lat' and 'lon'.") + # verify there is a column latitude and a column longitude + if 'latitude' not in obj.columns or 'longitude' not in obj.columns: + raise AttributeError("Must have 'latitude' and 'longitude'.") @property def center(self): diff --git a/doc/source/styled.xlsx b/doc/source/styled.xlsx new file mode 100644 index 0000000000000..1233ff2b8692b Binary files /dev/null and b/doc/source/styled.xlsx differ diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index a462f01dcd14f..7883814e91c94 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -335,7 +335,7 @@ examined :ref:`in the API `. Interpolation ~~~~~~~~~~~~~ -.. versionadded:: 0.21.0 +.. versionadded:: 0.23.0 The ``limit_area`` keyword argument was added. diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 23f1aabd69ff3..4e2c428415926 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -633,6 +633,16 @@ We are stopping on the included end-point as it is part of the index: dft2 = dft2.swaplevel(0, 1).sort_index() dft2.loc[idx[:, '2013-01-05'], :] +.. versionadded:: 0.25.0 + +Slicing with string indexing also honors UTC offset. + +.. ipython:: python + + df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + df + df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] + .. _timeseries.slice_vs_exact_match: Slice vs. 
Exact Match diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index a7e522d27f8e2..8f4beb3f484a4 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -96,7 +96,7 @@ Bug Fixes **Other** - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`) -- +- Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`) - .. _whatsnew_0.242.contributors: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6e225185ecf84..170e7f14da397 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -22,6 +22,8 @@ Other Enhancements - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) +- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) +- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - .. _whatsnew_0250.api_breaking: @@ -29,7 +31,37 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :meth:`Timestamp.strptime` will now raise a NotImplementedError (:issue:`25016`) +.. _whatsnew_0250.api_breaking.utc_offset_indexing: + +Indexing with date strings with UTC offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Indexing a :class:`DataFrame` or :class:`Series` with a :class:`DatetimeIndex` with a +date string with a UTC offset would previously ignore the UTC offset. Now, the UTC offset +is respected in indexing. (:issue:`24076`, :issue:`16785`) + +*Previous Behavior*: + +.. code-block:: ipython + + In [1]: df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + + In [2]: df + Out[2]: + 0 + 2019-01-01 00:00:00-08:00 0 + + In [3]: df['2019-01-01 00:00:00+04:00':'2019-01-01 01:00:00+04:00'] + Out[3]: + 0 + 2019-01-01 00:00:00-08:00 0 + +*New Behavior*: + +.. ipython:: python + + df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] .. _whatsnew_0250.api.other: @@ -38,7 +70,7 @@ Other API Changes - :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`) - ``Timestamp`` and ``Timedelta`` scalars now implement the :meth:`to_numpy` method as aliases to :meth:`Timestamp.to_datetime64` and :meth:`Timedelta.to_timedelta64`, respectively. (:issue:`24653`) -- +- :meth:`Timestamp.strptime` will now raise a ``NotImplementedError`` (:issue:`25016`) - .. _whatsnew_0250.deprecations: @@ -64,7 +96,8 @@ Performance Improvements - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) -- +- Improved performance of :meth:`Series.searchsorted`. 
The speedup is especially large when the dtype is + int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) .. _whatsnew_0250.bug_fixes: @@ -160,6 +193,7 @@ I/O ^^^ - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) +- Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - - - diff --git a/environment.yml b/environment.yml index 47fe8e4c2a640..ce68dccca0c07 100644 --- a/environment.yml +++ b/environment.yml @@ -20,6 +20,7 @@ dependencies: - isort - moto - pytest>=4.0 + - pytest-mock - sphinx - numpydoc diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index eb511b1adb28a..e86b692e9915e 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -150,9 +150,6 @@ cdef class Interval(IntervalMixin): Left bound for the interval. right : orderable scalar Right bound for the interval. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the interval is closed on the left-side, right-side, both or - neither. closed : {'right', 'left', 'both', 'neither'}, default 'right' Whether the interval is closed on the left-side, right-side, both or neither. See the Notes for more detailed explanation. diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index d7ca7f8963f70..4036af85b7212 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -137,6 +137,7 @@ def lfilter(*args, **kwargs): reload = reload Hashable = collections.abc.Hashable Iterable = collections.abc.Iterable + Iterator = collections.abc.Iterator Mapping = collections.abc.Mapping MutableMapping = collections.abc.MutableMapping Sequence = collections.abc.Sequence @@ -199,6 +200,7 @@ def get_range_parameters(data): Hashable = collections.Hashable Iterable = collections.Iterable + Iterator = collections.Iterator Mapping = collections.Mapping MutableMapping = collections.MutableMapping Sequence = collections.Sequence diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7f5feb54f601a..4a71951e2435e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,7 +19,7 @@ ensure_float64, ensure_int64, ensure_object, ensure_platform_int, ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_datetimelike, is_extension_array_dtype, is_float_dtype, + is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, @@ -1729,6 +1729,89 @@ def func(arr, indexer, out, fill_value=np.nan): return out +# ------------ # +# searchsorted # +# ------------ # + +def searchsorted(arr, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + .. versionadded:: 0.25.0 + + Find the indices into a sorted array `arr` (a) such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `arr` would be preserved. 
+ + Assuming that `arr` is sorted: + + ====== ================================ + `side` returned index `i` satisfies + ====== ================================ + left ``arr[i-1] < value <= arr[i]`` + right ``arr[i-1] <= value < arr[i]`` + ====== ================================ + + Parameters + ---------- + arr : array-like + Input array. If `sorter` is None, then it must be sorted in + ascending order, otherwise `sorter` must be an array of indices + that sort it. + value : array_like + Values to insert into `arr`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `arr`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `arr` into ascending + order. They are typically the result of argsort. + + Returns + ------- + array of ints + Array of insertion points with the same shape as `value`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + """ + if sorter is not None: + sorter = ensure_platform_int(sorter) + + if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and ( + is_integer(value) or is_integer_dtype(value)): + from .arrays.array_ import array + # if `arr` and `value` have different dtypes, `arr` would be + # recast by numpy, causing a slow search. + # Before searching below, we therefore try to give `value` the + # same dtype as `arr`, while guarding against integer overflows. + iinfo = np.iinfo(arr.dtype.type) + value_arr = np.array([value]) if is_scalar(value) else np.array(value) + if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): + # value within bounds, so no overflow, so can convert value dtype + # to dtype of arr + dtype = arr.dtype + else: + dtype = value_arr.dtype + + if is_scalar(value): + value = dtype.type(value) + else: + value = array(value, dtype=dtype) + elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or + is_categorical_dtype(arr)): + from pandas.core.series import Series + # E.g. if `arr` is an array with dtype='datetime64[ns]' + # and `value` is a pd.Timestamp, we may need to convert value + value_ser = Series(value)._values + value = value_ser[0] if is_scalar(value) else value_ser + + result = arr.searchsorted(value, side=side, sorter=sorter) + return result + + # ---- # + # diff # + # ---- # diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7aaefef3d03e5..e770281596134 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -555,17 +555,17 @@ def searchsorted(self, value, side="left", sorter=None): .. versionadded:: 0.24.0 Find the indices into a sorted array `self` (a) such that, if the - corresponding elements in `v` were inserted before the indices, the - order of `self` would be preserved. + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. 
- Assuming that `a` is sorted: + Assuming that `self` is sorted: - ====== ============================ + ====== ================================ `side` returned index `i` satisfies - ====== ============================ - left ``self[i-1] < v <= self[i]`` - right ``self[i-1] <= v < self[i]`` - ====== ============================ + ====== ================================ + left ``self[i-1] < value <= self[i]`` + right ``self[i-1] <= value < self[i]`` + ====== ================================ Parameters ---------- @@ -581,7 +581,7 @@ def searchsorted(self, value, side="left", sorter=None): Returns ------- - indices : array of ints + array of ints Array of insertion points with the same shape as `value`. See Also diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 791ff44303e96..8e2ab586cacb6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -4,6 +4,7 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -12,6 +13,7 @@ from pandas import compat from pandas.core import nanops +from pandas.core.algorithms import searchsorted from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -423,6 +425,11 @@ def to_numpy(self, dtype=None, copy=False): return result + @Appender(ExtensionArray.searchsorted.__doc__) + def searchsorted(self, value, side='left', sorter=None): + return searchsorted(self.to_numpy(), value, + side=side, sorter=sorter) + # ------------------------------------------------------------------------ # Ops diff --git a/pandas/core/base.py b/pandas/core/base.py index 7fdc64a8d9f85..f896596dd5216 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1522,11 +1522,11 @@ def factorize(self, sort=False, na_sentinel=-1): array([3]) """) - @Substitution(klass='IndexOpsMixin') + @Substitution(klass='Index') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - # needs coercion on the key (DatetimeIndex does already) - return self._values.searchsorted(value, side=side, sorter=sorter) + return algorithms.searchsorted(self._values, value, + side=side, sorter=sorter) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index afd8a1586689a..641b1e5f813a9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -33,7 +33,7 @@ from pandas import compat from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u, - PY36, raise_with_traceback, + PY36, raise_with_traceback, Iterator, string_and_binary_types) from pandas.compat.numpy import function as nv from pandas.core.dtypes.cast import ( @@ -4025,7 +4025,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, This parameter can be either a single column key, a single array of the same length as the calling DataFrame, or a list containing an arbitrary combination of column keys and arrays. Here, "array" - encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. + encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and + instances of :class:`abc.Iterator`. drop : bool, default True Delete columns to be used as the new index. 
append : bool, default False @@ -4104,6 +4105,32 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not isinstance(keys, list): keys = [keys] + err_msg = ('The parameter "keys" may be a column key, one-dimensional ' + 'array, or a list containing only valid column keys and ' + 'one-dimensional arrays.') + + missing = [] + for col in keys: + if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray, + list, Iterator)): + # arrays are fine as long as they are one-dimensional + # iterators get converted to list below + if getattr(col, 'ndim', 1) != 1: + raise ValueError(err_msg) + else: + # everything else gets tried as a key; see GH 24969 + try: + found = col in self.columns + except TypeError: + raise TypeError(err_msg + ' Received column of ' + 'type {}'.format(type(col))) + else: + if not found: + missing.append(col) + + if missing: + raise KeyError('None of {} are in the columns'.format(missing)) + if inplace: frame = self else: @@ -4132,6 +4159,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False, elif isinstance(col, (list, np.ndarray)): arrays.append(col) names.append(None) + elif isinstance(col, Iterator): + arrays.append(list(col)) + names.append(None) # from here, col can only be a column label else: arrays.append(frame[col]._values) @@ -4139,6 +4169,15 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if drop: to_remove.append(col) + if len(arrays[-1]) != len(self): + # check newest element against length of calling frame, since + # ensure_index_from_sequences would not raise for append=False. + raise ValueError('Length mismatch: Expected {len_self} rows, ' + 'received array of length {len_col}'.format( + len_self=len(self), + len_col=len(arrays[-1]) + )) + index = ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 07ccf5979e5be..8cd27b81729a3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6656,7 +6656,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, (interpolate). * 'outside': Only fill NaNs outside valid values (extrapolate). - .. versionadded:: 0.21.0 + .. versionadded:: 0.23.0 downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dacbc00b929e5..dee181fc1c569 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6,9 +6,10 @@ import numpy as np from pandas._libs import ( - Timedelta, algos as libalgos, index as libindex, join as libjoin, lib, - tslibs) + algos as libalgos, index as libindex, join as libjoin, lib) from pandas._libs.lib import is_datetime_array +from pandas._libs.tslibs import OutOfBoundsDatetime, Timedelta, Timestamp +from pandas._libs.tslibs.timezones import tz_compare import pandas.compat as compat from pandas.compat import range, set_function_name, u from pandas.compat.numpy import function as nv @@ -447,7 +448,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, try: return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - except tslibs.OutOfBoundsDatetime: + except OutOfBoundsDatetime: pass elif inferred.startswith('timedelta'): @@ -4868,6 +4869,20 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): # If it's a reverse slice, temporarily swap bounds. 
start, end = end, start + # GH 16785: If start and end happen to be date strings with UTC offsets + # attempt to parse and check that the offsets are the same + if (isinstance(start, (compat.string_types, datetime)) + and isinstance(end, (compat.string_types, datetime))): + try: + ts_start = Timestamp(start) + ts_end = Timestamp(end) + except (ValueError, TypeError): + pass + else: + if not tz_compare(ts_start.tzinfo, ts_end.tzinfo): + raise ValueError("Both dates must have the " + "same UTC offset") + start_slice = None if start is not None: start_slice = self.get_slice_bound(start, 'left', kind) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c6d31339f950d..b494c41c3b58c 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -42,20 +42,35 @@ typ='method', overwrite=True) class CategoricalIndex(Index, accessor.PandasDelegate): """ - Immutable Index implementing an ordered, sliceable set. CategoricalIndex - represents a sparsely populated Index with an underlying Categorical. + Index based on an underlying :class:`Categorical`. + + CategoricalIndex, like Categorical, can only take on a limited, + and usually fixed, number of possible values (`categories`). Also, + like Categorical, it might have an order, but numerical operations + (additions, divisions, ...) are not possible. Parameters ---------- - data : array-like or Categorical, (1-dimensional) - categories : optional, array-like - categories for the CategoricalIndex - ordered : boolean, - designating if the categories are ordered - copy : bool - Make a copy of input ndarray - name : object - Name to be stored in the index + data : array-like (1-dimensional) + The values of the categorical. If `categories` are given, values not in + `categories` will be replaced with NaN. + categories : index-like, optional + The categories for the categorical. Items need to be unique. + If the categories are not given here (and also not in `dtype`), they + will be inferred from the `data`. + ordered : bool, optional + Whether or not this categorical is treated as an ordered + categorical. If not given here or in `dtype`, the resulting + categorical will be unordered. + dtype : CategoricalDtype or the string "category", optional + If :class:`CategoricalDtype`, cannot be used together with + `categories` or `ordered`. + + .. versionadded:: 0.21.0 + copy : bool, default False + Make a copy of input ndarray. + name : object, optional + Name to be stored in the index. Attributes ---------- @@ -75,9 +90,45 @@ class CategoricalIndex(Index, accessor.PandasDelegate): as_unordered map + Raises + ------ + ValueError + If the categories do not validate. + TypeError + If an explicit ``ordered=True`` is given but no `categories` and the + `values` are not sortable. + See Also -------- - Categorical, Index + Index : The base pandas Index type. + Categorical : A categorical array. + CategoricalDtype : Type for categorical data. + + Notes + ----- + See the `user guide + `_ + for more. 
+ + Examples + -------- + >>> pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') # noqa + + ``CategoricalIndex`` can also be instantiated from a ``Categorical``: + + >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + >>> pd.CategoricalIndex(c) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') # noqa + + Ordered ``CategoricalIndex`` can have a min and max value. + + >>> ci = pd.CategoricalIndex(['a','b','c','a','b','c'], ordered=True, + ... categories=['c', 'b', 'a']) + >>> ci + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['c', 'b', 'a'], ordered=True, dtype='category') # noqa + >>> ci.min() + 'c' """ _typ = 'categoricalindex' diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1037e2d9a3bd6..b8d052ce7be04 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -32,9 +32,8 @@ from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools -from pandas.tseries import offsets from pandas.tseries.frequencies import Resolution, to_offset -from pandas.tseries.offsets import CDay, prefix_mapping +from pandas.tseries.offsets import CDay, Nano, prefix_mapping def _new_DatetimeIndex(cls, d): @@ -460,7 +459,7 @@ def _formatter_func(self): # -------------------------------------------------------------------- # Set Operation Methods - def union(self, other): + def union(self, other, sort=None): """ Specialized union for DatetimeIndex objects. If combine overlapping ranges with the same DateOffset, will be much @@ -469,15 +468,29 @@ def union(self, other): Parameters ---------- other : DatetimeIndex or array-like + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result + + .. versionadded:: 0.25.0 Returns ------- y : Index or DatetimeIndex """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) if len(other) == 0 or self.equals(other) or len(self) == 0: - return super(DatetimeIndex, self).union(other) + return super(DatetimeIndex, self).union(other, sort=sort) if not isinstance(other, DatetimeIndex): try: @@ -488,9 +501,9 @@ def union(self, other): this, other = self._maybe_utc_convert(other) if this._can_fast_union(other): - return this._fast_union(other) + return this._fast_union(other, sort=sort) else: - result = Index.union(this, other) + result = Index.union(this, other, sort=sort) if isinstance(result, DatetimeIndex): # TODO: we shouldn't be setting attributes like this; # in all the tests this equality already holds @@ -563,16 +576,28 @@ def _can_fast_union(self, other): # this will raise return False - def _fast_union(self, other): + def _fast_union(self, other, sort=None): if len(other) == 0: return self.view(type(self)) if len(self) == 0: return other.view(type(self)) - # to make our life easier, "sort" the two ranges + # Both DTIs are monotonic. 
Check if they are already + # in the "correct" order if self[0] <= other[0]: left, right = self, other + # DTIs are not in the "correct" order and we don't want + # to sort but want to remove overlaps + elif sort is False: + left, right = self, other + left_start = left[0] + loc = right.searchsorted(left_start, side='left') + right_chunk = right.values[:loc] + dates = _concat._concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + # DTIs are not in the "correct" order and we want + # to sort else: left, right = other, self @@ -826,54 +851,57 @@ def _parsed_string_to_bounds(self, reso, parsed): lower, upper: pd.Timestamp """ + valid_resos = {'year', 'month', 'quarter', 'day', 'hour', 'minute', + 'second', 'microsecond'} + if reso not in valid_resos: + raise KeyError if reso == 'year': - return (Timestamp(datetime(parsed.year, 1, 1), tz=self.tz), - Timestamp(datetime(parsed.year, 12, 31, 23, - 59, 59, 999999), tz=self.tz)) + start = Timestamp(parsed.year, 1, 1) + end = Timestamp(parsed.year, 12, 31, 23, 59, 59, 999999) elif reso == 'month': d = ccalendar.get_days_in_month(parsed.year, parsed.month) - return (Timestamp(datetime(parsed.year, parsed.month, 1), - tz=self.tz), - Timestamp(datetime(parsed.year, parsed.month, d, 23, - 59, 59, 999999), tz=self.tz)) + start = Timestamp(parsed.year, parsed.month, 1) + end = Timestamp(parsed.year, parsed.month, d, 23, 59, 59, 999999) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month - return (Timestamp(datetime(parsed.year, parsed.month, 1), - tz=self.tz), - Timestamp(datetime(parsed.year, qe, d, 23, 59, - 59, 999999), tz=self.tz)) + start = Timestamp(parsed.year, parsed.month, 1) + end = Timestamp(parsed.year, qe, d, 23, 59, 59, 999999) elif reso == 'day': - st = datetime(parsed.year, parsed.month, parsed.day) - return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Day(), - tz=self.tz).value - 1)) + start = Timestamp(parsed.year, parsed.month, parsed.day) + end = start + timedelta(days=1) - Nano(1) elif reso == 'hour': - st = datetime(parsed.year, parsed.month, parsed.day, - hour=parsed.hour) - return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Hour(), - tz=self.tz).value - 1)) + start = Timestamp(parsed.year, parsed.month, parsed.day, + parsed.hour) + end = start + timedelta(hours=1) - Nano(1) elif reso == 'minute': - st = datetime(parsed.year, parsed.month, parsed.day, - hour=parsed.hour, minute=parsed.minute) - return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Minute(), - tz=self.tz).value - 1)) + start = Timestamp(parsed.year, parsed.month, parsed.day, + parsed.hour, parsed.minute) + end = start + timedelta(minutes=1) - Nano(1) elif reso == 'second': - st = datetime(parsed.year, parsed.month, parsed.day, - hour=parsed.hour, minute=parsed.minute, - second=parsed.second) - return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Second(), - tz=self.tz).value - 1)) + start = Timestamp(parsed.year, parsed.month, parsed.day, + parsed.hour, parsed.minute, parsed.second) + end = start + timedelta(seconds=1) - Nano(1) elif reso == 'microsecond': - st = datetime(parsed.year, parsed.month, parsed.day, - parsed.hour, parsed.minute, parsed.second, - parsed.microsecond) - return (Timestamp(st, tz=self.tz), Timestamp(st, tz=self.tz)) - else: - raise KeyError + start = Timestamp(parsed.year, parsed.month, parsed.day, + parsed.hour, parsed.minute, 
parsed.second, + parsed.microsecond) + end = start + timedelta(microseconds=1) - Nano(1) + # GH 24076 + # If an incoming date string contained a UTC offset, need to localize + # the parsed date to this offset first before aligning with the index's + # timezone + if parsed.tzinfo is not None: + if self.tz is None: + raise ValueError("The index must be timezone aware " + "when indexing with a date string with a " + "UTC offset") + start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz) + end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz) + elif self.tz is not None: + start = start.tz_localize(self.tz) + end = end.tz_localize(self.tz) + return start, end def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): is_monotonic = self.is_monotonic diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 640587b7f9f31..cb98274962656 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -190,6 +190,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, 'is_sparse', False): pass + elif getattr(self.block, 'is_extension', False): + pass else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) diff --git a/pandas/core/series.py b/pandas/core/series.py index 15c4c736aafb4..cada6663ce651 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2394,12 +2394,8 @@ def __rmatmul__(self, other): @Substitution(klass='Series') @Appender(base._shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - if sorter is not None: - sorter = ensure_platform_int(sorter) - result = self._values.searchsorted(Series(value)._values, - side=side, sorter=sorter) - - return result[0] if is_scalar(value) else result + return algorithms.searchsorted(self._values, value, + side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index c57d27ff03ac6..7d5a7f1a99e41 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -9,10 +9,10 @@ class PerformanceWarning(Warning): """ - Warning raised when there is a possible - performance impact. + Warning raised when there is a possible performance impact. """ + class UnsupportedFunctionCall(ValueError): """ Exception raised when attempting to call a numpy function @@ -20,6 +20,7 @@ class UnsupportedFunctionCall(ValueError): the object e.g. ``np.cumsum(groupby_object)``. """ + class UnsortedIndexError(KeyError): """ Error raised when attempting to get a slice of a MultiIndex, @@ -31,7 +32,15 @@ class UnsortedIndexError(KeyError): class ParserError(ValueError): """ - Exception that is raised by an error encountered in `pd.read_csv`. + Exception that is raised by an error encountered in parsing file contents. + + This is a generic error raised for errors encountered when functions like + `read_csv` or `read_html` are parsing contents of a file. + + See Also + -------- + read_csv : Read CSV (comma-separated) file into a DataFrame. + read_html : Read HTML table into a DataFrame. 
""" @@ -180,4 +189,4 @@ def __str__(self): else: name = self.class_instance.__class__.__name__ msg = "This {methodtype} must be defined in the concrete class {name}" - return (msg.format(methodtype=self.methodtype, name=name)) + return msg.format(methodtype=self.methodtype, name=name) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4bbccc8339d7c..725e2d28ffd67 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -226,7 +226,7 @@ def _write(self, obj, orient, double_precision, ensure_ascii, return serialized -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, lines=False, chunksize=None, compression='infer'): @@ -278,8 +278,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, typ : type of object to recover (series or frame), default 'frame' dtype : boolean or dict, default True - If True, infer dtypes, if a dict of column to dtype, then use those, + If True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all, applies only to the data. + + Not applicable with ``orient='table'``. + + .. versionchanged:: 0.25 + + Not applicable with ``orient='table'``. + convert_axes : boolean, default True Try to convert the axes to the proper dtypes. convert_dates : boolean, default True @@ -408,6 +415,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ + if orient == 'table' and dtype: + raise ValueError("cannot pass both dtype and orient='table'") + + dtype = orient != 'table' if dtype is None else dtype + compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, @@ -600,15 +612,15 @@ class Parser(object): 'us': long(31536000000000), 'ns': long(31536000000000000)} - def __init__(self, json, orient, dtype=True, convert_axes=True, + def __init__(self, json, orient, dtype=None, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False, precise_float=False, date_unit=None): self.json = json if orient is None: orient = self._default_orient - self.orient = orient + self.dtype = dtype if orient == "split": diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 9fea1989e46df..b68ec2bf348b4 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -9,6 +9,7 @@ import pandas as pd from pandas.api.extensions import register_extension_dtype +from pandas.api.types import is_scalar from pandas.core.arrays import PandasArray, integer_array, period_array from pandas.tests.extension.decimal import ( DecimalArray, DecimalDtype, to_decimal) @@ -254,3 +255,51 @@ def test_array_not_registered(registry_without_decimal): result = pd.array(data, dtype=DecimalDtype) expected = DecimalArray._from_sequence(data) tm.assert_equal(result, expected) + + +class TestArrayAnalytics(object): + def test_searchsorted(self, string_dtype): + arr = pd.array(['a', 'b', 'c'], dtype=string_dtype) + + result = arr.searchsorted('a', side='left') + assert is_scalar(result) + assert result == 0 + + result = arr.searchsorted('a', side='right') + assert is_scalar(result) + assert result == 1 + + def 
test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_dtype) + result = arr.searchsorted(30) + assert is_scalar(result) + assert result == 2 + + result = arr.searchsorted([30]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_dtype) + result = arr.searchsorted([2, 30]) + expected = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('arr, val', [ + [pd.date_range('20120101', periods=10, freq='2D'), + pd.Timestamp('20120102')], + [pd.date_range('20120101', periods=10, freq='2D', tz='Asia/Hong_Kong'), + pd.Timestamp('20120102', tz='Asia/Hong_Kong')], + [pd.timedelta_range(start='1 day', end='10 days', periods=10), + pd.Timedelta('2 days')]]) + def test_search_sorted_datetime64_scalar(self, arr, val): + arr = pd.array(arr) + result = arr.searchsorted(val) + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_sorter(self, any_real_dtype): + arr = pd.array([3, 1, 2], dtype=any_real_dtype) + result = arr.searchsorted([0, 3], sorter=np.argsort(arr)) + expected = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 377e737a53158..69ee614ab8d2a 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -29,16 +29,6 @@ def float_frame_with_na(): return df -@pytest.fixture -def float_frame2(): - """ - Fixture for DataFrame of floats with index of unique strings - - Columns are ['D', 'C', 'B', 'A'] - """ - return DataFrame(tm.getSeriesData(), columns=['D', 'C', 'B', 'A']) - - @pytest.fixture def bool_frame_with_na(): """ @@ -104,21 +94,6 @@ def mixed_float_frame(): return df -@pytest.fixture -def mixed_float_frame2(): - """ - Fixture for DataFrame of different float types with index of unique strings - - Columns are ['A', 'B', 'C', 'D']. - """ - df = DataFrame(tm.getSeriesData()) - df.D = df.D.astype('float32') - df.C = df.C.astype('float32') - df.B = df.B.astype('float16') - df.D = df.D.astype('float64') - return df - - @pytest.fixture def mixed_int_frame(): """ @@ -135,19 +110,6 @@ def mixed_int_frame(): return df -@pytest.fixture -def mixed_type_frame(): - """ - Fixture for DataFrame of float/int/string columns with RangeIndex - - Columns are ['a', 'b', 'c', 'float32', 'int32']. - """ - return DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'float32': np.array([1.] 
* 10, dtype='float32'), - 'int32': np.array([1] * 10, dtype='int32')}, - index=np.arange(10)) - - @pytest.fixture def timezone_frame(): """ @@ -173,22 +135,6 @@ def empty_frame(): return DataFrame({}) -@pytest.fixture -def datetime_series(): - """ - Fixture for Series of floats with DatetimeIndex - """ - return tm.makeTimeSeries(nper=30) - - -@pytest.fixture -def datetime_series_short(): - """ - Fixture for Series of floats with DatetimeIndex - """ - return tm.makeTimeSeries(nper=30)[5:] - - @pytest.fixture def simple_frame(): """ diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index cc3687f856b4e..a25e893e08900 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -178,10 +178,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, list, + @pytest.mark.parametrize('box2', [Series, Index, np.array, list, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, list, + @pytest.mark.parametrize('box1', [Series, Index, np.array, list, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('append, index_name', [(True, None), @@ -195,6 +195,9 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, keys = [box1(df['A']), box2(df['A'])] result = df.set_index(keys, drop=drop, append=append) + # if either box is iter, it has been consumed; re-read + keys = [box1(df['A']), box2(df['A'])] + # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; # use "is" because == would give ambiguous Boolean error for containers @@ -253,25 +256,48 @@ def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): df.set_index(['A', df['A'], tuple(df['A'])], drop=drop, append=append) - @pytest.mark.xfail(reason='broken due to revert, see GH 25085') @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) - @pytest.mark.parametrize('box', [set, iter, lambda x: (y for y in x)], - ids=['set', 'iter', 'generator']) + @pytest.mark.parametrize('box', [set], ids=['set']) def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append): df = frame_of_index_cols msg = 'The parameter "keys" may be a column key, .*' - # forbidden type, e.g. set/iter/generator + # forbidden type, e.g. set with pytest.raises(TypeError, match=msg): df.set_index(box(df['A']), drop=drop, append=append) - # forbidden type in list, e.g. set/iter/generator + # forbidden type in list, e.g. 
set with pytest.raises(TypeError, match=msg): df.set_index(['A', df['A'], box(df['A'])], drop=drop, append=append) + # MultiIndex constructor does not work directly on Series -> lambda + @pytest.mark.parametrize('box', [Series, Index, np.array, iter, + lambda x: MultiIndex.from_arrays([x])], + ids=['Series', 'Index', 'np.array', + 'iter', 'MultiIndex']) + @pytest.mark.parametrize('length', [4, 6], ids=['too_short', 'too_long']) + @pytest.mark.parametrize('append', [True, False]) + @pytest.mark.parametrize('drop', [True, False]) + def test_set_index_raise_on_len(self, frame_of_index_cols, box, length, + drop, append): + # GH 24984 + df = frame_of_index_cols # has length 5 + + values = np.random.randint(0, 10, (length,)) + + msg = 'Length mismatch: Expected 5 rows, received array of length.*' + + # wrong length directly + with pytest.raises(ValueError, match=msg): + df.set_index(box(values), drop=drop, append=append) + + # wrong length in list + with pytest.raises(ValueError, match=msg): + df.set_index(['A', df.A, box(values)], drop=drop, append=append) + def test_set_index_custom_label_type(self): # GH 24969 @@ -341,7 +367,7 @@ def __repr__(self): # missing key thing3 = Thing(['Three', 'pink']) - msg = '.*' # due to revert, see GH 25085 + msg = r"frozenset\(\{'Three', 'pink'\}\)" with pytest.raises(KeyError, match=msg): # missing label directly df.set_index(thing3) @@ -366,7 +392,7 @@ def __str__(self): thing2 = Thing('Two', 'blue') df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2]) - msg = 'unhashable type.*' + msg = 'The parameter "keys" may be a column key, .*' with pytest.raises(TypeError, match=msg): # use custom label directly diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 2e690ebbfa121..43a45bb915819 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas.compat import PY35, lrange +from pandas.compat import PY2, PY35, is_platform_windows, lrange import pandas.util._test_decorators as td import pandas as pd @@ -1842,6 +1842,17 @@ def test_numpy_round(self): with pytest.raises(ValueError, match=msg): np.round(df, decimals=0, out=df) + @pytest.mark.xfail( + PY2 and is_platform_windows(), reason="numpy/numpy#7882", + raises=AssertionError, strict=True) + def test_numpy_round_nan(self): + # See gh-14197 + df = Series([1.53, np.nan, 0.06]).to_frame() + with tm.assert_produces_warning(None): + result = df.round() + expected = Series([2., np.nan, 0.]).to_frame() + tm.assert_frame_equal(result, expected) + def test_round_mixed_type(self): # GH 11885 df = DataFrame({'col1': [1.1, 2.2, 3.3, 4.4], diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index 10c42e0d1a1cf..6bb9dea15d1ce 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -310,6 +310,7 @@ def test_rank_pct_true(self, method, exp): tm.assert_frame_equal(result, expected) @pytest.mark.single + @pytest.mark.high_memory def test_pct_max_many_rows(self): # GH 18271 df = DataFrame({'A': np.arange(2**24 + 1), diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index e1ba0e1708442..a3ee5fe39769f 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -102,7 +102,7 @@ def test_stringified_slice_with_tz(self): # GH#2658 import datetime start = datetime.datetime.now() - idx = date_range(start=start, freq="1d", 
periods=10) + idx = date_range(start=start, freq="1d", periods=10, tz='US/Eastern') df = DataFrame(lrange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index a0c9d9f02385c..64693324521b3 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -396,3 +396,30 @@ def test_selection_by_datetimelike(self, datetimelike, op, expected): result = op(df.A, datetimelike) expected = Series(expected, name='A') tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('start', [ + '2018-12-02 21:50:00+00:00', pd.Timestamp('2018-12-02 21:50:00+00:00'), + pd.Timestamp('2018-12-02 21:50:00+00:00').to_pydatetime() + ]) + @pytest.mark.parametrize('end', [ + '2018-12-02 21:52:00+00:00', pd.Timestamp('2018-12-02 21:52:00+00:00'), + pd.Timestamp('2018-12-02 21:52:00+00:00').to_pydatetime() + ]) + def test_getitem_with_datestring_with_UTC_offset(self, start, end): + # GH 24076 + idx = pd.date_range(start='2018-12-02 14:50:00-07:00', + end='2018-12-02 14:50:00-07:00', freq='1min') + df = pd.DataFrame(1, index=idx, columns=['A']) + result = df[start:end] + expected = df.iloc[0:3, :] + tm.assert_frame_equal(result, expected) + + # GH 16785 + start = str(start) + end = str(end) + with pytest.raises(ValueError, match="Both dates must"): + df[start:end[:-4] + '1:00'] + + with pytest.raises(ValueError, match="The index must be timezone"): + df = df.tz_localize(None) + df[start:end] diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 19009e45ee83a..cf1f75234ec62 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -21,83 +21,107 @@ class TestDatetimeIndexSetOps(object): 'dateutil/US/Pacific'] # TODO: moved from test_datetimelike; dedup with version below - def test_union2(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union2(self, sort): everything = tm.makeDateIndex(10) first = everything[:5] second = everything[5:] - union = first.union(second) - assert tm.equalContents(union, everything) + union = first.union(second, sort=sort) + tm.assert_index_equal(union, everything) # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - result = first.union(case) - assert tm.equalContents(result, everything) + result = first.union(case, sort=sort) + tm.assert_index_equal(result, everything) @pytest.mark.parametrize("tz", tz) - def test_union(self, tz): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, tz, sort): rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) + expected1_notsorted = pd.DatetimeIndex(list(other1) + list(rng1)) rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) + expected2_notsorted = pd.DatetimeIndex(list(other2) + list(rng2[:3])) rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) other3 = pd.DatetimeIndex([], tz=tz) expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + expected3_notsorted = rng3 - for rng, other, expected in [(rng1, other1, expected1), - (rng2, 
other2, expected2), - (rng3, other3, expected3)]: + for rng, other, exp, exp_notsorted in [(rng1, other1, expected1, + expected1_notsorted), + (rng2, other2, expected2, + expected2_notsorted), + (rng3, other3, expected3, + expected3_notsorted)]: - result_union = rng.union(other) - tm.assert_index_equal(result_union, expected) + result_union = rng.union(other, sort=sort) + tm.assert_index_equal(result_union, exp) - def test_union_coverage(self): + result_union = other.union(rng, sort=sort) + if sort is None: + tm.assert_index_equal(result_union, exp) + else: + tm.assert_index_equal(result_union, exp_notsorted) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_coverage(self, sort): idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) ordered = DatetimeIndex(idx.sort_values(), freq='infer') - result = ordered.union(idx) + result = ordered.union(idx, sort=sort) tm.assert_index_equal(result, ordered) - result = ordered[:0].union(ordered) + result = ordered[:0].union(ordered, sort=sort) tm.assert_index_equal(result, ordered) assert result.freq == ordered.freq - def test_union_bug_1730(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_1730(self, sort): rng_a = date_range('1/1/2012', periods=4, freq='3H') rng_b = date_range('1/1/2012', periods=4, freq='4H') - result = rng_a.union(rng_b) + result = rng_a.union(rng_b, sort=sort) exp = DatetimeIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) tm.assert_index_equal(result, exp) - def test_union_bug_1745(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_1745(self, sort): left = DatetimeIndex(['2012-05-11 15:19:49.695000']) right = DatetimeIndex(['2012-05-29 13:04:21.322000', '2012-05-11 15:27:24.873000', '2012-05-11 15:31:05.350000']) - result = left.union(right) - exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) + result = left.union(right, sort=sort) + exp = DatetimeIndex(['2012-05-11 15:19:49.695000', + '2012-05-29 13:04:21.322000', + '2012-05-11 15:27:24.873000', + '2012-05-11 15:31:05.350000']) + if sort is None: + exp = exp.sort_values() tm.assert_index_equal(result, exp) - def test_union_bug_4564(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_4564(self, sort): from pandas import DateOffset left = date_range("2013-01-01", "2013-02-01") right = left + DateOffset(minutes=15) - result = left.union(right) + result = left.union(right, sort=sort) exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) tm.assert_index_equal(result, exp) - def test_union_freq_both_none(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_freq_both_none(self, sort): # GH11086 expected = bdate_range('20150101', periods=10) expected.freq = None - result = expected.union(expected) + result = expected.union(expected, sort=sort) tm.assert_index_equal(result, expected) assert result.freq is None @@ -112,11 +136,14 @@ def test_union_dataframe_index(self): exp = pd.date_range('1/1/1980', '1/1/2012', freq='MS') tm.assert_index_equal(df.index, exp) - def test_union_with_DatetimeIndex(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_with_DatetimeIndex(self, sort): i1 = Int64Index(np.arange(0, 20, 2)) i2 = date_range(start='2012-01-03 00:00:00', periods=10, freq='D') - i1.union(i2) # Works - i2.union(i1) # Fails with "AttributeError: can't set attribute" + # Works + i1.union(i2, sort=sort) + # Fails with "AttributeError: can't set attribute" + i2.union(i1, sort=sort) # TODO: moved from 
test_datetimelike; de-duplicate with version below def test_intersection2(self): @@ -262,11 +289,12 @@ def test_datetimeindex_diff(self, sort): periods=98) assert len(dti1.difference(dti2, sort)) == 2 - def test_datetimeindex_union_join_empty(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_datetimeindex_union_join_empty(self, sort): dti = date_range(start='1/1/2001', end='2/1/2001', freq='D') empty = Index([]) - result = dti.union(empty) + result = dti.union(empty, sort=sort) assert isinstance(result, DatetimeIndex) assert result is result @@ -287,35 +315,40 @@ class TestBusinessDatetimeIndex(object): def setup_method(self, method): self.rng = bdate_range(START, END) - def test_union(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, sort): # overlapping left = self.rng[:10] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, Index) # non-overlapping, no gap left = self.rng[:5] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # order does not matter - tm.assert_index_equal(right.union(left), the_union) + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) + else: + expected = pd.DatetimeIndex(list(right) + list(left)) + tm.assert_index_equal(right.union(left, sort=sort), expected) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_union = self.rng.union(rng) + the_union = self.rng.union(rng, sort=sort) assert isinstance(the_union, DatetimeIndex) def test_outer_join(self): @@ -350,16 +383,21 @@ def test_outer_join(self): assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None - def test_union_not_cacheable(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_not_cacheable(self, sort): rng = date_range('1/1/2000', periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] - the_union = rng1.union(rng2) - tm.assert_index_equal(the_union, rng) + the_union = rng1.union(rng2, sort=sort) + if sort is None: + tm.assert_index_equal(the_union, rng) + else: + expected = pd.DatetimeIndex(list(rng[10:]) + list(rng[:10])) + tm.assert_index_equal(the_union, expected) rng1 = rng[10:] rng2 = rng[15:35] - the_union = rng1.union(rng2) + the_union = rng1.union(rng2, sort=sort) expected = rng[10:] tm.assert_index_equal(the_union, expected) @@ -388,7 +426,8 @@ def test_intersection_bug(self): result = a.intersection(b) tm.assert_index_equal(result, b) - def test_month_range_union_tz_pytz(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_month_range_union_tz_pytz(self, sort): from pytz import timezone tz = timezone('US/Eastern') @@ -403,10 +442,11 @@ def test_month_range_union_tz_pytz(self): late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) - early_dr.union(late_dr) + early_dr.union(late_dr, sort=sort) @td.skip_if_windows_python_3 - def test_month_range_union_tz_dateutil(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_month_range_union_tz_dateutil(self, sort): from pandas._libs.tslibs.timezones import dateutil_gettz tz = dateutil_gettz('US/Eastern') @@ -421,7 +461,7 @@ def test_month_range_union_tz_dateutil(self): late_dr = 
date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) - early_dr.union(late_dr) + early_dr.union(late_dr, sort=sort) class TestCustomDatetimeIndex(object): @@ -429,35 +469,37 @@ class TestCustomDatetimeIndex(object): def setup_method(self, method): self.rng = bdate_range(START, END, freq='C') - def test_union(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, sort): # overlapping left = self.rng[:10] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_union = left.union(right) + the_union = left.union(right, sort) assert isinstance(the_union, Index) # non-overlapping, no gap left = self.rng[:5] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # order does not matter - tm.assert_index_equal(right.union(left), the_union) + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_union = self.rng.union(rng) + the_union = self.rng.union(rng, sort=sort) assert isinstance(the_union, DatetimeIndex) def test_outer_join(self): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 6fa3b5b3b2ed4..3002d1dfb5f8a 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -502,12 +502,12 @@ class TestTableOrientReader(object): @pytest.mark.parametrize("vals", [ {'ints': [1, 2, 3, 4]}, {'objects': ['a', 'b', 'c', 'd']}, + {'objects': ['1', '2', '3', '4']}, {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)}, {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))}, {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'], ordered=True))}, - pytest.param({'floats': [1., 2., 3., 4.]}, - marks=pytest.mark.xfail), + {'floats': [1., 2., 3., 4.]}, {'floats': [1.1, 2.2, 3.3, 4.4]}, {'bools': [True, False, False, True]}]) def test_read_json_table_orient(self, index_nm, vals, recwarn): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0ffc8c978a228..fecd0f0572757 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1202,6 +1202,21 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after + def test_from_json_to_json_table_dtypes(self): + # GH21345 + expected = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) + dfjson = expected.to_json(orient='table') + result = pd.read_json(dfjson, orient='table') + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('dtype', [True, {'b': int, 'c': int}]) + def test_read_json_table_dtype_raises(self, dtype): + # GH21345 + df = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) + dfjson = df.to_json(orient='table') + with pytest.raises(ValueError): + pd.read_json(dfjson, orient='table', dtype=dtype) + @pytest.mark.parametrize('data, expected', [ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']), {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}), diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 25487ccc76e62..7a97368504fd6 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -39,6 
+39,54 @@ def get_test_data(ngroups=NGROUPS, n=N): return arr +def get_series(): + return [ + pd.Series([1], dtype='int64'), + pd.Series([1], dtype='Int64'), + pd.Series([1.23]), + pd.Series(['foo']), + pd.Series([True]), + pd.Series([pd.Timestamp('2018-01-01')]), + pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), + ] + + +def get_series_na(): + return [ + pd.Series([np.nan], dtype='Int64'), + pd.Series([np.nan], dtype='float'), + pd.Series([np.nan], dtype='object'), + pd.Series([pd.NaT]), + ] + + +@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) +def series_of_dtype(request): + """ + A parametrized fixture returning a variety of Series of different + dtypes + """ + return request.param + + +@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) +def series_of_dtype2(request): + """ + A duplicate of the series_of_dtype fixture, so that it can be used + twice by a single function + """ + return request.param + + +@pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name) +def series_of_dtype_all_na(request): + """ + A parametrized fixture returning a variety of Series with all NA + values + """ + return request.param + + class TestMerge(object): def setup_method(self, method): @@ -428,6 +476,36 @@ def check2(exp, kwarg): check1(exp_in, kwarg) check2(exp_out, kwarg) + def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): + # GH 25183 + df = pd.DataFrame({'key': series_of_dtype, 'value': series_of_dtype2}, + columns=['key', 'value']) + df_empty = df[:0] + expected = pd.DataFrame({ + 'value_x': pd.Series(dtype=df.dtypes['value']), + 'key': pd.Series(dtype=df.dtypes['key']), + 'value_y': pd.Series(dtype=df.dtypes['value']), + }, columns=['value_x', 'key', 'value_y']) + actual = df_empty.merge(df, on='key') + assert_frame_equal(actual, expected) + + def test_merge_all_na_column(self, series_of_dtype, + series_of_dtype_all_na): + # GH 25183 + df_left = pd.DataFrame( + {'key': series_of_dtype, 'value': series_of_dtype_all_na}, + columns=['key', 'value']) + df_right = pd.DataFrame( + {'key': series_of_dtype, 'value': series_of_dtype_all_na}, + columns=['key', 'value']) + expected = pd.DataFrame({ + 'key': series_of_dtype, + 'value_x': series_of_dtype_all_na, + 'value_y': series_of_dtype_all_na, + }, columns=['key', 'value_x', 'value_y']) + actual = df_left.merge(df_right, on='key') + assert_frame_equal(actual, expected) + def test_merge_nosort(self): # #2098, anything to do? 
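The merge fixtures and tests added above pin down GH 25183: merging against an empty frame must preserve column dtypes instead of upcasting to object. A minimal sketch of that behavior, assuming a pandas build that includes this fix (the frame below is illustrative, not taken from the test suite):

import pandas as pd

df = pd.DataFrame({
    'key': pd.Series([pd.Timestamp('2018-01-01')]),
    'value': pd.Series([1], dtype='Int64'),
})

# Zero rows in the left frame, yet 'key' stays datetime64[ns] and both
# value columns keep the nullable Int64 dtype rather than decaying to object.
result = df[:0].merge(df, on='key')
print(result.dtypes)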
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6811e370726b2..1f265d574da15 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -9,7 +9,7 @@ from numpy import nan import pytest -from pandas.compat import PY35, lrange, range +from pandas.compat import PY2, PY35, is_platform_windows, lrange, range import pandas.util._test_decorators as td import pandas as pd @@ -285,6 +285,17 @@ def test_numpy_round(self): with pytest.raises(ValueError, match=msg): np.round(s, decimals=0, out=s) + @pytest.mark.xfail( + PY2 and is_platform_windows(), reason="numpy/numpy#7882", + raises=AssertionError, strict=True) + def test_numpy_round_nan(self): + # See gh-14197 + s = Series([1.53, np.nan, 0.06]) + with tm.assert_produces_warning(None): + result = s.round() + expected = Series([2., np.nan, 0.]) + assert_series_equal(result, expected) + def test_built_in_round(self): if not compat.PY3: pytest.skip( diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 510a51e002918..dfcda889269ee 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -499,6 +499,7 @@ def test_rank_first_pct(dtype, ser, exp): @pytest.mark.single +@pytest.mark.high_memory def test_pct_max_many_rows(): # GH 18271 s = Series(np.arange(2**24 + 1)) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 888cf78a1c66a..cb7426ce2f7c9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1484,6 +1484,7 @@ def test_too_many_ndims(self): algos.rank(arr) @pytest.mark.single + @pytest.mark.high_memory @pytest.mark.parametrize('values', [ np.arange(2**24 + 1), np.arange(2**25 + 2).reshape(2**24 + 1, 2)], diff --git a/requirements-dev.txt b/requirements-dev.txt index 76aaeefa648f4..22c01ebcef7f0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ hypothesis>=3.82 isort moto pytest>=4.0 +pytest-mock sphinx numpydoc beautifulsoup4>=4.2.1 diff --git a/setup.cfg b/setup.cfg index b15c3ce8a110a..956aa23839e73 100644 --- a/setup.cfg +++ b/setup.cfg @@ -152,3 +152,23 @@ skip= asv_bench/benchmarks/dtypes.py asv_bench/benchmarks/strings.py asv_bench/benchmarks/period.py + pandas/__init__.py + pandas/plotting/__init__.py + pandas/tests/extension/decimal/__init__.py + pandas/tests/extension/base/__init__.py + pandas/io/msgpack/__init__.py + pandas/io/json/__init__.py + pandas/io/clipboard/__init__.py + pandas/io/excel/__init__.py + pandas/compat/__init__.py + pandas/compat/numpy/__init__.py + pandas/core/arrays/__init__.py + pandas/core/groupby/__init__.py + pandas/core/internals/__init__.py + pandas/api/__init__.py + pandas/api/extensions/__init__.py + pandas/api/types/__init__.py + pandas/_libs/__init__.py + pandas/_libs/tslibs/__init__.py + pandas/util/__init__.py + pandas/arrays/__init__.py
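For reference, the setops tests parametrized above exercise the sort keyword that Index.union gains in this release. A sketch of the two modes, reusing the inputs of test_union_bug_1745 and assuming a pandas version where the sort argument is available:

import pandas as pd

left = pd.DatetimeIndex(['2012-05-11 15:19:49.695000'])
right = pd.DatetimeIndex(['2012-05-29 13:04:21.322000',
                          '2012-05-11 15:27:24.873000',
                          '2012-05-11 15:31:05.350000'])

# sort=None (the default) returns the union in sorted order,
print(left.union(right, sort=None))
# while sort=False keeps the left values followed by the right values in
# their original order, exactly as the test's expected index spells out.
print(left.union(right, sort=False))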
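Similarly, the read_json tests added above (GH 21345) check that a table-orient round trip restores dtypes from the embedded schema, and that an explicit dtype, which could contradict that schema, is rejected. A sketch under those tests' assumptions:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']})
payload = df.to_json(orient='table')

# Dtypes come back from the schema: int64, float64, object.
print(pd.read_json(payload, orient='table').dtypes)

# Passing dtype together with orient='table' now raises, since the
# schema already fixes the dtypes.
try:
    pd.read_json(payload, orient='table', dtype={'b': int})
except ValueError as exc:
    print('rejected:', exc)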
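Finally, test_numpy_round_nan above (gh-14197) covers rounding through missing values. A sketch, assuming a platform unaffected by numpy/numpy#7882 (the test is xfailed on Windows under Python 2):

import numpy as np
import pandas as pd

# Numeric entries are rounded, NaN passes through, and no warning is emitted.
s = pd.Series([1.53, np.nan, 0.06])
print(s.round())  # 2.0, NaN, 0.0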