From b7069efa4c1a074a7adcb37ed810774bae94ef5e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Feb 2018 15:29:55 -0600 Subject: [PATCH 001/119] ENH: non-interval changes --- pandas/core/arrays/base.py | 14 +- pandas/core/dtypes/missing.py | 16 +- pandas/core/frame.py | 20 +- pandas/core/indexes/base.py | 34 ++- pandas/core/indexes/category.py | 3 + pandas/core/indexes/datetimes.py | 7 + pandas/core/internals.py | 29 +- pandas/core/series.py | 31 ++- pandas/tests/extension_arrays/__init__.py | 0 pandas/tests/extension_arrays/base.py | 312 ++++++++++++++++++++++ pandas/tests/indexes/datetimelike.py | 11 + pandas/tests/indexes/test_base.py | 5 + pandas/tests/indexes/test_category.py | 5 + 13 files changed, 447 insertions(+), 40 deletions(-) create mode 100644 pandas/tests/extension_arrays/__init__.py create mode 100644 pandas/tests/extension_arrays/base.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1556b653819a6..8d44c5133f740 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1,4 +1,6 @@ """An interface for extending pandas with custom arrays.""" +import numpy as np + from pandas.errors import AbstractMethodError _not_implemented_message = "{} does not implement {}." @@ -24,7 +26,6 @@ class ExtensionArray(object): * take * copy * _formatting_values - * _concat_same_type Some additional methods are required to satisfy pandas' internal, private block API. @@ -51,9 +52,6 @@ class ExtensionArray(object): Extension arrays should be able to be constructed with instances of the class, i.e. ``ExtensionArray(extension_array)`` should return an instance, not error. - - Additionally, certain methods and interfaces are required for proper - this array to be properly stored inside a ``DataFrame`` or ``Series``. """ # ------------------------------------------------------------------------ # Must be a Sequence @@ -177,9 +175,9 @@ def take(self, indexer, allow_fill=True, fill_value=None): Examples -------- - Suppose the extension array somehow backed by a NumPy structured array - and that the underlying structured array is stored as ``self.data``. - Then ``take`` may be written as + Suppose the extension array somehow backed by a NumPy array and that + the underlying structured array is stored as ``self.data``. Then + ``take`` may be written as .. code-block:: python @@ -219,7 +217,7 @@ def _formatting_values(self): # type: () -> np.ndarray # At the moment, this has to be an array since we use result.dtype """An array of values to be printed in, e.g. 
the Series repr""" - raise AbstractMethodError(self) + raise np.array(self) @classmethod def _concat_same_type(cls, to_concat): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ffac702476af1..4ed6ddec00289 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -13,6 +13,7 @@ is_complex_dtype, is_categorical_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, + is_extension_array_dtype, needs_i8_conversion, _ensure_object, pandas_dtype, is_scalar, @@ -52,12 +53,15 @@ def isna(obj): def _isna_new(obj): + from ..arrays import ExtensionArray + if is_scalar(obj): return libmissing.checknull(obj) # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") - elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, + ExtensionArray)): return _isna_ndarraylike(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isna(func=isna)) @@ -124,11 +128,14 @@ def _use_inf_as_na(key): def _isna_ndarraylike(obj): + from ..arrays import ExtensionArray values = getattr(obj, 'values', obj) dtype = values.dtype - if is_string_dtype(dtype): + if isinstance(values, ExtensionArray): + result = values.isna() + elif is_string_dtype(dtype): if is_categorical_dtype(values): from pandas import Categorical if not isinstance(values, Categorical): @@ -406,4 +413,7 @@ def remove_na_arraylike(arr): """ Return array-like containing only true/non-NaN values, possibly empty. """ - return arr[notna(lib.values_from_object(arr))] + if is_extension_array_dtype(arr): + return arr[notna(arr)] + else: + return arr[notna(lib.values_from_object(arr))] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 96d28581cfdd9..ea3e7b33fb5b6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -39,6 +39,7 @@ is_categorical_dtype, is_object_dtype, is_extension_type, + is_extension_array_dtype, is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, @@ -71,7 +72,7 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series -from pandas.core.arrays import Categorical +from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.algorithms as algorithms from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -511,7 +512,7 @@ def _get_axes(N, K, index=index, columns=columns): index, columns = _get_axes(len(values), 1) return _arrays_to_mgr([values], columns, index, columns, dtype=dtype) - elif is_datetimetz(values): + elif (is_datetimetz(values) or is_extension_array_dtype(values)): # GH19157 if columns is None: columns = [0] @@ -2796,7 +2797,7 @@ def reindexer(value): # now align rows value = reindexer(value).T - elif isinstance(value, Categorical): + elif isinstance(value, ExtensionArray): value = value.copy() elif isinstance(value, Index) or is_sequence(value): @@ -2804,7 +2805,7 @@ def reindexer(value): # turn me into an ndarray value = _sanitize_index(value, self.index, copy=False) - if not isinstance(value, (np.ndarray, Index)): + if not isinstance(value, (np.ndarray, Index, ExtensionArray)): if isinstance(value, list) and len(value) > 0: value = maybe_convert_platform(value) else: @@ -2826,7 +2827,7 @@ def reindexer(value): value = maybe_cast_to_datetime(value, value.dtype) # return internal types directly - if is_extension_type(value): + 
if is_extension_type(value) or is_extension_array_dtype(value): return value # broadcast across multiple columns if necessary @@ -3355,12 +3356,9 @@ class max type new_obj = self.copy() def _maybe_casted_values(index, labels=None): - if isinstance(index, PeriodIndex): - values = index.astype(object).values - elif isinstance(index, DatetimeIndex) and index.tz is not None: - values = index - else: - values = index.values + values = index._as_best_array() + # TODO: Check if nescessary... + if not isinstance(index, (PeriodIndex, DatetimeIndex)): if values.dtype == np.object_: values = lib.maybe_convert_objects(values) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 626f3dc86556a..37a408a8f6c11 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -13,6 +13,7 @@ from pandas import compat from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.generic import ( ABCSeries, ABCDataFrame, ABCMultiIndex, @@ -1038,6 +1039,31 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.values.copy() + def _as_best_array(self): + # type: () -> Union[ExtensionArray, ndarary] + """Return the underlying values as the best array type. + + Indexes backed by ExtensionArrays will return the ExtensionArray. + Otherwise, an ndarray is returned. + + Examples + -------- + >>> pd.Index([0, 1, 2])._as_best_array() + array([0, 1, 2]) + + >>> pd.CategoricalIndex(['a', 'a', 'b'])._as_best_array() + [a, a, b] + Categories (2, object): [a, b] + + >>> pd.IntervalIndex.from_breaks([0, 1, 2])._as_best_array() + IntervalArray([(0, 1], (1, 2]]) + """ + # We need this since CategoricalIndex.values -> Categorical + # but IntervalIndex.values -> ndarray[object] + # TODO: IntervalIndex defines _array_values. Would be nice to + # have an unambiguous way of getting an ndarray (or just use asarray?) + return self.values + _index_shared_docs['astype'] = """ Create an Index with values cast to dtypes. The class of a new Index is determined by dtype. When conversion is impossible, a ValueError @@ -1946,6 +1972,12 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): if is_categorical_dtype(values.dtype): values = np.array(values) + + elif isinstance(values, ExtensionArray): + # This is still un-exercised within pandas, since all our + # extension dtypes have custom indexes. + values = values._formatting_values() + elif is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) @@ -2525,7 +2557,7 @@ def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. 
DatetimeIndex s = getattr(series, '_values', None) - if isinstance(s, Index) and is_scalar(key): + if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): try: return s[key] except (IndexError, ValueError): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2c7be2b21f959..90541c58b2ef9 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -297,6 +297,9 @@ def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() + def _as_best_array(self): + return self._data + def tolist(self): return self._data.tolist() diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e09fa87477122..2ba010aeba467 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1034,6 +1034,13 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.values.copy() + def _as_best_array(self): + # no-tz -> ndarray + # tz -> DatetimeIndex (for now) + if self.tz is not None: + return self + return self.values + def to_pydatetime(self): """ Return DatetimeIndex as object ndarray of datetime.datetime objects diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f553e1a02c9d6..364a3b2ae027e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -56,7 +56,10 @@ is_null_datelike_scalar) import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex +from pandas.core.dtypes.generic import ( + ABCSeries, + ABCDatetimeIndex, + ABCIndexClass) import pandas.core.common as com import pandas.core.algorithms as algos @@ -1854,6 +1857,20 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): ExtensionArrays are limited to 1-D. """ + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super().__init__(values, placement, ndim) + + def _maybe_coerce_values(self, values): + # Unboxes Series / Index + # Doesn't change any underlying dtypes. + if isinstance(values, ABCSeries): + values = values.values + elif isinstance(values, ABCIndexClass): + values = values._as_best_array() + return values + @property def _holder(self): # For extension blocks, the holder is values-dependent. @@ -4101,7 +4118,8 @@ def set(self, item, value, check=False): # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical - value_is_extension_type = is_extension_type(value) + value_is_extension_type = (is_extension_type(value) or + is_extension_array_dtype(value)) # categorical/spares/datetimetz if value_is_extension_type: @@ -4834,13 +4852,10 @@ def form_blocks(arrays, names, axes): if len(items_dict['ExtensionBlock']): external_blocks = [] + for i, _, array in items_dict['ExtensionBlock']: - if isinstance(array, ABCSeries): - array = array.values - # Allow our internal arrays to chose their block type. 
- block_type = getattr(array, '_block_type', ExtensionBlock) external_blocks.append( - make_block(array, klass=block_type, + make_block(array, klass=ExtensionBlock, fastpath=True, placement=[i])) blocks.extend(external_blocks) diff --git a/pandas/core/series.py b/pandas/core/series.py index 78b4c3a70a519..6bd6bfc1c8ae9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -14,6 +14,7 @@ import numpy.ma as ma from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.common import ( is_categorical_dtype, is_bool, @@ -173,12 +174,15 @@ def __init__(self, data=None, index=None, dtype=None, name=None, raise NotImplementedError("initializing a Series from a " "MultiIndex is not supported") elif isinstance(data, Index): - # need to copy to avoid aliasing issues if name is None: name = data.name - data = data._to_embed(keep_tz=True, dtype=dtype) - copy = False + if dtype is not None: + data = data.astype(dtype) + + # need to copy to avoid aliasing issues + data = data._as_best_array().copy() + elif isinstance(data, np.ndarray): pass elif isinstance(data, Series): @@ -234,6 +238,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None, copy=copy) elif copy: data = data.copy() + elif isinstance(data, ExtensionArray): + if copy: + data = data.copy() + data = SingleBlockManager(data, index, fastpath=True) else: data = _sanitize_array(data, index, dtype, copy, raise_cast_failure=True) @@ -2570,7 +2578,11 @@ def _reindex_indexer(self, new_index, indexer, copy): return self # be subclass-friendly - new_values = algorithms.take_1d(self.get_values(), indexer) + if isinstance(self.values, ExtensionArray): + new_values = self.values.take(indexer) + else: + new_values = algorithms.take_1d(self.get_values(), indexer) + return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): @@ -3117,11 +3129,8 @@ def _sanitize_index(data, index, copy=False): raise ValueError('Length of values does not match length of ' 'index') if isinstance(data, ABCIndexClass) and not copy: - pass - elif isinstance(data, PeriodIndex): - data = data.astype(object).values - elif isinstance(data, DatetimeIndex): - data = data._to_embed(keep_tz=True) + data = data._as_best_array() + elif isinstance(data, np.ndarray): # coerce datetimelike types @@ -3194,11 +3203,12 @@ def _try_cast(arr, take_fast_path): # we will try to copy be-definition here subarr = _try_cast(data, True) - elif isinstance(data, Categorical): + elif isinstance(data, ExtensionArray): subarr = data if copy: subarr = data.copy() + # XXX: This is the only early return. See if it can be avoided. 
return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: @@ -3221,6 +3231,7 @@ def _try_cast(arr, take_fast_path): start, stop, step = get_range_parameters(data) arr = np.arange(start, stop, step, dtype='int64') subarr = _try_cast(arr, False) + else: subarr = _try_cast(data, False) diff --git a/pandas/tests/extension_arrays/__init__.py b/pandas/tests/extension_arrays/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py new file mode 100644 index 0000000000000..7e91ea661721e --- /dev/null +++ b/pandas/tests/extension_arrays/base.py @@ -0,0 +1,312 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.compat import StringIO +from pandas.core.internals import ExtensionBlock +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + + +class BaseDtypeTests(object): + """Base class for ExtensionDtype classes""" + + @pytest.fixture + def dtype(self): + """A fixture providing the ExtensionDtype to validate.""" + raise NotImplementedError + + def test_name(self, dtype): + assert isinstance(dtype.name, str) + + def test_kind(self, dtype): + valid = set('biufcmMOSUV') + if dtype.kind is not None: + assert dtype.kind in valid + + def test_construct_from_string_own_name(self, dtype): + result = dtype.construct_from_string(dtype.name) + assert type(result) is type(dtype) + + # check OK as classmethod + result = type(dtype).construct_from_string(dtype.name) + assert type(result) is type(dtype) + + def test_is_dtype_from_name(self, dtype): + result = type(dtype).is_dtype(dtype.name) + assert result is True + + def test_is_dtype_from_self(self, dtype): + result = type(dtype).is_dtype(dtype) + assert result is True + + +class BaseArrayTests(object): + """Base class for extension array classes. + + Subclasses should implement the following fixtures + + * test_data + * test_data_missing + """ + + @pytest.fixture + def test_data(self): + """Length-100 array for this type.""" + raise NotImplementedError + + @pytest.fixture + def test_data_missing(self): + """Length-2 array with [NA, Valid]""" + raise NotImplementedError + + @pytest.fixture + def na_cmp(self): + """Binary operator for comparing NA values. + + Should return a function of two arguments that returns + True if both arguments are (scalar) NA for your type. 
+ + By defult, uses ``operator.or`` + """ + return operator.is_ + + def test_len(self, test_data): + assert len(test_data) == 100 + + def test_ndim(self, test_data): + assert test_data.ndim == 1 + + def test_can_hold_na_valid(self, test_data): + assert test_data._can_hold_na() in {True, False} + + def test_series_constructor(self, test_data): + result = pd.Series(test_data) + assert result.dtype == test_data.dtype + assert len(result) == len(test_data) + assert isinstance(result._data.blocks[0], ExtensionBlock) + + @pytest.mark.parametrize("from_series", [True, False]) + def test_dataframe_constructor(self, test_data, from_series): + if from_series: + test_data = pd.Series(test_data) + result = pd.DataFrame({"A": test_data}) + assert result.dtypes['A'] == test_data.dtype + assert result.shape == (len(test_data), 1) + assert isinstance(result._data.blocks[0], ExtensionBlock) + + def test_concat(self, test_data): + result = pd.concat([ + pd.Series(test_data), + pd.Series(test_data), + ], ignore_index=True) + assert len(result) == len(test_data) * 2 + + def test_iloc(self, test_data): + ser = pd.Series(test_data) + result = ser.iloc[:4] + expected = pd.Series(test_data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.iloc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_loc(self, test_data): + ser = pd.Series(test_data) + result = ser.loc[:3] + expected = pd.Series(test_data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.loc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_repr(self, test_data): + ser = pd.Series(test_data) + assert test_data.dtype.name in repr(ser) + + df = pd.DataFrame({"A": test_data}) + repr(df) + + def test_dtype_name_in_info(self, test_data): + buf = StringIO() + pd.DataFrame({"A": test_data}).info(buf=buf) + result = buf.getvalue() + assert test_data.dtype.name in result + + def test_memory_usage(self, test_data): + s = pd.Series(test_data) + result = s.memory_usage(index=False) + assert result == s.nbytes + + def test_is_extension_array_dtype(self, test_data): + assert is_extension_array_dtype(test_data) + assert is_extension_array_dtype(test_data.dtype) + assert is_extension_array_dtype(pd.Series(test_data)) + assert isinstance(test_data.dtype, ExtensionDtype) + + def test_array_interface(self, test_data): + result = np.array(test_data) + assert result[0] == test_data[0] + + def test_getitem_scalar(self, test_data): + result = test_data[0] + assert isinstance(result, test_data.dtype.type) + + result = pd.Series(test_data)[0] + assert isinstance(result, test_data.dtype.type) + + def test_getitem_scalar_na(self, test_data_missing, na_cmp): + result = test_data_missing[0] + assert na_cmp(result, test_data_missing._fill_value) + + def test_getitem_mask(self, test_data): + # Empty mask, raw array + mask = np.zeros(len(test_data), dtype=bool) + result = test_data[mask] + assert len(result) == 0 + assert isinstance(result, type(test_data)) + + # Empty mask, in series + mask = np.zeros(len(test_data), dtype=bool) + result = pd.Series(test_data)[mask] + assert len(result) == 0 + assert result.dtype == test_data.dtype + + # non-empty mask, raw array + mask[0] = True + result = test_data[mask] + assert len(result) == 1 + assert isinstance(result, type(test_data)) + + # non-empty mask, in series + result = pd.Series(test_data)[mask] + assert len(result) == 1 + assert result.dtype == test_data.dtype + + def test_getitem_slice(self, test_data): + # getitem[slice] should return an array + result = 
test_data[slice(0)] # empty + assert isinstance(result, type(test_data)) + + result = test_data[slice(1)] # scalar + assert isinstance(result, type(test_data)) + + def test_take_sequence(self, test_data): + result = pd.Series(test_data[[0, 1, 3]]) + assert result.iloc[0] == test_data[0] + assert result.iloc[1] == test_data[1] + assert result.iloc[2] == test_data[3] + + def test_isna(self, test_data_missing): + if test_data_missing._can_hold_na: + expected = np.array([True, False]) + else: + expected = np.array([False, False]) + + result = pd.isna(test_data_missing) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(test_data_missing).isna() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("method", [ + "mean", "sum", "prod", "mad", "sem", "var", "std", + "skew", "kurt", "median" + ]) + def test_nuisance_dropped(self, test_data, method): + test_data = test_data[:5] + func = operator.methodcaller(method) + df = pd.DataFrame({"A": np.arange(len(test_data)), + "B": test_data}) + assert len(func(df)) == 1 + + @pytest.mark.parametrize("method", [min, max]) + def test_reduction_orderable(self, test_data, method): + test_data = test_data[:5] + func = operator.methodcaller(method.__name__) + df = pd.DataFrame({"A": np.arange(len(test_data)), + "B": test_data}) + result = func(df) + assert len(result) == 2 + + expected = method(test_data) + assert result['B'] == expected + + @pytest.mark.parametrize("method", ['cummax', 'cummin']) + @pytest.mark.xfail(reason="Assumes comparable to floating.") + def test_cumulative_orderable(self, test_data, method): + # Upcast to object + # https://github.com/pandas-dev/pandas/issues/19296 + # assert result.dtypes['B'] == test_data.dtype + test_data = test_data[:5] + func = operator.methodcaller(method) + df = pd.DataFrame({"A": np.arange(len(test_data)), + "B": test_data}) + result = func(df) + assert result.shape == df.shape + + @pytest.mark.parametrize("binop", [ + operator.add, + operator.sub, + operator.lt, + operator.le, + operator.ge, + operator.gt, + operator.pow, + ], ids=lambda x: x.__name__) + def test_binops(self, test_data, binop): + # Assert that binops work between DataFrames / Series with this type + # if binops work between arrays of this type. 
Extra tests will be + # needed for, e.g., Array + scalar + test_data = test_data[:5] + df = pd.DataFrame({ + "A": np.arange(len(test_data)), + "B": test_data + }) + + try: + expected = pd.DataFrame({ + "A": binop(df['A'], df['A']), + "B": binop(df['B'].values, df['B'].values), + }) + except Exception: + msg = "{} not supported for {}".format(binop.__name__, + test_data.dtype.name) + raise pytest.skip(msg) + + result = binop(df, df) + tm.assert_frame_equal(result, expected) + + # homogeneous frame + result = binop(df[['B']], df[['B']]) + tm.assert_frame_equal(result, expected[['B']]) + + # series + result = binop(df['B'], df['B']) + tm.assert_series_equal(result, expected['B']) + + def test_as_ndarray(self, test_data): + np.array(test_data, dtype=test_data.dtype.kind) + + def test_align(self, test_data): + a = test_data[:3] + b = test_data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # TODO: assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(type(test_data)(list(a) + [test_data._fill_value])) + e2 = pd.Series(type(test_data)([test_data._fill_value] + list(b))) + tm.assert_series_equal(r1, e1) + tm.assert_series_equal(r2, e2) + + @pytest.mark.xfail(reason="GH-19342") + def test_series_given_index(self, test_data): + result = pd.Series(test_data[:3], index=[0, 1, 2, 3, 4]) + assert result.dtype == test_data.dtype + assert len(result) == 5 + assert len(result.values) == 5 + assert pd.isna(result.loc[[3, 4]]).all() diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 7d01a2a70145d..b47ca3618342d 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -83,3 +83,14 @@ def test_asobject_deprecated(self): with tm.assert_produces_warning(FutureWarning): i = d.asobject assert isinstance(i, pd.Index) + + def test_as_best_array(self): + result = pd.DatetimeIndex(['2017-01-01', + '2017-01-02'])._as_best_array() + expected = np.array(['2017-01-01', '2017-01-02'], dtype='M8[ns]') + tm.assert_numpy_array_equal(result, expected) + + def test_as_best_array_tz(self): + arr = pd.DatetimeIndex(['2017-01-01', '2017-01-02'], tz='US/Central') + result = arr._as_best_array() + tm.assert_index_equal(arr, result) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 974099f1fbbe9..20b61eaf38e81 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2282,6 +2282,11 @@ def test_comparison_tzawareness_compat(self, op): # TODO: implement _assert_tzawareness_compat for the reverse # comparison with the Series on the left-hand side + def test_as_best_array(self): + result = pd.Index([0, 1, 2])._as_best_array() + expected = np.array([0, 1, 2]) + tm.assert_numpy_array_equal(result, expected) + class TestIndexUtils(object): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c2e40c79f8914..54dd0851ac2d5 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -1080,3 +1080,8 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" tm.assert_raises_regex(ValueError, msg, idx.take, indices, mode='clip') + + def test_as_best_array(self): + result = pd.CategoricalIndex([0, 1, 2])._as_best_array() + expected = pd.Categorical([0, 1, 2]) + tm.assert_categorical_equal(result, expected) From 9cd92c73eb35c4ba38866d77cfabdc1a8341e9dd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Feb 2018 07:00:20 -0600 Subject: 
[PATCH 002/119] COMPAT: py2 Super --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 364a3b2ae027e..feff9ef24d8bc 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1860,7 +1860,7 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): def __init__(self, values, placement, ndim=None): values = self._maybe_coerce_values(values) - super().__init__(values, placement, ndim) + super(ExtensionBlock, self).__init__(values, placement, ndim) def _maybe_coerce_values(self, values): # Unboxes Series / Index From 9211bbdbde9537b2dffc51697afd0985f8ba2648 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Feb 2018 07:00:33 -0600 Subject: [PATCH 003/119] BUG: Use original object for extension array --- pandas/core/dtypes/missing.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4ed6ddec00289..10c52c857ad12 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -10,7 +10,7 @@ is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, - is_complex_dtype, is_categorical_dtype, + is_complex_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, is_extension_array_dtype, @@ -128,20 +128,15 @@ def _use_inf_as_na(key): def _isna_ndarraylike(obj): - from ..arrays import ExtensionArray - values = getattr(obj, 'values', obj) dtype = values.dtype - if isinstance(values, ExtensionArray): - result = values.isna() + if is_extension_array_dtype(obj): + # work on the original object + result = obj.isna() elif is_string_dtype(dtype): - if is_categorical_dtype(values): - from pandas import Categorical - if not isinstance(values, Categorical): - values = values.values - result = values.isna() - elif is_interval_dtype(values): + if is_interval_dtype(values): + # TODO(IntervalArray): remove this if block from pandas import IntervalIndex result = IntervalIndex(obj).isna() else: From 80f83a6d78652d76955535407cbc410a860e5907 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Feb 2018 13:41:34 -0600 Subject: [PATCH 004/119] Consistent boxing / unboxing NumPy compat --- pandas/core/dtypes/common.py | 3 ++- pandas/core/dtypes/missing.py | 9 +++++++-- pandas/tests/indexes/datetimelike.py | 10 ++++++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c66e7fcfc6978..2344091f85a88 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1708,9 +1708,10 @@ def is_extension_array_dtype(arr_or_dtype): """ from pandas.core.arrays import ExtensionArray - # we want to unpack series, anything else? 
if isinstance(arr_or_dtype, ABCSeries): arr_or_dtype = arr_or_dtype._values + elif isinstance(arr_or_dtype, ABCIndexClass): + arr_or_dtype = arr_or_dtype._as_best_array() return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 10c52c857ad12..c7cd97d5ceb87 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -132,8 +132,13 @@ def _isna_ndarraylike(obj): dtype = values.dtype if is_extension_array_dtype(obj): - # work on the original object - result = obj.isna() + if isinstance(obj, ABCIndexClass): + values = obj._as_best_array() + elif isinstance(obj, ABCSeries): + values = obj._values + else: + values = obj + result = values.isna() elif is_string_dtype(dtype): if is_interval_dtype(values): # TODO(IntervalArray): remove this if block diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index b47ca3618342d..64fc1ee8c9680 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -85,12 +85,14 @@ def test_asobject_deprecated(self): assert isinstance(i, pd.Index) def test_as_best_array(self): - result = pd.DatetimeIndex(['2017-01-01', - '2017-01-02'])._as_best_array() - expected = np.array(['2017-01-01', '2017-01-02'], dtype='M8[ns]') + result = pd.DatetimeIndex(['2017-01-01T00:00:00', + '2017-01-02T00:00:00'])._as_best_array() + expected = np.array(['2017-01-01T00:00:00', + '2017-01-02T00:00:00'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) def test_as_best_array_tz(self): - arr = pd.DatetimeIndex(['2017-01-01', '2017-01-02'], tz='US/Central') + arr = pd.DatetimeIndex(['2017-01-01T00:00:00', + '2017-01-02T00:00:00'], tz='US/Central') result = arr._as_best_array() tm.assert_index_equal(arr, result) From ca004d8219a43a7da21a44030be03a78e077194b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Feb 2018 16:03:16 -0600 Subject: [PATCH 005/119] 32-bit compat --- pandas/tests/indexes/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 20b61eaf38e81..7e90260ae62c7 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2284,7 +2284,7 @@ def test_comparison_tzawareness_compat(self, op): def test_as_best_array(self): result = pd.Index([0, 1, 2])._as_best_array() - expected = np.array([0, 1, 2]) + expected = np.array([0, 1, 2], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) From 5d4a68617ebafa1f3fdef8c209cf1d55709b0ab6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Feb 2018 06:56:27 -0600 Subject: [PATCH 006/119] Add a test array --- pandas/core/arrays/base.py | 8 +- pandas/core/dtypes/base.py | 12 +- pandas/tests/extension_arrays/base.py | 216 ++++++++++----------- pandas/tests/extension_arrays/test_json.py | 154 +++++++++++++++ 4 files changed, 276 insertions(+), 114 deletions(-) create mode 100644 pandas/tests/extension_arrays/test_json.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8d44c5133f740..f51c3bb12fe45 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -20,18 +20,20 @@ class ExtensionArray(object): * __getitem__ * __len__ + * __iter__ * dtype * nbytes * isna * take * copy - * _formatting_values + * _concat_same_type Some additional methods are required to satisfy pandas' internal, private block API. 
- * _concat_same_type * _can_hold_na + * _formatting_values + * _fill_value This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise @@ -217,7 +219,7 @@ def _formatting_values(self): # type: () -> np.ndarray # At the moment, this has to be an array since we use result.dtype """An array of values to be printed in, e.g. the Series repr""" - raise np.array(self) + return np.array(self) @classmethod def _concat_same_type(cls, to_concat): diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index c7c5378801f02..2f071a3b3cf71 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,4 +1,6 @@ """Extend pandas with custom array types""" +import inspect + from pandas.errors import AbstractMethodError @@ -106,7 +108,8 @@ def is_dtype(cls, dtype): Parameters ---------- - dtype : str or dtype + dtype : str, object, or type + The dtype to check. Returns ------- @@ -118,12 +121,15 @@ def is_dtype(cls, dtype): 1. ``cls.construct_from_string(dtype)`` is an instance of ``cls``. - 2. 'dtype' is ``cls`` or a subclass of ``cls``. + 2. ``dtype`` is an object and is an instance of ``cls`` + 3. 'dtype' is a class and is ``cls`` or a subclass of ``cls``. """ if isinstance(dtype, str): try: return isinstance(cls.construct_from_string(dtype), cls) except TypeError: return False - else: + elif inspect.isclass(dtype): return issubclass(dtype, cls) + else: + return isinstance(dtype, cls) diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 7e91ea661721e..2a8a3c314e142 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -49,17 +49,17 @@ class BaseArrayTests(object): Subclasses should implement the following fixtures - * test_data - * test_data_missing + * data + * data_missing """ @pytest.fixture - def test_data(self): + def data(self): """Length-100 array for this type.""" raise NotImplementedError @pytest.fixture - def test_data_missing(self): + def data_missing(self): """Length-2 array with [NA, Valid]""" raise NotImplementedError @@ -74,142 +74,142 @@ def na_cmp(self): """ return operator.is_ - def test_len(self, test_data): - assert len(test_data) == 100 + def test_len(self, data): + assert len(data) == 100 - def test_ndim(self, test_data): - assert test_data.ndim == 1 + def test_ndim(self, data): + assert data.ndim == 1 - def test_can_hold_na_valid(self, test_data): - assert test_data._can_hold_na() in {True, False} + def test_can_hold_na_valid(self, data): + assert data._can_hold_na() in {True, False} - def test_series_constructor(self, test_data): - result = pd.Series(test_data) - assert result.dtype == test_data.dtype - assert len(result) == len(test_data) + def test_series_constructor(self, data): + result = pd.Series(data) + assert result.dtype == data.dtype + assert len(result) == len(data) assert isinstance(result._data.blocks[0], ExtensionBlock) @pytest.mark.parametrize("from_series", [True, False]) - def test_dataframe_constructor(self, test_data, from_series): + def dataframe_constructor(self, data, from_series): if from_series: - test_data = pd.Series(test_data) - result = pd.DataFrame({"A": test_data}) - assert result.dtypes['A'] == test_data.dtype - assert result.shape == (len(test_data), 1) + data = pd.Series(data) + result = pd.DataFrame({"A": data}) + assert result.dtypes['A'] == data.dtype + assert result.shape == (len(data), 1) assert isinstance(result._data.blocks[0], ExtensionBlock) - def 
test_concat(self, test_data): + def test_concat(self, data): result = pd.concat([ - pd.Series(test_data), - pd.Series(test_data), + pd.Series(data), + pd.Series(data), ], ignore_index=True) - assert len(result) == len(test_data) * 2 + assert len(result) == len(data) * 2 - def test_iloc(self, test_data): - ser = pd.Series(test_data) + def test_iloc(self, data): + ser = pd.Series(data) result = ser.iloc[:4] - expected = pd.Series(test_data[:4]) + expected = pd.Series(data[:4]) tm.assert_series_equal(result, expected) result = ser.iloc[[0, 1, 2, 3]] tm.assert_series_equal(result, expected) - def test_loc(self, test_data): - ser = pd.Series(test_data) + def test_loc(self, data): + ser = pd.Series(data) result = ser.loc[:3] - expected = pd.Series(test_data[:4]) + expected = pd.Series(data[:4]) tm.assert_series_equal(result, expected) result = ser.loc[[0, 1, 2, 3]] tm.assert_series_equal(result, expected) - def test_repr(self, test_data): - ser = pd.Series(test_data) - assert test_data.dtype.name in repr(ser) + def test_repr(self, data): + ser = pd.Series(data) + assert data.dtype.name in repr(ser) - df = pd.DataFrame({"A": test_data}) + df = pd.DataFrame({"A": data}) repr(df) - def test_dtype_name_in_info(self, test_data): + def test_dtype_name_in_info(self, data): buf = StringIO() - pd.DataFrame({"A": test_data}).info(buf=buf) + pd.DataFrame({"A": data}).info(buf=buf) result = buf.getvalue() - assert test_data.dtype.name in result + assert data.dtype.name in result - def test_memory_usage(self, test_data): - s = pd.Series(test_data) + def test_memory_usage(self, data): + s = pd.Series(data) result = s.memory_usage(index=False) assert result == s.nbytes - def test_is_extension_array_dtype(self, test_data): - assert is_extension_array_dtype(test_data) - assert is_extension_array_dtype(test_data.dtype) - assert is_extension_array_dtype(pd.Series(test_data)) - assert isinstance(test_data.dtype, ExtensionDtype) + def test_is_extension_array_dtype(self, data): + assert is_extension_array_dtype(data) + assert is_extension_array_dtype(data.dtype) + assert is_extension_array_dtype(pd.Series(data)) + assert isinstance(data.dtype, ExtensionDtype) - def test_array_interface(self, test_data): - result = np.array(test_data) - assert result[0] == test_data[0] + def test_array_interface(self, data): + result = np.array(data) + assert result[0] == data[0] - def test_getitem_scalar(self, test_data): - result = test_data[0] - assert isinstance(result, test_data.dtype.type) + def test_getitem_scalar(self, data): + result = data[0] + assert isinstance(result, data.dtype.type) - result = pd.Series(test_data)[0] - assert isinstance(result, test_data.dtype.type) + result = pd.Series(data)[0] + assert isinstance(result, data.dtype.type) - def test_getitem_scalar_na(self, test_data_missing, na_cmp): - result = test_data_missing[0] - assert na_cmp(result, test_data_missing._fill_value) + def test_getitem_scalar_na(self, data_missing, na_cmp): + result = data_missing[0] + assert na_cmp(result, data_missing._fill_value) - def test_getitem_mask(self, test_data): + def test_getitem_mask(self, data): # Empty mask, raw array - mask = np.zeros(len(test_data), dtype=bool) - result = test_data[mask] + mask = np.zeros(len(data), dtype=bool) + result = data[mask] assert len(result) == 0 - assert isinstance(result, type(test_data)) + assert isinstance(result, type(data)) # Empty mask, in series - mask = np.zeros(len(test_data), dtype=bool) - result = pd.Series(test_data)[mask] + mask = np.zeros(len(data), dtype=bool) + result = 
pd.Series(data)[mask] assert len(result) == 0 - assert result.dtype == test_data.dtype + assert result.dtype == data.dtype # non-empty mask, raw array mask[0] = True - result = test_data[mask] + result = data[mask] assert len(result) == 1 - assert isinstance(result, type(test_data)) + assert isinstance(result, type(data)) # non-empty mask, in series - result = pd.Series(test_data)[mask] + result = pd.Series(data)[mask] assert len(result) == 1 - assert result.dtype == test_data.dtype + assert result.dtype == data.dtype - def test_getitem_slice(self, test_data): + def test_getitem_slice(self, data): # getitem[slice] should return an array - result = test_data[slice(0)] # empty - assert isinstance(result, type(test_data)) + result = data[slice(0)] # empty + assert isinstance(result, type(data)) - result = test_data[slice(1)] # scalar - assert isinstance(result, type(test_data)) + result = data[slice(1)] # scalar + assert isinstance(result, type(data)) - def test_take_sequence(self, test_data): - result = pd.Series(test_data[[0, 1, 3]]) - assert result.iloc[0] == test_data[0] - assert result.iloc[1] == test_data[1] - assert result.iloc[2] == test_data[3] + def test_take_sequence(self, data): + result = pd.Series(data[[0, 1, 3]]) + assert result.iloc[0] == data[0] + assert result.iloc[1] == data[1] + assert result.iloc[2] == data[3] - def test_isna(self, test_data_missing): - if test_data_missing._can_hold_na: + def test_isna(self, data_missing): + if data_missing._can_hold_na: expected = np.array([True, False]) else: expected = np.array([False, False]) - result = pd.isna(test_data_missing) + result = pd.isna(data_missing) tm.assert_numpy_array_equal(result, expected) - result = pd.Series(test_data_missing).isna() + result = pd.Series(data_missing).isna() expected = pd.Series(expected) tm.assert_series_equal(result, expected) @@ -217,35 +217,35 @@ def test_isna(self, test_data_missing): "mean", "sum", "prod", "mad", "sem", "var", "std", "skew", "kurt", "median" ]) - def test_nuisance_dropped(self, test_data, method): - test_data = test_data[:5] + def test_nuisance_dropped(self, data, method): + data = data[:5] func = operator.methodcaller(method) - df = pd.DataFrame({"A": np.arange(len(test_data)), - "B": test_data}) + df = pd.DataFrame({"A": np.arange(len(data)), + "B": data}) assert len(func(df)) == 1 @pytest.mark.parametrize("method", [min, max]) - def test_reduction_orderable(self, test_data, method): - test_data = test_data[:5] + def test_reduction_orderable(self, data, method): + data = data[:5] func = operator.methodcaller(method.__name__) - df = pd.DataFrame({"A": np.arange(len(test_data)), - "B": test_data}) + df = pd.DataFrame({"A": np.arange(len(data)), + "B": data}) result = func(df) assert len(result) == 2 - expected = method(test_data) + expected = method(data) assert result['B'] == expected @pytest.mark.parametrize("method", ['cummax', 'cummin']) @pytest.mark.xfail(reason="Assumes comparable to floating.") - def test_cumulative_orderable(self, test_data, method): + def test_cumulative_orderable(self, data, method): # Upcast to object # https://github.com/pandas-dev/pandas/issues/19296 - # assert result.dtypes['B'] == test_data.dtype - test_data = test_data[:5] + # assert result.dtypes['B'] == data.dtype + data = data[:5] func = operator.methodcaller(method) - df = pd.DataFrame({"A": np.arange(len(test_data)), - "B": test_data}) + df = pd.DataFrame({"A": np.arange(len(data)), + "B": data}) result = func(df) assert result.shape == df.shape @@ -258,14 +258,14 @@ def 
test_cumulative_orderable(self, test_data, method): operator.gt, operator.pow, ], ids=lambda x: x.__name__) - def test_binops(self, test_data, binop): + def test_binops(self, data, binop): # Assert that binops work between DataFrames / Series with this type # if binops work between arrays of this type. Extra tests will be # needed for, e.g., Array + scalar - test_data = test_data[:5] + data = data[:5] df = pd.DataFrame({ - "A": np.arange(len(test_data)), - "B": test_data + "A": np.arange(len(data)), + "B": data }) try: @@ -275,7 +275,7 @@ def test_binops(self, test_data, binop): }) except Exception: msg = "{} not supported for {}".format(binop.__name__, - test_data.dtype.name) + data.dtype.name) raise pytest.skip(msg) result = binop(df, df) @@ -289,24 +289,24 @@ def test_binops(self, test_data, binop): result = binop(df['B'], df['B']) tm.assert_series_equal(result, expected['B']) - def test_as_ndarray(self, test_data): - np.array(test_data, dtype=test_data.dtype.kind) + def test_as_ndarray(self, data): + np.array(data, dtype=data.dtype.kind) - def test_align(self, test_data): - a = test_data[:3] - b = test_data[2:5] + def test_align(self, data): + a = data[:3] + b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) # TODO: assumes that the ctor can take a list of scalars of the type - e1 = pd.Series(type(test_data)(list(a) + [test_data._fill_value])) - e2 = pd.Series(type(test_data)([test_data._fill_value] + list(b))) + e1 = pd.Series(type(data)(list(a) + [data._fill_value])) + e2 = pd.Series(type(data)([data._fill_value] + list(b))) tm.assert_series_equal(r1, e1) tm.assert_series_equal(r2, e2) @pytest.mark.xfail(reason="GH-19342") - def test_series_given_index(self, test_data): - result = pd.Series(test_data[:3], index=[0, 1, 2, 3, 4]) - assert result.dtype == test_data.dtype + def test_series_given_index(self, data): + result = pd.Series(data[:3], index=[0, 1, 2, 3, 4]) + assert result.dtype == data.dtype assert len(result) == 5 assert len(result.values) == 5 assert pd.isna(result.loc[[3, 4]]).all() diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py new file mode 100644 index 0000000000000..7685c7693b122 --- /dev/null +++ b/pandas/tests/extension_arrays/test_json.py @@ -0,0 +1,154 @@ +import collections +import itertools +import numbers +import operator +import random +import string +import sys + +import numpy as np +import pytest + + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.arrays import ExtensionArray + +from .base import BaseArrayTests, BaseDtypeTests + + +class JSONDtype(ExtensionDtype): + type = collections.Mapping + name = 'json' + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + +class JSONArray(ExtensionArray): + dtype = JSONDtype() + + def __init__(self, values): + for val in values: + if not isinstance(val, collections.Mapping): + raise TypeError + self.data = values + + def __getitem__(self, item): + # TDOO: fancy indexing + if isinstance(item, numbers.Integral): + return self.data[item] + elif isinstance(item, np.ndarray) and item.dtype == 'bool': + return type(self)([x for x, m in zip(self, item) if m]) + elif isinstance(item, collections.Sequence): + return type(self)([self.data[i] for i in item]) + else: + return type(self)(self.data[item]) + + def __len__(self): + return len(self.data) + + def __iter__(self): + return iter(self.data) + + 
def __repr__(self): + return 'JSONArary({!r})'.format(self.data) + + @property + def nbytes(self): + return sys.getsizeof(self.data) + + def isna(self): + return np.array([x == {} for x in self.data]) + + def take(self, indexer, allow_fill=True, fill_value=None): + output = [self.data[loc] if loc != -1 else {} + for loc in indexer] + return type(self)(output) + + def copy(self, deep=False): + return type(self)(self.data.copy(deep=deep)) + + @property + def _fill_value(self): + return {} + + @classmethod + def _concat_same_type(cls, to_concat): + data = list(itertools.chain.from_iterable([x.data for x in to_concat])) + return cls(data) + + + +def make_data(): + return [{random.choice(string.ascii_letters): random.randint(0, 100) + for _ in range(random.randint(0, 10))} + for _ in range(100)] + + +class TestJSONDtype(BaseDtypeTests): + @pytest.fixture + def dtype(self): + return JSONDtype() + + +class TestJSON(BaseArrayTests): + + @pytest.fixture + def data(self): + """Length-100 PeriodArray for semantics test.""" + return JSONArray(make_data()) + + @pytest.fixture + def data_missing(self): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {'a': 10}]) + + @pytest.fixture + def na_cmp(self): + return operator.eq + + @pytest.mark.skip(reason="Unorderable") + def test_reduction_orderable(self, data, method): + pass + + + +# def test_concat_mixed_closed_raises(): +# one = IntervalArray.from_breaks([0, 1, 2], closed='left') +# two = IntervalArray.from_breaks([1, 2, 3], closed='right') +# +# with tm.assert_raises_regex(ValueError, "Intervals must all be closed"): +# IntervalArray._concat_same_type([one, two]) +# +# +# def test_series_constructor_intervalindex(): +# result = pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2])) +# assert result.dtype == 'interval[int64]' +# +# +# def dataframe_constructor_intervalindex(): +# result = pd.DataFrame({"A": pd.IntervalIndex.from_breaks([0, 1, 2])}) +# assert result.dtypes['A'] == 'interval[int64]' +# +# +# def dataframe_set_intervalarray(): +# df = pd.DataFrame({"A": [1, 2]}) +# arr = IntervalArray.from_breaks([0, 1, 2]) +# df['B'] = arr +# +# expected = pd.DataFrame({"A": [1, 2], "B": arr}) +# tm.assert_frame_equal(df, expected) +# +# +# def dataframe_set_intervalindex(): +# df = pd.DataFrame({"A": [1, 2]}) +# arr = pd.IntervalIndex.from_breaks([0, 1, 2]) +# df['B'] = arr +# +# expected = pd.DataFrame({"A": [1, 2], "B": arr}) +# tm.assert_frame_equal(df, expected) From 9f4ad42734f679d5e9a4bf38fa975e1a176dae12 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Feb 2018 07:00:46 -0600 Subject: [PATCH 007/119] linting --- pandas/tests/extension_arrays/test_json.py | 41 +--------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index 7685c7693b122..3fd9d310a9c1c 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -83,11 +83,9 @@ def _concat_same_type(cls, to_concat): return cls(data) - def make_data(): return [{random.choice(string.ascii_letters): random.randint(0, 100) - for _ in range(random.randint(0, 10))} - for _ in range(100)] + for _ in range(random.randint(0, 10))} for _ in range(100)] class TestJSONDtype(BaseDtypeTests): @@ -115,40 +113,3 @@ def na_cmp(self): @pytest.mark.skip(reason="Unorderable") def test_reduction_orderable(self, data, method): pass - - - -# def test_concat_mixed_closed_raises(): -# one = IntervalArray.from_breaks([0, 1, 2], closed='left') -# 
two = IntervalArray.from_breaks([1, 2, 3], closed='right') -# -# with tm.assert_raises_regex(ValueError, "Intervals must all be closed"): -# IntervalArray._concat_same_type([one, two]) -# -# -# def test_series_constructor_intervalindex(): -# result = pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2])) -# assert result.dtype == 'interval[int64]' -# -# -# def dataframe_constructor_intervalindex(): -# result = pd.DataFrame({"A": pd.IntervalIndex.from_breaks([0, 1, 2])}) -# assert result.dtypes['A'] == 'interval[int64]' -# -# -# def dataframe_set_intervalarray(): -# df = pd.DataFrame({"A": [1, 2]}) -# arr = IntervalArray.from_breaks([0, 1, 2]) -# df['B'] = arr -# -# expected = pd.DataFrame({"A": [1, 2], "B": arr}) -# tm.assert_frame_equal(df, expected) -# -# -# def dataframe_set_intervalindex(): -# df = pd.DataFrame({"A": [1, 2]}) -# arr = pd.IntervalIndex.from_breaks([0, 1, 2]) -# df['B'] = arr -# -# expected = pd.DataFrame({"A": [1, 2], "B": arr}) -# tm.assert_frame_equal(df, expected) From b1db4e8d39d1193fe54c5b2435fb156b6d899be1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Feb 2018 08:00:41 -0600 Subject: [PATCH 008/119] Default __iter__ --- pandas/core/arrays/base.py | 11 ++++++++++- pandas/tests/extension_arrays/base.py | 2 +- pandas/tests/extension_arrays/test_json.py | 6 ------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f51c3bb12fe45..08625453450f4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -20,7 +20,6 @@ class ExtensionArray(object): * __getitem__ * __len__ - * __iter__ * dtype * nbytes * isna @@ -105,6 +104,16 @@ def __len__(self): # type: () -> int raise AbstractMethodError(self) + def __iter__(self): + """Iterate over elements. + + This needs to be implemented so that pandas recognizes extension arrays + as list-like. The default implementation makes successive calls to + ``__getitem__``, which may be slower than necessary. 
+ """ + for i in range(len(self)): + yield self[i] + # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 2a8a3c314e142..a393b51ff2762 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -195,7 +195,7 @@ def test_getitem_slice(self, data): assert isinstance(result, type(data)) def test_take_sequence(self, data): - result = pd.Series(data[[0, 1, 3]]) + result = pd.Series(data)[[0, 1, 3]] assert result.iloc[0] == data[0] assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index 3fd9d310a9c1c..9b08e8b97f0ce 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -39,22 +39,16 @@ def __init__(self, values): self.data = values def __getitem__(self, item): - # TDOO: fancy indexing if isinstance(item, numbers.Integral): return self.data[item] elif isinstance(item, np.ndarray) and item.dtype == 'bool': return type(self)([x for x, m in zip(self, item) if m]) - elif isinstance(item, collections.Sequence): - return type(self)([self.data[i] for i in item]) else: return type(self)(self.data[item]) def __len__(self): return len(self.data) - def __iter__(self): - return iter(self.data) - def __repr__(self): return 'JSONArary({!r})'.format(self.data) From 00d6bb33174f9c0d2d188e8d787894088bbe4fab Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Feb 2018 10:29:14 -0600 Subject: [PATCH 009/119] Tests for value_counts --- pandas/tests/extension_arrays/base.py | 25 +++++++++++++++++++++- pandas/tests/extension_arrays/test_json.py | 10 ++++++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index a393b51ff2762..529d63215d7ba 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -63,6 +63,13 @@ def data_missing(self): """Length-2 array with [NA, Valid]""" raise NotImplementedError + @pytest.fixture(params=['data', 'data_missing']) + def all_data(self, request, data, data_missing): + if request.param == 'data': + return data + elif request.param == 'data_missing': + return data_missing + @pytest.fixture def na_cmp(self): """Binary operator for comparing NA values. 
@@ -222,7 +229,10 @@ def test_nuisance_dropped(self, data, method): func = operator.methodcaller(method) df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - assert len(func(df)) == 1 + obj = pd.DataFrame({"A": np.arange(len(data)), + "B": np.array(data, dtype=object)}) + + assert len(func(df)) == len(func(obj)) @pytest.mark.parametrize("method", [min, max]) def test_reduction_orderable(self, data, method): @@ -310,3 +320,16 @@ def test_series_given_index(self, data): assert len(result) == 5 assert len(result.values) == 5 assert pd.isna(result.loc[[3, 4]]).all() + + @pytest.mark.parametrize('dropna', [True, False]) + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index 9b08e8b97f0ce..f1dc043af6c96 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -34,7 +34,7 @@ class JSONArray(ExtensionArray): def __init__(self, values): for val in values: - if not isinstance(val, collections.Mapping): + if not isinstance(val, self.dtype.type): raise TypeError self.data = values @@ -57,10 +57,10 @@ def nbytes(self): return sys.getsizeof(self.data) def isna(self): - return np.array([x == {} for x in self.data]) + return np.array([x == self._fill_value for x in self.data]) def take(self, indexer, allow_fill=True, fill_value=None): - output = [self.data[loc] if loc != -1 else {} + output = [self.data[loc] if loc != -1 else self._fill_value for loc in indexer] return type(self)(output) @@ -107,3 +107,7 @@ def na_cmp(self): @pytest.mark.skip(reason="Unorderable") def test_reduction_orderable(self, data, method): pass + + @pytest.mark.skip(reason="Unhashable") + def test_value_counts(self, all_data, dropna): + pass From 1608e3d3ace66ba4ef066c241a919131c6b0e416 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Feb 2018 10:31:02 -0600 Subject: [PATCH 010/119] Implement value_counts --- pandas/core/algorithms.py | 3 ++- pandas/core/arrays/base.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c754c063fce8e..427ec5af270bb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -15,6 +15,7 @@ is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, is_complex_dtype, is_object_dtype, + is_extension_array_dtype, is_categorical_dtype, is_sparse, is_period_dtype, is_numeric_dtype, is_float_dtype, @@ -542,7 +543,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, else: - if is_categorical_dtype(values) or is_sparse(values): + if is_extension_array_dtype(values) or is_sparse(values): # handle Categorical and sparse, result = Series(values).values.value_counts(dropna=dropna) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 08625453450f4..b4b1c18b5adf7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -254,3 +254,11 @@ def _can_hold_na(self): Setting this to false will optimize some operations like fillna. 
""" return True + + def value_counts(self, dropna=True): + from pandas import value_counts + + if dropna: + self = self[~self.isna()] + + return value_counts(np.array(self)) From 52e21802054d2801ac59003bda87319df04673b2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Feb 2018 10:51:16 -0600 Subject: [PATCH 011/119] Py2 compat --- pandas/tests/extension_arrays/test_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index f1dc043af6c96..3d33953b59056 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -65,7 +65,7 @@ def take(self, indexer, allow_fill=True, fill_value=None): return type(self)(output) def copy(self, deep=False): - return type(self)(self.data.copy(deep=deep)) + return type(self)(self.data[:]) @property def _fill_value(self): From e6d06e2f0eb669e36f5d21645b3fb3e158ba7de7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Feb 2018 15:37:48 -0600 Subject: [PATCH 012/119] Fixed dropna --- pandas/core/arrays/base.py | 1 + pandas/core/internals.py | 5 +++++ pandas/tests/extension_arrays/base.py | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b4b1c18b5adf7..070f6a34b7987 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -245,6 +245,7 @@ def _concat_same_type(cls, to_concat): """ raise AbstractMethodError(cls) + @property def _can_hold_na(self): # type: () -> bool """Whether your array can hold missing values. True by default. diff --git a/pandas/core/internals.py b/pandas/core/internals.py index feff9ef24d8bc..b72048f15575f 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1876,6 +1876,11 @@ def _holder(self): # For extension blocks, the holder is values-dependent. 
return type(self.values) + @property + def _can_hold_na(self): + # The default ExtensionBlock._can_hold_na is True + return self._holder._can_hold_na + @property def is_view(self): """Extension arrays are never treated as views.""" diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 529d63215d7ba..3e5a429386cfa 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -220,6 +220,11 @@ def test_isna(self, data_missing): expected = pd.Series(expected) tm.assert_series_equal(result, expected) + def test_dropna(self, data_missing): + result = pd.Series(data_missing).dropna() + expected = pd.Series(data_missing).iloc[[1]] + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method", [ "mean", "sum", "prod", "mad", "sem", "var", "std", "skew", "kurt", "median" From d356f191a569b0b3d8bdb15c63e2eb81889a4ecf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Feb 2018 20:45:39 -0600 Subject: [PATCH 013/119] Test fixups --- pandas/core/internals.py | 2 ++ pandas/tests/extension_arrays/base.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b72048f15575f..8d07686988143 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5693,6 +5693,8 @@ def is_na(self): if not values._null_fill_value and values.sp_index.ngaps > 0: return False values_flat = values.ravel(order='K') + elif isinstance(self.block, ExtensionBlock): + values_flat = values else: values_flat = values.ravel(order='K') total_len = values_flat.shape[0] diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 3e5a429386cfa..19f56b1e672dd 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -88,7 +88,7 @@ def test_ndim(self, data): assert data.ndim == 1 def test_can_hold_na_valid(self, data): - assert data._can_hold_na() in {True, False} + assert data._can_hold_na in {True, False} def test_series_constructor(self, data): result = pd.Series(data) From a6ae340b409cb018852fe7d1263f1e1d3742d08d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Feb 2018 06:37:34 -0600 Subject: [PATCH 014/119] Started setitem --- pandas/core/arrays/base.py | 32 +++++++++++++++++++++- pandas/core/internals.py | 5 ++++ pandas/tests/extension_arrays/base.py | 29 ++++++++++++++++++++ pandas/tests/extension_arrays/test_json.py | 12 ++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 070f6a34b7987..5aa478a51efdc 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -89,7 +89,37 @@ def __getitem__(self, item): raise AbstractMethodError(self) def __setitem__(self, key, value): - # type: (Any, Any) -> None + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + Parameters + ---------- + key : int or ndarray + When called from, e.g. ``Series.__setitem__``, ``key`` will + always be an ndarray of integers. + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + ExtensionArrays may + + Notes + ----- + This method is not required to satisfy the interface. If an + ExtensionArray chooses to implement __setitem__, then some semantics + should be observed. + + * Setting multiple values : ExtensionArrays should support setting + multiple values at once, ``key`` will be a sequence of integers. 
+ + * Broadcasting : For a sequence ``key`` and a scalar ``value``, + each position in ``key`` should be set to ``value``. + + * Coercion : Most users will expect basic coercion to work. For + example, a string like ``'2018-01-01'`` is coerced to a datetime + when setting on a datetime64ns array. In general, if the + ``__init__`` method coerces that value, then so should ``__setitem__``. + + When called from, e.g. ``Series.__setitem__``, ``key`` will always + be an ndarray of positions. + """ raise NotImplementedError(_not_implemented_message.format( type(self), '__setitem__') ) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8d07686988143..c32a663d51482 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1938,6 +1938,11 @@ def _slice(self, slicer): return self.values[slicer] + def setitem(self, indexer, value, mgr=None): + print(indexer, value) + self.values[indexer] = value + return self + def formatting_values(self): return self.values._formatting_values() diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 19f56b1e672dd..80a807a27f45a 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -207,6 +207,35 @@ def test_take_sequence(self, data): assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] + def test_setitem_scalar(self, data): + arr = pd.Series(data) + arr[0] = data[1] + assert arr[0] == data[1] + + def test_setitem_sequence(self, data): + arr = pd.Series(data) + original = data.copy() + + arr[[0, 1]] = [data[1], data[0]] + assert arr[0] == original[1] + assert arr[1] == original[0] + + def test_setitem_sequence_broadcasts(self, data): + arr = pd.Series(data) + + arr[[0, 1]] = data[2] + assert arr[0] == data[2] + assert arr[1] == data[2] + + def test_loc_set_scalar(self, data): + arr = pd.Series(data) + arr.loc[0] = data[1] + assert arr[0] == data[1] + + df = pd.DataFrame({"A": data}) + df.loc[0, 'A'] = data[1] + assert df.loc[0, 'A'] == data[1] + def test_isna(self, data_missing): if data_missing._can_hold_na: expected = np.array([True, False]) diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index 3d33953b59056..a3e3c119561cc 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -46,6 +46,18 @@ def __getitem__(self, item): else: return type(self)(self.data[item]) + def __setitem__(self, key, value): + if isinstance(key, numbers.Integral): + self.data[key] = value + else: + if not isinstance(value, collections.Sequence): + # broadcast value + value = itertools.cycle([value]) + + for k, v in zip(key, value): + assert isinstance(v, self.dtype.type) + self.data[k] = v + def __len__(self): return len(self.data) From 41f09d899c4eaa726f0f0f7ffbc55d924a5dcab7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Feb 2018 14:13:57 -0600 Subject: [PATCH 015/119] REF/Clean: Internal / External values --- doc/source/internals.rst | 15 +++++ pandas/core/base.py | 48 +++++++++++--- pandas/core/dtypes/concat.py | 15 +++-- pandas/core/indexes/base.py | 65 ++++++++++++------- pandas/core/indexes/category.py | 25 +++++-- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 9 +++ pandas/core/indexes/multi.py | 38 ++++++----- pandas/core/indexes/numeric.py | 2 +- pandas/core/indexes/period.py | 42 +++++++----- pandas/core/series.py | 4 +- pandas/io/formats/format.py | 2 +- pandas/io/pytables.py | 2 +- 
pandas/plotting/_converter.py | 6 +- pandas/tests/indexes/common.py | 6 +- .../tests/indexes/period/test_construction.py | 4 +- pandas/tests/indexes/period/test_period.py | 6 +- pandas/tests/indexes/period/test_tools.py | 2 +- pandas/tests/test_base.py | 65 ++++++++++++++++++- 19 files changed, 265 insertions(+), 93 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index ee4df879d9478..29aaed318b802 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -89,6 +89,21 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. +Values +~~~~~~ + +Pandas extends NumPy's type system in a few places, so we have multiple notions of "values" floating around. +For 1-D containers (``Index`` classes and ``Series``) we have the following convention: + +* ``cls._ndarray_values`` is *always* and ``ndarray`` +* ``cls._values`` refers is the "best possible" array. This could be an ``ndarray``, ``ExtensionArray``, or + in ``Index`` subclass (note: we're in the process of removing the index subclasses here so that it's + always an ``ndarray`` or ``ExtensionArray``). + +So, for example, ``Series[category]._values`` is a ``Categorical``, while ``Series[category]._ndarray_values`` is +the underlying ndarray. + + .. _ref-subclassing-pandas: Subclassing pandas Data Structures diff --git a/pandas/core/base.py b/pandas/core/base.py index d5b204dba063e..52b1f82e8824d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,7 +7,8 @@ import numpy as np from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCSeries, ABCIndexClass, ABCDatetimeIndex) from pandas.core.dtypes.common import ( is_object_dtype, is_list_like, @@ -706,7 +707,7 @@ def transpose(self, *args, **kwargs): @property def shape(self): """ return a tuple of the shape of the underlying data """ - return self._values.shape + return self._ndarray_values.shape @property def ndim(self): @@ -734,22 +735,22 @@ def data(self): @property def itemsize(self): """ return the size of the dtype of the item of the underlying data """ - return self._values.itemsize + return self._ndarray_values.itemsize @property def nbytes(self): """ return the number of bytes in the underlying data """ - return self._values.nbytes + return self._ndarray_values.nbytes @property def strides(self): """ return the strides of the underlying data """ - return self._values.strides + return self._ndarray_values.strides @property def size(self): """ return the number of elements in the underlying data """ - return self._values.size + return self._ndarray_values.size @property def flags(self): @@ -763,9 +764,34 @@ def base(self): """ return self.values.base + @property + def _ndarray_values(self): + """The data as an ndarray. See '_values' for more.""" + # type: () -> np.ndarray + return self.values + @property def _values(self): - """ the internal implementation """ + # type: () -> Union[ExtensionArray, Index] + # TODO: remove index types as they become is extension arrays + """ The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from '._ndarray_values', which always returns an ndarray. 
It may differ + from the public '.values' + + index | values | _values + ----------------- | -------------- -| ---------- + CategoricalIndex | Categorical | Categorical + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] + PeriodIndex | ndarray[Period] | ndarray[Pd] (soon PeriodArray) + IntervalIndex | ndarray[IV] | ndarray[IV] (soon IntervalArray) + + See Also + -------- + values + _ndarray_values + """ return self.values @property @@ -816,7 +842,7 @@ def tolist(self): if is_datetimelike(self): return [com._maybe_box_datetimelike(x) for x in self._values] else: - return self._values.tolist() + return self._ndarray_values.tolist() def __iter__(self): """ @@ -973,8 +999,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False, @Appender(_shared_docs['unique'] % _indexops_doc_kwargs) def unique(self): values = self._values - + if isinstance(values, ABCDatetimeIndex): + values = values._ndarray_values + # TODO: Make unique part of the ExtensionArray interface. + # else, this could be surprising. if hasattr(values, 'unique'): + result = values.unique() else: from pandas.core.algorithms import unique1d diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ddecbe85087d8..a49a2680e4daa 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -480,7 +480,7 @@ def _concat_datetimetz(to_concat, name=None): def _concat_index_same_dtype(indexes, klass=None): klass = klass if klass is not None else indexes[0].__class__ - return klass(np.concatenate([x._values for x in indexes])) + return klass(np.concatenate([x._ndarray_values for x in indexes])) def _concat_index_asobject(to_concat, name=None): @@ -498,9 +498,16 @@ def _concat_index_asobject(to_concat, name=None): attribs = self._get_attributes_dict() attribs['name'] = name - to_concat = [x._values if isinstance(x, Index) else x - for x in to_concat] - return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) + arrays = [] + for x in to_concat: + if is_categorical_dtype(x): + arrays.append(np.asarray(x, dtype=object)) + elif isinstance(x, Index): + arrays.append(x._values) + else: + arrays.append(x) + + return self._shallow_copy_with_infer(np.concatenate(arrays), **attribs) def _concat_sparse(to_concat, axis=0, typs=None): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1e1bb0d49b3df..450e0f47ef6ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -392,7 +392,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): values = np.array(values, copy=False) if is_object_dtype(values): values = cls(values, name=name, dtype=dtype, - **kwargs)._values + **kwargs)._ndarray_values result = object.__new__(cls) result._data = values @@ -644,7 +644,7 @@ def ravel(self, order='C'): -------- numpy.ndarray.ravel """ - return self._values.ravel(order=order) + return self._ndarray_values.ravel(order=order) # construction helpers @classmethod @@ -1577,7 +1577,7 @@ def _constructor(self): @cache_readonly def _engine(self): # property, for now, slow to look up - return self._engine_type(lambda: self._values, len(self)) + return self._engine_type(lambda: self._ndarray_values, len(self)) def _validate_index_level(self, level): """ @@ -2208,27 +2208,37 @@ def union(self, other): other = other.astype('O') return this.union(other) + if is_categorical_dtype(self): + lvals = self.values + else: + lvals = self._ndarray_values + + if is_categorical_dtype(other): + rvals = other.values + else: + rvals = other._ndarray_values + if 
self.is_monotonic and other.is_monotonic: try: - result = self._outer_indexer(self._values, other._values)[0] + result = self._outer_indexer(lvals, rvals)[0] except TypeError: # incomparable objects - result = list(self._values) + result = list(lvals) # worth making this faster? a very unusual case - value_set = set(self._values) - result.extend([x for x in other._values if x not in value_set]) + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) else: indexer = self.get_indexer(other) indexer, = (indexer == -1).nonzero() if len(indexer) > 0: - other_diff = algos.take_nd(other._values, indexer, + other_diff = algos.take_nd(rvals, indexer, allow_fill=False) - result = _concat._concat_compat((self._values, other_diff)) + result = _concat._concat_compat((lvals, other_diff)) try: - self._values[0] < other_diff[0] + lvals[0] < other_diff[0] except TypeError as e: warnings.warn("%s, sort order is undefined for " "incomparable objects" % e, RuntimeWarning, @@ -2240,7 +2250,7 @@ def union(self, other): result.sort() else: - result = self._values + result = lvals try: result = np.sort(result) @@ -2293,18 +2303,21 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self._values, other._values)[0] + result = self._inner_indexer(self._ndarray_values, + other._ndarray_values)[0] return self._wrap_union_result(other, result) except TypeError: pass try: - indexer = Index(other._values).get_indexer(self._values) + indexer = Index(other._ndarray_values).get_indexer( + self._ndarray_values) indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: # duplicates indexer = algos.unique1d( - Index(other._values).get_indexer_non_unique(self._values)[0]) + Index(other._ndarray_values).get_indexer_non_unique( + self._ndarray_values)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) @@ -2680,7 +2693,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise ValueError('limit argument only valid if doing pad, ' 'backfill or nearest reindexing') - indexer = self._engine.get_indexer(target._values) + indexer = self._engine.get_indexer(target._ndarray_values) return _ensure_platform_int(indexer) @@ -2696,12 +2709,13 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None): if self.is_monotonic_increasing and target.is_monotonic_increasing: method = (self._engine.get_pad_indexer if method == 'pad' else self._engine.get_backfill_indexer) - indexer = method(target._values, limit) + indexer = method(target._ndarray_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._values, indexer, + indexer = self._filter_indexer_tolerance(target._ndarray_values, + indexer, tolerance) return indexer @@ -2792,7 +2806,7 @@ def get_indexer_non_unique(self, target): self = Index(self.asi8) tgt_values = target.asi8 else: - tgt_values = target._values + tgt_values = target._ndarray_values indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return _ensure_platform_int(indexer), missing @@ -3227,16 +3241,17 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers - left_idx, right_idx = _get_join_indexers([self._values], - [other._values], how=how, + left_idx, right_idx = _get_join_indexers([self._ndarray_values], + 
[other._ndarray_values], + how=how, sort=True) left_idx = _ensure_platform_int(left_idx) right_idx = _ensure_platform_int(right_idx) - join_index = np.asarray(self._values.take(left_idx)) + join_index = np.asarray(self._ndarray_values.take(left_idx)) mask = left_idx == -1 - np.putmask(join_index, mask, other._values.take(right_idx)) + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) join_index = self._wrap_joined_index(join_index, other) @@ -3383,8 +3398,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False): else: return ret_index - sv = self._values - ov = other._values + sv = self._ndarray_values + ov = other._ndarray_values if self.is_unique and other.is_unique: # We can perform much better than the general case @@ -3736,7 +3751,7 @@ def insert(self, loc, item): item = self._na_value _self = np.asarray(self) - item = self._coerce_scalar_to_index(item)._values + item = self._coerce_scalar_to_index(item)._ndarray_values idx = np.concatenate((_self[:loc], item, _self[loc:])) return self._shallow_copy_with_infer(idx) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2c7be2b21f959..5b01f7d2cbe95 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -227,7 +227,7 @@ def _is_dtype_compat(self, other): """ if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): - other = other._values + other = other.values if not other.is_dtype_equal(self): raise TypeError("categories must match existing categories " "when appending") @@ -293,6 +293,23 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data + @property + def _values(self): + return self._data + + @property + def _ndarray_values(self): + return self._data.codes + + @property + def itemsize(self): + return self.values.itemsize + + @property + def nbytes(self): + """ return the number of bytes in the underlying data """ + return self.values.nbytes + def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() @@ -386,8 +403,8 @@ def is_monotonic_decreasing(self): def unique(self, level=None): if level is not None: self._validate_index_level(level) - result = base.IndexOpsMixin.unique(self) - # CategoricalIndex._shallow_copy uses keeps original categories + result = self.values.unique() + # CategoricalIndex._shallow_copy keeps original categories # and ordered if not otherwise specified return self._shallow_copy(result, categories=result.categories, ordered=result.ordered) @@ -762,7 +779,7 @@ def _evaluate_compare(self, other): def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ - method = getattr(self._values, name) + method = getattr(self.values, name) if 'inplace' in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8e77c7a7fa48c..94500a58edd4c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -389,7 +389,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - sorted_values = np.sort(self._values) + sorted_values = np.sort(self._ndarray_values) attribs = self._get_attributes_dict() freq = attribs['freq'] diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e09fa87477122..c32d7ce930a7c 100644 --- 
a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -678,6 +678,15 @@ def _assert_tzawareness_compat(self, other): raise TypeError('Cannot compare tz-naive and tz-aware ' 'datetime-like objects') + @property + def _values(self): + # tz-naive -> ndarray + # tz-aware -> DatetimeIndex + if self.tz is not None: + return self + else: + return self.values + @property def tzinfo(self): """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 510f7245cebd8..1478012aa9dbe 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -799,9 +799,11 @@ def values(self): box = hasattr(lev, '_box_values') # Try to minimize boxing. if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._values, lab)) + taken = lev._box_values(algos.take_1d(lev._values, + lab)) elif box: - taken = algos.take_1d(lev._box_values(lev._values), lab, + taken = algos.take_1d(lev._box_values(lev._ndarray_values), + lab, fill_value=_get_na_value(lev.dtype.type)) else: taken = algos.take_1d(np.asarray(lev._values), lab) @@ -1317,7 +1319,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): - tuples = tuples._values + tuples = tuples._ndarray_values arrays = list(lib.tuples_to_object_array(tuples).T) elif isinstance(tuples, list): @@ -2410,7 +2412,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): mapper = Series(indexer) indexer = labels.take(_ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) - m = result.map(mapper)._values + m = result.map(mapper)._ndarray_values else: m = np.zeros(len(labels), dtype=bool) @@ -2569,7 +2571,7 @@ def _update_indexer(idxr, indexer=indexer): else: from .numeric import Int64Index # no matches we are done - return Int64Index([])._values + return Int64Index([])._ndarray_values elif com.is_null_slice(k): # empty slice @@ -2589,8 +2591,8 @@ def _update_indexer(idxr, indexer=indexer): # empty indexer if indexer is None: - return Int64Index([])._values - return indexer._values + return Int64Index([])._ndarray_values + return indexer._ndarray_values def truncate(self, before=None, after=None): """ @@ -2639,7 +2641,7 @@ def equals(self, other): if not isinstance(other, MultiIndex): other_vals = com._values_from_object(_ensure_index(other)) - return array_equivalent(self._values, other_vals) + return array_equivalent(self._ndarray_values, other_vals) if self.nlevels != other.nlevels: return False @@ -2650,13 +2652,15 @@ def equals(self, other): for i in range(self.nlevels): slabels = self.labels[i] slabels = slabels[slabels != -1] - svalues = algos.take_nd(np.asarray(self.levels[i]._values), - slabels, allow_fill=False) + svalues = algos.take_nd( + np.asarray(self.levels[i]._values), + slabels, allow_fill=False) olabels = other.labels[i] olabels = olabels[olabels != -1] - ovalues = algos.take_nd(np.asarray(other.levels[i]._values), - olabels, allow_fill=False) + ovalues = algos.take_nd( + np.asarray(other.levels[i]._values), + olabels, allow_fill=False) # since we use NaT both datetime64 and timedelta64 # we can have a situation where a level is typed say @@ -2704,7 +2708,8 @@ def union(self, other): if len(other) == 0 or self.equals(other): return self - uniq_tuples = lib.fast_unique_multiple([self._values, other._values]) + uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, + other._ndarray_values]) return 
MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -2726,8 +2731,8 @@ def intersection(self, other): if self.equals(other): return self - self_tuples = self._values - other_tuples = other._values + self_tuples = self._ndarray_values + other_tuples = other._ndarray_values uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) if len(uniq_tuples) == 0: return MultiIndex(levels=[[]] * self.nlevels, @@ -2756,7 +2761,8 @@ def difference(self, other): labels=[[]] * self.nlevels, names=result_names, verify_integrity=False) - difference = sorted(set(self._values) - set(other._values)) + difference = sorted(set(self._ndarray_values) - + set(other._ndarray_values)) if len(difference) == 0: return MultiIndex(levels=[[]] * self.nlevels, diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index b02aee0495d8c..a4558116bfa63 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -378,7 +378,7 @@ def equals(self, other): if (not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape): return False - left, right = self._values, other._values + left, right = self._ndarray_values, other._ndarray_values return ((left == right) | (self._isnan & other._isnan)).all() except (TypeError, ValueError): return False diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 1f8542ed5ee60..c8b7d6063e378 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -54,7 +54,7 @@ def _field_accessor(name, alias, docstring=None): def f(self): base, mult = _gfc(self.freq) - result = get_period_field_arr(alias, self._values, base) + result = get_period_field_arr(alias, self._ndarray_values, base) return Index(result, name=self.name) f.__name__ = name f.__doc__ = docstring @@ -82,7 +82,7 @@ def _period_index_cmp(opname, cls, nat_result=False): def wrapper(self, other): if isinstance(other, Period): - func = getattr(self._values, opname) + func = getattr(self._ndarray_values, opname) other_base, _ = _gfc(other.freq) if other.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) @@ -94,7 +94,8 @@ def wrapper(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - result = getattr(self._values, opname)(other._values) + op = getattr(self._ndarray_values, opname) + result = op(other._ndarray_values) mask = self._isnan | other._isnan if mask.any(): @@ -102,11 +103,11 @@ def wrapper(self, other): return result elif other is tslib.NaT: - result = np.empty(len(self._values), dtype=bool) + result = np.empty(len(self._ndarray_values), dtype=bool) result.fill(nat_result) else: other = Period(other, freq=self.freq) - func = getattr(self._values, opname) + func = getattr(self._ndarray_values, opname) result = func(other.ordinal) if self.hasnans: @@ -275,11 +276,11 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: # no freq change freq = data.freq - data = data._values + data = data._ndarray_values else: base1, _ = _gfc(data.freq) base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data._values, + data = period.period_asfreq_arr(data._ndarray_values, base1, base2, 1) return cls._simple_new(data, name=name, freq=freq) @@ -374,7 +375,7 @@ def _shallow_copy(self, values=None, freq=None, **kwargs): if freq is None: freq = self.freq if values is None: - values = self._values + values = self._ndarray_values return 
super(PeriodIndex, self)._shallow_copy(values=values, freq=freq, **kwargs) @@ -407,7 +408,7 @@ def __contains__(self, key): @property def asi8(self): - return self._values.view('i8') + return self._ndarray_values.view('i8') @cache_readonly def _int64index(self): @@ -419,6 +420,12 @@ def values(self): @property def _values(self): + # TODO: return PeriodArray + return self.values + + @property + def _ndarray_values(self): + # Ordinals return self._data def __array__(self, dtype=None): @@ -489,13 +496,15 @@ def asof_locs(self, where, mask): if isinstance(where_idx, DatetimeIndex): where_idx = PeriodIndex(where_idx.values, freq=self.freq) - locs = self._values[mask].searchsorted(where_idx._values, side='right') + locs = self._ndarray_values[mask].searchsorted( + where_idx._ndarray_values, side='right') locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[(locs == 0) & (where_idx._values < self._values[first])] = -1 + result[(locs == 0) & (where_idx._ndarray_values < + self._ndarray_values[first])] = -1 return result @@ -523,7 +532,8 @@ def searchsorted(self, value, side='left', sorter=None): elif isinstance(value, compat.string_types): value = Period(value, freq=self.freq).ordinal - return self._values.searchsorted(value, side=side, sorter=sorter) + return self._ndarray_values.searchsorted(value, side=side, + sorter=sorter) @property def is_all_dates(self): @@ -664,7 +674,7 @@ def to_timestamp(self, freq=None, how='start'): base, mult = _gfc(freq) new_data = self.asfreq(freq, how) - new_data = period.periodarr_to_dt64arr(new_data._values, base) + new_data = period.periodarr_to_dt64arr(new_data._ndarray_values, base) return DatetimeIndex(new_data, freq='infer', name=self.name) def _maybe_convert_timedelta(self, other): @@ -744,7 +754,7 @@ def shift(self, n): ------- shifted : PeriodIndex """ - values = self._values + n * self.freq.n + values = self._ndarray_values + n * self.freq.n if self.hasnans: values[self._isnan] = tslib.iNaT return self._shallow_copy(values=values) @@ -775,7 +785,7 @@ def get_value(self, series, key): grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - vals = self._values + vals = self._ndarray_values # if our data is higher resolution than requested key, slice if grp < freqn: @@ -786,7 +796,7 @@ def get_value(self, series, key): if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) - pos = np.searchsorted(self._values, [ord1, ord2]) + pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) key = slice(pos[0], pos[1] + 1) return series[key] elif grp == freqn: diff --git a/pandas/core/series.py b/pandas/core/series.py index e4b8979d6393a..b0ad76d12f1d9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1303,7 +1303,9 @@ def unique(self): # to return an object array of tz-aware Timestamps # TODO: it must return DatetimeArray with tz in pandas 2.0 - result = result.astype(object).values + # XXX: This surely will have issues around DST boundaries. 
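+            # ``result`` comes back from the parent ``unique`` as tz-naive
+            # datetime64[ns] values in UTC, so localize to UTC and convert to
+            # this Series' own timezone before boxing into an object array of
+            # tz-aware Timestamps.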
+ result = (DatetimeIndex(result, tz='UTC').tz_convert(self.dtype.tz) + .astype(object).values) return result diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 269c81b380b5e..bbeb9e162452d 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1884,7 +1884,7 @@ def _format(x): vals = self.values if isinstance(vals, Index): - vals = vals._values + vals = vals._ndarray_values elif isinstance(vals, ABCSparseArray): vals = vals.values diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0d833807602e1..2437b7d396e84 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4430,7 +4430,7 @@ def _convert_index(index, encoding=None, format_type=None): elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() # avoid to store ndarray of Period objects - return IndexCol(index._values, 'integer', atom, + return IndexCol(index._ndarray_values, 'integer', atom, freq=getattr(index, 'freq', None), index_name=index_name) diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 07163615c6ba4..9ca06475290e4 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -249,11 +249,11 @@ def _convert_1d(values, units, axis): is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, PeriodIndex): - return values.asfreq(axis.freq)._values + return values.asfreq(axis.freq)._ndarray_values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) if is_period_arraylike(values): - return PeriodIndex(values, freq=axis.freq)._values + return PeriodIndex(values, freq=axis.freq)._ndarray_values if isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] return values @@ -642,7 +642,7 @@ def _daily_finder(vmin, vmax, freq): info = np.zeros(span, dtype=[('val', np.int64), ('maj', bool), ('min', bool), ('fmt', '|S20')]) - info['val'][:] = dates_._values + info['val'][:] = dates_._ndarray_values info['fmt'][:] = '' info['maj'][[0, -1]] = True # .. 
and set some shortcuts diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 8948c5f79900d..2d8d70aa2ac84 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -314,7 +314,8 @@ def test_ensure_copied_data(self): # .values an object array of Period, thus copied result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index._values, result._values, + tm.assert_numpy_array_equal(index._ndarray_values, + result._ndarray_values, check_same='same') elif isinstance(index, IntervalIndex): # checked in test_interval.py @@ -323,7 +324,8 @@ def test_ensure_copied_data(self): result = index_type(index.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.values, result.values, check_same='same') - tm.assert_numpy_array_equal(index._values, result._values, + tm.assert_numpy_array_equal(index._ndarray_values, + result._ndarray_values, check_same='same') def test_copy_and_deepcopy(self, indices): diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 639a9272c3808..eca80d17b1dc3 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -119,8 +119,8 @@ def test_constructor_fromarraylike(self): tm.assert_index_equal(PeriodIndex(idx.values), idx) tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) - pytest.raises(ValueError, PeriodIndex, idx._values) - pytest.raises(ValueError, PeriodIndex, list(idx._values)) + pytest.raises(ValueError, PeriodIndex, idx._ndarray_values) + pytest.raises(ValueError, PeriodIndex, list(idx._ndarray_values)) pytest.raises(TypeError, PeriodIndex, data=Period('2007', freq='A')) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 6fc7fa5486f82..e3b1256fa0584 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -205,7 +205,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') @@ -213,7 +213,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') @@ -222,7 +222,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) def test_period_index_length(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 0e72cadb5d494..f5a62371ae799 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -22,7 +22,7 @@ class TestPeriodRepresentation(object): def _check_freq(self, freq, base_date): rng = PeriodIndex(start=base_date, periods=10, freq=freq) exp = np.arange(10, dtype=np.int64) - 
tm.assert_numpy_array_equal(rng._values, exp) + tm.assert_numpy_array_equal(rng.asi8, exp) def test_annual(self): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index df2547fc7b0da..5a67aa3f989ae 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -338,8 +338,9 @@ def test_ops(self): if not isinstance(o, PeriodIndex): expected = getattr(o.values, op)() else: - expected = pd.Period(ordinal=getattr(o._values, op)(), - freq=o.freq) + expected = pd.Period( + ordinal=getattr(o._ndarray_values, op)(), + freq=o.freq) try: assert result == expected except TypeError: @@ -450,7 +451,7 @@ def test_value_counts_unique_nunique_null(self): for orig in self.objs: o = orig.copy() klass = type(o) - values = o._values + values = o._ndarray_values if not self._allow_na_ops(o): continue @@ -1175,3 +1176,61 @@ def test_iter_box(self): assert isinstance(res, pd.Period) assert res.freq == 'M' assert res == exp + + +@pytest.mark.parametrize('arr, expected', [ + (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])), + (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'), + pd.DatetimeIndex(['2017'], tz='US/Eastern')), +]) +def test_unique_datetime_index(arr, expected): + result = arr.unique() + + if isinstance(expected, np.ndarray): + tm.assert_numpy_array_equal(result, expected) + if isinstance(expected, pd.Series): + tm.assert_series_equal(result, expected) + if isinstance(expected, pd.DatetimeIndex): + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize('arr, expected', [ + (pd.Series(pd.DatetimeIndex(['2017', '2017'])), + np.array(['2017'], dtype='M8[ns]')), + (pd.Series(pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern')), + np.array([pd.Timestamp('2017', tz="US/Eastern")], dtype=object)), +]) +def test_unique_datetime_series(arr, expected): + result = arr.unique() + + if isinstance(expected, np.ndarray): + tm.assert_numpy_array_equal(result, expected) + if isinstance(expected, pd.Series): + tm.assert_series_equal(result, expected) + if isinstance(expected, pd.DatetimeIndex): + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize('array, expected_type', [ + (np.array([0, 1]), np.ndarray), + (np.array(['a', 'b']), np.ndarray), + (pd.Categorical(['a', 'b']), pd.Categorical), + (pd.DatetimeIndex(['2017', '2018']), np.ndarray), + (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray), + (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray), + (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex), +]) +def test_values_consistent(array, expected_type): + l_values = pd.Series(array)._values + r_values = pd.Index(array)._values + assert type(l_values) is expected_type + assert type(l_values) is type(r_values) + + if isinstance(l_values, np.ndarray): + tm.assert_numpy_array_equal(l_values, r_values) + elif isinstance(l_values, pd.Index): + tm.assert_index_equal(l_values, r_values) + elif pd.api.types.is_categorical(l_values): + tm.assert_categorical_equal(l_values, r_values) + else: + raise TypeError("Unexpected type {}".format(type(l_values))) From 29cfd7c22dd0b5b67c44144f1520f0bce8bf0e74 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Feb 2018 14:34:22 -0600 Subject: [PATCH 016/119] Move to index base --- pandas/core/base.py | 24 ------------------------ pandas/core/indexes/base.py | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 52b1f82e8824d..ab4c969810c93 100644 --- a/pandas/core/base.py +++ 
b/pandas/core/base.py @@ -770,30 +770,6 @@ def _ndarray_values(self): # type: () -> np.ndarray return self.values - @property - def _values(self): - # type: () -> Union[ExtensionArray, Index] - # TODO: remove index types as they become is extension arrays - """ The best array representation. - - This is an ndarray, ExtensionArray, or Index subclass. This differs - from '._ndarray_values', which always returns an ndarray. It may differ - from the public '.values' - - index | values | _values - ----------------- | -------------- -| ---------- - CategoricalIndex | Categorical | Categorical - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] - PeriodIndex | ndarray[Period] | ndarray[Pd] (soon PeriodArray) - IntervalIndex | ndarray[IV] | ndarray[IV] (soon IntervalArray) - - See Also - -------- - values - _ndarray_values - """ - return self.values - @property def empty(self): return not self.size diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 450e0f47ef6ff..d84c4dcb58f83 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -574,6 +574,30 @@ def values(self): """ return the underlying data as an ndarray """ return self._data.view(np.ndarray) + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index] + # TODO: remove index types as they become is extension arrays + """The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from '._ndarray_values', which always returns an ndarray. It may differ + from the public '.values' + + index | values | _values + ----------------- | -------------- -| ---------- + CategoricalIndex | Categorical | Categorical + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] + PeriodIndex | ndarray[Period] | ndarray[Pd] (soon PeriodArray) + IntervalIndex | ndarray[IV] | ndarray[IV] (soon IntervalArray) + + See Also + -------- + values + _ndarray_values + """ + return self.values + def get_values(self): """ return the underlying data as an ndarray """ return self.values From 82fd0c6ae185755e2e7f8c06b31155d4c2cefbf7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 06:14:59 -0600 Subject: [PATCH 017/119] Setitem tests, decimal example --- pandas/core/internals.py | 4 +- pandas/tests/extension_arrays/base.py | 186 ++++++++++++------ pandas/tests/extension_arrays/test_decimal.py | 147 ++++++++++++++ pandas/tests/extension_arrays/test_json.py | 29 +++ 4 files changed, 304 insertions(+), 62 deletions(-) create mode 100644 pandas/tests/extension_arrays/test_decimal.py diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c32a663d51482..ac490f946e5ee 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1939,7 +1939,9 @@ def _slice(self, slicer): return self.values[slicer] def setitem(self, indexer, value, mgr=None): - print(indexer, value) + print(indexer) + if isinstance(indexer, tuple): + indexer = indexer[0] self.values[indexer] = value return self diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 80a807a27f45a..bfbd0d96c2615 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -53,6 +53,9 @@ class BaseArrayTests(object): * data_missing """ + # ------------------------------------------------------------------------ + # Fixtures + # ------------------------------------------------------------------------ @pytest.fixture def data(self): """Length-100 array for this type.""" @@ -81,6 +84,10 @@ def na_cmp(self): """ return 
operator.is_ + # ------------------------------------------------------------------------ + # Interface + # ------------------------------------------------------------------------ + def test_len(self, data): assert len(data) == 100 @@ -90,6 +97,35 @@ def test_ndim(self, data): def test_can_hold_na_valid(self, data): assert data._can_hold_na in {True, False} + def test_memory_usage(self, data): + s = pd.Series(data) + result = s.memory_usage(index=False) + assert result == s.nbytes + + def test_array_interface(self, data): + result = np.array(data) + assert result[0] == data[0] + + def test_as_ndarray_with_dtype_kind(self, data): + np.array(data, dtype=data.dtype.kind) + + def test_repr(self, data): + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + + df = pd.DataFrame({"A": data}) + repr(df) + + def test_dtype_name_in_info(self, data): + buf = StringIO() + pd.DataFrame({"A": data}).info(buf=buf) + result = buf.getvalue() + assert data.dtype.name in result + + # ------------------------------------------------------------------------ + # Constructors + # ------------------------------------------------------------------------ + def test_series_constructor(self, data): result = pd.Series(data) assert result.dtype == data.dtype @@ -105,6 +141,18 @@ def dataframe_constructor(self, data, from_series): assert result.shape == (len(data), 1) assert isinstance(result._data.blocks[0], ExtensionBlock) + @pytest.mark.xfail(reason="GH-19342") + def test_series_given_index(self, data): + result = pd.Series(data[:3], index=[0, 1, 2, 3, 4]) + assert result.dtype == data.dtype + assert len(result) == 5 + assert len(result.values) == 5 + assert pd.isna(result.loc[[3, 4]]).all() + + # ------------------------------------------------------------------------ + # Reshaping + # ------------------------------------------------------------------------ + def test_concat(self, data): result = pd.concat([ pd.Series(data), @@ -112,6 +160,10 @@ def test_concat(self, data): ], ignore_index=True) assert len(result) == len(data) * 2 + # ------------------------------------------------------------------------ + # Indexing - getting + # ------------------------------------------------------------------------ + def test_iloc(self, data): ser = pd.Series(data) result = ser.iloc[:4] @@ -130,34 +182,12 @@ def test_loc(self, data): result = ser.loc[[0, 1, 2, 3]] tm.assert_series_equal(result, expected) - def test_repr(self, data): - ser = pd.Series(data) - assert data.dtype.name in repr(ser) - - df = pd.DataFrame({"A": data}) - repr(df) - - def test_dtype_name_in_info(self, data): - buf = StringIO() - pd.DataFrame({"A": data}).info(buf=buf) - result = buf.getvalue() - assert data.dtype.name in result - - def test_memory_usage(self, data): - s = pd.Series(data) - result = s.memory_usage(index=False) - assert result == s.nbytes - def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) assert is_extension_array_dtype(pd.Series(data)) assert isinstance(data.dtype, ExtensionDtype) - def test_array_interface(self, data): - result = np.array(data) - assert result[0] == data[0] - def test_getitem_scalar(self, data): result = data[0] assert isinstance(result, data.dtype.type) @@ -207,6 +237,10 @@ def test_take_sequence(self, data): assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] + # ------------------------------------------------------------------------ + # Indexing - Setting + # 
------------------------------------------------------------------------ + def test_setitem_scalar(self, data): arr = pd.Series(data) arr[0] = data[1] @@ -227,14 +261,51 @@ def test_setitem_sequence_broadcasts(self, data): assert arr[0] == data[2] assert arr[1] == data[2] - def test_loc_set_scalar(self, data): + @pytest.mark.parametrize('setter', ['loc', 'iloc']) + def test_set_scalar(self, data, setter): arr = pd.Series(data) - arr.loc[0] = data[1] + setter = getattr(arr, setter) + operator.setitem(setter, 0, data[1]) assert arr[0] == data[1] + def test_set_loc_scalar_mixed(self, data): + df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) + df.loc[0, 'B'] = data[1] + assert df.loc[0, 'B'] == data[1] + + def test_set_loc_scalar_single(self, data): + df = pd.DataFrame({"B": data}) + df.loc[10, 'B'] = data[1] + assert df.loc[10, 'B'] == data[1] + + def test_set_loc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + df.loc[10, 'B'] = data[1] + assert df.loc[10, 'B'] == data[1] + + def test_set_iloc_scalar_mixed(self, data): + df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) + df.iloc[0, 1] = data[1] + assert df.loc[0, 'B'] == data[1] + + def test_set_iloc_scalar_single(self, data): + df = pd.DataFrame({"B": data}) + df.iloc[10, 0] = data[1] + assert df.loc[10, 'B'] == data[1] + + def test_set_iloc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + df.iloc[10, 1] = data[1] + assert df.loc[10, 'B'] == data[1] + + def test_setitem_expand_columns(self, data): df = pd.DataFrame({"A": data}) - df.loc[0, 'A'] = data[1] - assert df.loc[0, 'A'] == data[1] + df['B'] = 1 + assert len(df.columns) == 2 + + # ------------------------------------------------------------------------ + # Methods + # ------------------------------------------------------------------------ def test_isna(self, data_missing): if data_missing._can_hold_na: @@ -254,6 +325,34 @@ def test_dropna(self, data_missing): expected = pd.Series(data_missing).iloc[[1]] tm.assert_series_equal(result, expected) + def test_align(self, data): + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # TODO: assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(type(data)(list(a) + [data._fill_value])) + e2 = pd.Series(type(data)([data._fill_value] + list(b))) + tm.assert_series_equal(r1, e1) + tm.assert_series_equal(r2, e2) + + @pytest.mark.parametrize('dropna', [True, False]) + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + # ------------------------------------------------------------------------ + # Ops + # ------------------------------------------------------------------------ + @pytest.mark.parametrize("method", [ "mean", "sum", "prod", "mad", "sem", "var", "std", "skew", "kurt", "median" @@ -332,38 +431,3 @@ def test_binops(self, data, binop): # series result = binop(df['B'], df['B']) tm.assert_series_equal(result, expected['B']) - - def test_as_ndarray(self, data): - np.array(data, dtype=data.dtype.kind) - - def test_align(self, data): - a = data[:3] - b = data[2:5] - r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) - - # TODO: assumes that the ctor can take a list of scalars of 
the type - e1 = pd.Series(type(data)(list(a) + [data._fill_value])) - e2 = pd.Series(type(data)([data._fill_value] + list(b))) - tm.assert_series_equal(r1, e1) - tm.assert_series_equal(r2, e2) - - @pytest.mark.xfail(reason="GH-19342") - def test_series_given_index(self, data): - result = pd.Series(data[:3], index=[0, 1, 2, 3, 4]) - assert result.dtype == data.dtype - assert len(result) == 5 - assert len(result.values) == 5 - assert pd.isna(result.loc[[3, 4]]).all() - - @pytest.mark.parametrize('dropna', [True, False]) - def test_value_counts(self, all_data, dropna): - all_data = all_data[:10] - if dropna: - other = np.array(all_data[~all_data.isna()]) - else: - other = all_data - - result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() - expected = pd.Series(other).value_counts(dropna=dropna).sort_index() - - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension_arrays/test_decimal.py b/pandas/tests/extension_arrays/test_decimal.py new file mode 100644 index 0000000000000..f6b281516e115 --- /dev/null +++ b/pandas/tests/extension_arrays/test_decimal.py @@ -0,0 +1,147 @@ +import decimal +import numbers +import random +import sys + +import numpy as np +import pandas as pd +import pandas.util.testing as tm +import pytest + +from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.base import ExtensionDtype + +from .base import BaseDtypeTests, BaseArrayTests + + +class DecimalDtype(ExtensionDtype): + type = decimal.Decimal + name = 'decimal' + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + +class DecimalArray(ExtensionArray): + dtype = DecimalDtype() + + def __init__(self, values): + values = np.asarray(values, dtype=object) + + self.values = values + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.values[item] + elif isinstance(item, np.ndarray) and item.dtype == 'bool': + return type(self)([x for x, m in zip(self, item) if m]) + else: + return type(self)(self.values[item]) + + def copy(self, deep=False): + if deep: + return type(self)(self.values.copy()) + return type(self)(self) + + def __setitem__(self, key, value): + if pd.api.types.is_list_like(value): + value = [decimal.Decimal(v) for v in value] + else: + value = decimal.Decimal(value) + self.values[key] = value + + def __len__(self): + return len(self.values) + + def __repr__(self): + return repr(self.values) + + @property + def nbytes(self): + n = len(self) + if n: + return n * sys.getsizeof(self[0]) + return 0 + + def isna(self): + return np.array([x.is_nan() for x in self.values]) + + def take(self, indexer, allow_fill=True, fill_value=None): + mask = indexer == -1 + + out = self.values.take(indexer) + out[mask] = self._fill_value + + return type(self)(out) + + @property + def _fill_value(self): + return decimal.Decimal('NaN') + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate([x.values for x in to_concat])) + + +def make_data(): + return [decimal.Decimal(random.random()) for _ in range(100)] + + +class TestDecimalDtype(BaseDtypeTests): + + @pytest.fixture + def dtype(self): + return DecimalDtype() + + +class TestDecimalArray(BaseArrayTests): + + @pytest.fixture + def data(self): + return DecimalArray(make_data()) + + @pytest.fixture + def data_missing(self): + return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) + + @pytest.fixture + def na_cmp(self): + 
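+        # decimal.Decimal('NaN') compares unequal to everything, including
+        # itself, and two NaN instances are not ``is``-identical, so the
+        # default ``operator.is_`` comparison is replaced with an ``is_nan``
+        # check on both operands.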
return lambda x, y: x.is_nan() and y.is_nan() + + @pytest.mark.skip(reason="Who knows.") + def test_repr(self, data): + super().test_repr(data) + + def test_align(self, data): + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # NaN handling + e1 = pd.Series(type(data)(list(a) + [data._fill_value])) + e2 = pd.Series(type(data)([data._fill_value] + list(b))) + tm.assert_series_equal(r1.iloc[:3], e1.iloc[:3]) + assert r1[3].is_nan() + assert e1[3].is_nan() + + tm.assert_series_equal(r2.iloc[1:], e2.iloc[1:]) + assert r2[0].is_nan() + assert e2[0].is_nan() + + @pytest.mark.skip(reason="NaN Sorting") + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index a3e3c119561cc..e48209c143fd0 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -116,6 +116,7 @@ def data_missing(self): def na_cmp(self): return operator.eq + # Having trouble setting a sized object like {'a': 1} into a scalar slot @pytest.mark.skip(reason="Unorderable") def test_reduction_orderable(self, data, method): pass @@ -123,3 +124,31 @@ def test_reduction_orderable(self, data, method): @pytest.mark.skip(reason="Unhashable") def test_value_counts(self, all_data, dropna): pass + + @pytest.mark.xfail(reason="Difficulty setting sized objects.") + def test_set_scalar(self): + pass + + @pytest.mark.xfail(reason="Difficulty setting sized objects.") + def test_set_loc_scalar_mixed(self): + pass + + @pytest.mark.xfail(reason="Difficulty setting sized objects.") + def test_set_loc_scalar_single(self): + pass + + @pytest.mark.xfail(reason="Difficulty setting sized objects.") + def test_set_loc_scalar_multiple_homogoneous(self): + pass + + @pytest.mark.xfail(reason="Difficulty setting sized objects.") + def test_set_iloc_scalar_mixed(self): + pass + + @pytest.mark.xfail(reason="Difficulty setting sized objects.") + def test_set_iloc_scalar_single(self): + pass + + @pytest.mark.xfail(reason="Difficulty setting sized objects.") + def test_set_iloc_scalar_multiple_homogoneous(self): + pass From 8b1e7d61bd8411b641c63b27f8e444b8b49dc51b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 06:30:33 -0600 Subject: [PATCH 018/119] Compat --- pandas/core/internals.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ac490f946e5ee..21457d10303a8 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1879,7 +1879,9 @@ def _holder(self): @property def _can_hold_na(self): # The default ExtensionBlock._can_hold_na is True - return self._holder._can_hold_na + # Needed getattr to pass our old extension tests + # Check if geopandas needs this. + return getattr(self._holder, '_can_hold_na', True) @property def is_view(self): From 10af4b6b934234a23c2ba35fce02e3af23546a2c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 08:09:24 -0600 Subject: [PATCH 019/119] Fixed extension block tests. The only "API change" was that you can't just inherit from NonConsolidatableMixin, which is OK since 1. it's a mixin 2. 
geopandas also inherits from Block --- pandas/core/internals.py | 4 +--- pandas/tests/internals/test_external_block.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 21457d10303a8..ac490f946e5ee 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1879,9 +1879,7 @@ def _holder(self): @property def _can_hold_na(self): # The default ExtensionBlock._can_hold_na is True - # Needed getattr to pass our old extension tests - # Check if geopandas needs this. - return getattr(self._holder, '_can_hold_na', True) + return self._holder._can_hold_na @property def is_view(self): diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/internals/test_external_block.py index 2487363df8f99..991da41168aa0 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -5,12 +5,12 @@ import pandas as pd from pandas.core.internals import ( - BlockManager, SingleBlockManager, ExtensionBlock) + BlockManager, SingleBlockManager, NonConsolidatableMixIn, Block) import pytest -class CustomBlock(ExtensionBlock): +class CustomBlock(NonConsolidatableMixIn, Block): _holder = np.ndarray From cd5f1eb37d8fc46b959d032becb21789f897bbdd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 08:44:41 -0600 Subject: [PATCH 020/119] Clarify binop tests Make it clearer which bit might raise --- pandas/tests/extension_arrays/base.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index bfbd0d96c2615..269d4d7760930 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -412,14 +412,16 @@ def test_binops(self, data, binop): }) try: - expected = pd.DataFrame({ - "A": binop(df['A'], df['A']), - "B": binop(df['B'].values, df['B'].values), - }) - except Exception: + expected_array = binop(data, data) + except TypeError: msg = "{} not supported for {}".format(binop.__name__, data.dtype.name) - raise pytest.skip(msg) + pytest.skip(msg) + + expected = pd.DataFrame({ + "A": binop(df['A'], df['A']), + "B": expected_array, + }) result = binop(df, df) tm.assert_frame_equal(result, expected) From 0a9d9fd94988245317852c9712cb856b85f85f36 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 09:25:58 -0600 Subject: [PATCH 021/119] TST: Removed ops tests --- pandas/core/internals.py | 1 - pandas/tests/extension_arrays/base.py | 85 ------------------- pandas/tests/extension_arrays/test_decimal.py | 4 - pandas/tests/extension_arrays/test_json.py | 5 -- 4 files changed, 95 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ac490f946e5ee..2f61da7f719c0 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1939,7 +1939,6 @@ def _slice(self, slicer): return self.values[slicer] def setitem(self, indexer, value, mgr=None): - print(indexer) if isinstance(indexer, tuple): indexer = indexer[0] self.values[indexer] = value diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 269d4d7760930..b4362b6795737 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -348,88 +348,3 @@ def test_value_counts(self, all_data, dropna): expected = pd.Series(other).value_counts(dropna=dropna).sort_index() tm.assert_series_equal(result, expected) - - # 
------------------------------------------------------------------------ - # Ops - # ------------------------------------------------------------------------ - - @pytest.mark.parametrize("method", [ - "mean", "sum", "prod", "mad", "sem", "var", "std", - "skew", "kurt", "median" - ]) - def test_nuisance_dropped(self, data, method): - data = data[:5] - func = operator.methodcaller(method) - df = pd.DataFrame({"A": np.arange(len(data)), - "B": data}) - obj = pd.DataFrame({"A": np.arange(len(data)), - "B": np.array(data, dtype=object)}) - - assert len(func(df)) == len(func(obj)) - - @pytest.mark.parametrize("method", [min, max]) - def test_reduction_orderable(self, data, method): - data = data[:5] - func = operator.methodcaller(method.__name__) - df = pd.DataFrame({"A": np.arange(len(data)), - "B": data}) - result = func(df) - assert len(result) == 2 - - expected = method(data) - assert result['B'] == expected - - @pytest.mark.parametrize("method", ['cummax', 'cummin']) - @pytest.mark.xfail(reason="Assumes comparable to floating.") - def test_cumulative_orderable(self, data, method): - # Upcast to object - # https://github.com/pandas-dev/pandas/issues/19296 - # assert result.dtypes['B'] == data.dtype - data = data[:5] - func = operator.methodcaller(method) - df = pd.DataFrame({"A": np.arange(len(data)), - "B": data}) - result = func(df) - assert result.shape == df.shape - - @pytest.mark.parametrize("binop", [ - operator.add, - operator.sub, - operator.lt, - operator.le, - operator.ge, - operator.gt, - operator.pow, - ], ids=lambda x: x.__name__) - def test_binops(self, data, binop): - # Assert that binops work between DataFrames / Series with this type - # if binops work between arrays of this type. Extra tests will be - # needed for, e.g., Array + scalar - data = data[:5] - df = pd.DataFrame({ - "A": np.arange(len(data)), - "B": data - }) - - try: - expected_array = binop(data, data) - except TypeError: - msg = "{} not supported for {}".format(binop.__name__, - data.dtype.name) - pytest.skip(msg) - - expected = pd.DataFrame({ - "A": binop(df['A'], df['A']), - "B": expected_array, - }) - - result = binop(df, df) - tm.assert_frame_equal(result, expected) - - # homogeneous frame - result = binop(df[['B']], df[['B']]) - tm.assert_frame_equal(result, expected[['B']]) - - # series - result = binop(df['B'], df['B']) - tm.assert_series_equal(result, expected['B']) diff --git a/pandas/tests/extension_arrays/test_decimal.py b/pandas/tests/extension_arrays/test_decimal.py index f6b281516e115..687e645825a75 100644 --- a/pandas/tests/extension_arrays/test_decimal.py +++ b/pandas/tests/extension_arrays/test_decimal.py @@ -113,10 +113,6 @@ def data_missing(self): def na_cmp(self): return lambda x, y: x.is_nan() and y.is_nan() - @pytest.mark.skip(reason="Who knows.") - def test_repr(self, data): - super().test_repr(data) - def test_align(self, data): a = data[:3] b = data[2:5] diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index e48209c143fd0..343168a63da60 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -116,11 +116,6 @@ def data_missing(self): def na_cmp(self): return operator.eq - # Having trouble setting a sized object like {'a': 1} into a scalar slot - @pytest.mark.skip(reason="Unorderable") - def test_reduction_orderable(self, data, method): - pass - @pytest.mark.skip(reason="Unhashable") def test_value_counts(self, all_data, dropna): pass From 3185f4e08fdde6736a02edb52da2647cae8d599c Mon 
Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 13:17:40 -0600 Subject: [PATCH 022/119] Cleanup unique handling --- pandas/core/base.py | 4 +--- pandas/core/indexes/datetimes.py | 12 ++++++++++++ pandas/core/series.py | 4 +--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index ab4c969810c93..7a8b5f9b608c7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -975,10 +975,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, @Appender(_shared_docs['unique'] % _indexops_doc_kwargs) def unique(self): values = self._values - if isinstance(values, ABCDatetimeIndex): - values = values._ndarray_values + # TODO: Make unique part of the ExtensionArray interface. - # else, this could be surprising. if hasattr(values, 'unique'): result = values.unique() diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index c32d7ce930a7c..d749f8aec50cd 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1095,6 +1095,18 @@ def snap(self, freq='S'): # we know it conforms; skip check return DatetimeIndex(snapped, freq=freq, verify_integrity=False) + def unique(self, level=None): + # Override here since IndexOpsMixin.unique uses self._values.unique + # For DatetimeIndex with TZ, that's a DatetimeIndex -> recursion error + # So we extract the tz-naive DatetimeIndex, unique that, and wrap the + # result with out TZ. + if self.tz is not None: + naive = type(self)(self._ndarray_values, copy=False) + else: + naive = self + result = super(DatetimeIndex, naive).unique(level=level) + return self._simple_new(result, name=self.name, tz=self.tz, freq=self.freq) + def union(self, other): """ Specialized union for DatetimeIndex objects. If combine diff --git a/pandas/core/series.py b/pandas/core/series.py index b0ad76d12f1d9..e4b8979d6393a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1303,9 +1303,7 @@ def unique(self): # to return an object array of tz-aware Timestamps # TODO: it must return DatetimeArray with tz in pandas 2.0 - # XXX: This surely will have issues around DST boundaries. 
- result = (DatetimeIndex(result, tz='UTC').tz_convert(self.dtype.tz) - .astype(object).values) + result = result.astype(object).values return result From 476f75d3b8cf07fb9965a1fa96dcdf932a01bde8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 14:29:02 -0600 Subject: [PATCH 023/119] Simplify object concat --- pandas/core/dtypes/concat.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a49a2680e4daa..d6b55d03ebccd 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -493,20 +493,11 @@ def _concat_index_asobject(to_concat, name=None): to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] - from pandas import Index self = to_concat[0] attribs = self._get_attributes_dict() attribs['name'] = name - arrays = [] - for x in to_concat: - if is_categorical_dtype(x): - arrays.append(np.asarray(x, dtype=object)) - elif isinstance(x, Index): - arrays.append(x._values) - else: - arrays.append(x) - + arrays = [np.array(x, copy=False, dtype=object) for x in to_concat] return self._shallow_copy_with_infer(np.concatenate(arrays), **attribs) From b15ee5a000003e42bf65389308c7277b6461fd05 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 14:38:58 -0600 Subject: [PATCH 024/119] Use values for intersection I think eventually we'll want to ndarray_values for this, but it'll require a bit more work to support. Currently, using ndarary_values causes occasional failures on categorical. --- pandas/core/indexes/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dd4c8ac2e86a3..70c0c822fb5e8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2347,8 +2347,7 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self._ndarray_values, - other._ndarray_values)[0] + result = self._inner_indexer(self._values, other._values)[0] return self._wrap_union_result(other, result) except TypeError: pass From 659073f8a67e513267048d467da715c60d885c51 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 15:14:32 -0600 Subject: [PATCH 025/119] hmm --- pandas/core/indexes/base.py | 22 +++++++++++++++++++++- pandas/core/indexes/category.py | 17 +++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 70c0c822fb5e8..260016661a735 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2310,6 +2310,24 @@ def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None return self.__class__(result, name=name) + def _ensure_join(self, values): + """Ensure that the 'values' are ready for our join indexer. + + The default join indexers are object, so this just returns 'values'. + This is called before calling those. + + + Parameters + ---------- + values : array-like + + Returns + ------- + values : ndarray + Expected to have the correct type for self.inner_indexer + """ + return values + def intersection(self, other): """ Form the intersection of two Index objects. 
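As a concrete, hedged illustration of the reasoning in the commit message above (the example values are invented; `_ndarray_values` for a categorical is its integer codes, as the `test_ndarray_values` parameters added later in this series show): two categorical indexes can share identical codes while holding different labels, which is why a join computed on `_ndarray_values` alone can silently match the wrong elements.

    import pandas as pd

    a = pd.CategoricalIndex(['a', 'b'])   # categories ['a', 'b'], codes [0, 1]
    b = pd.CategoricalIndex(['b', 'c'])   # categories ['b', 'c'], codes [0, 1]

    # Both expose codes [0, 1] even though only 'b' is a shared label, so an
    # intersection on the raw codes would wrongly pair 'a' with 'b'.
    a._ndarray_values   # array([0, 1], dtype=int8)
    b._ndarray_values   # array([0, 1], dtype=int8)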
@@ -2347,7 +2365,9 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self._values, other._values)[0] + lvals = self._ensure_join(self._ndarray_values) + rvals = self._ensure_join(other._ndarray_values) + result = self._inner_indexer(lvals, rvals)[0] return self._wrap_union_result(other, result) except TypeError: pass diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 5b01f7d2cbe95..48cdd28911487 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,5 +1,6 @@ import numpy as np from pandas._libs import index as libindex +from pandas._libs import join as libjoin from pandas import compat from pandas.compat.numpy import function as nv @@ -8,6 +9,8 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, _ensure_platform_int, + _ensure_int32, + _ensure_int64, is_list_like, is_interval_dtype, is_scalar) @@ -214,6 +217,14 @@ def _shallow_copy(self, values=None, categories=None, ordered=None, values=values, categories=categories, ordered=ordered, **kwargs) + @cache_readonly + def _inner_indexer(self): + if self.codes.dtype.itemsize <= 4: + # int8, int16, int32 + return libjoin.inner_join_indexer_int32 + else: + return libjoin.inner_join_indexer_int64 + def _is_dtype_compat(self, other): """ *this is an internal non-public method* @@ -787,6 +798,12 @@ def _delegate_method(self, name, *args, **kwargs): return res return CategoricalIndex(res, name=self.name) + def _ensure_join(self, values): + if self.codes.dtype.itemsize <= 4: + return _ensure_int32(values) + else: + return _ensure_int64(values) + @classmethod def _add_accessors(cls): """ add in Categorical accessor methods """ From b15ecac9a4aaaa0b7fbc4d4df0644affc25a5f20 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 16:52:38 -0600 Subject: [PATCH 026/119] More failing tests --- pandas/tests/extension_arrays/base.py | 69 ++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index b4362b6795737..d0f82db3a4d14 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -131,6 +131,7 @@ def test_series_constructor(self, data): assert result.dtype == data.dtype assert len(result) == len(data) assert isinstance(result._data.blocks[0], ExtensionBlock) + assert result._data.blocks[0].values is data @pytest.mark.parametrize("from_series", [True, False]) def dataframe_constructor(self, data, from_series): @@ -159,12 +160,14 @@ def test_concat(self, data): pd.Series(data), ], ignore_index=True) assert len(result) == len(data) * 2 + assert result.dtype == data.dtype + assert isinstance(result._data.blocks[0], ExtensionBlock) # ------------------------------------------------------------------------ # Indexing - getting # ------------------------------------------------------------------------ - def test_iloc(self, data): + def test_iloc_series(self, data): ser = pd.Series(data) result = ser.iloc[:4] expected = pd.Series(data[:4]) @@ -173,7 +176,29 @@ def test_iloc(self, data): result = ser.iloc[[0, 1, 2, 3]] tm.assert_series_equal(result, expected) - def test_loc(self, data): + def test_iloc_frame(self, data): + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.iloc[:4, [0]] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.iloc[[0, 
1, 2, 3], [0]] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name='A') + + # slice -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + def test_loc_series(self, data): ser = pd.Series(data) result = ser.loc[:3] expected = pd.Series(data[:4]) @@ -182,6 +207,28 @@ def test_loc(self, data): result = ser.loc[[0, 1, 2, 3]] tm.assert_series_equal(result, expected) + def test_loc_frame(self, data): + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.loc[:3, ['A']] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.loc[[0, 1, 2, 3], ['A']] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name='A') + + # slice -> series + result = df.loc[:3, 'A'] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.loc[:3, 'A'] + tm.assert_series_equal(result, expected) + def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) @@ -298,6 +345,24 @@ def test_set_iloc_scalar_multiple_homogoneous(self, data): df.iloc[10, 1] = data[1] assert df.loc[10, 'B'] == data[1] + def test_set_mask_aligned(self, data): + ser = pd.Series(data) + mask = np.zeros(len(data), dtype=bool) + mask[:2] = True + + ser[mask] = data[5:7] + assert ser[0] == data[5] + assert ser[1] == data[6] + + def test_set_mask_broadcast(self, data): + ser = pd.Series(data) + mask = np.zeros(len(data), dtype=bool) + mask[:2] = True + + ser[mask] = data[10] + assert ser[0] == data[10] + assert ser[1] == data[10] + def test_setitem_expand_columns(self, data): df = pd.DataFrame({"A": data}) df['B'] = 1 From 88b8f4fea5b588b7fb1c76abd3b599e100b4a8c3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 16:54:17 -0600 Subject: [PATCH 027/119] remove bad test --- pandas/tests/extension_arrays/base.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index d0f82db3a4d14..939fb6fd05bc2 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -142,14 +142,6 @@ def dataframe_constructor(self, data, from_series): assert result.shape == (len(data), 1) assert isinstance(result._data.blocks[0], ExtensionBlock) - @pytest.mark.xfail(reason="GH-19342") - def test_series_given_index(self, data): - result = pd.Series(data[:3], index=[0, 1, 2, 3, 4]) - assert result.dtype == data.dtype - assert len(result) == 5 - assert len(result.values) == 5 - assert pd.isna(result.loc[[3, 4]]).all() - # ------------------------------------------------------------------------ # Reshaping # ------------------------------------------------------------------------ From 349ac1ab06bfb0f51793a75f9270737139001a4f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 21:45:07 -0600 Subject: [PATCH 028/119] better setitem --- pandas/core/indexing.py | 2 + pandas/core/internals.py | 5 +- pandas/tests/extension_arrays/test_json.py | 54 +++++++++++++++------- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9463512ac11de..8f0f88a24552d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -618,6 +618,8 @@ def can_do_equal_len(): return if isinstance(value, (ABCSeries, dict)): 
+ # TODO (maybe) this causes issues with setting for + # extensionarrays that store dicts. value = self._align_series(indexer, Series(value)) elif isinstance(value, ABCDataFrame): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 2f61da7f719c0..4d328bc8af189 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -15,6 +15,7 @@ from pandas.core.base import PandasObject +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( ExtensionDtype, DatetimeTZDtype, CategoricalDtype) @@ -3481,7 +3482,9 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, aligned_args = dict((k, kwargs[k]) for k in align_keys - if hasattr(kwargs[k], 'values')) + if hasattr(kwargs[k], 'values') + # eww + and not isinstance(kwargs[k], ExtensionArray)) for b in self.blocks: if filter is not None: diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index 343168a63da60..b4c23de6852b1 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -50,13 +50,21 @@ def __setitem__(self, key, value): if isinstance(key, numbers.Integral): self.data[key] = value else: - if not isinstance(value, collections.Sequence): + if not isinstance(value, (type(self), + collections.Sequence)): # broadcast value value = itertools.cycle([value]) - for k, v in zip(key, value): - assert isinstance(v, self.dtype.type) - self.data[k] = v + if isinstance(key, np.ndarray) and key.dtype == 'bool': + # masking + for i, (k, v) in enumerate(zip(key, value)): + if k: + assert isinstance(v, self.dtype.type) + self.data[i] = v + else: + for k, v in zip(key, value): + assert isinstance(v, self.dtype.type) + self.data[k] = v def __len__(self): return len(self.data) @@ -90,8 +98,10 @@ def _concat_same_type(cls, to_concat): def make_data(): - return [{random.choice(string.ascii_letters): random.randint(0, 100) - for _ in range(random.randint(0, 10))} for _ in range(100)] + # TODO: Use a regular dict. 
See _NDFrameIndexer._setitem_with_indexer + return [collections.UserDict([ + (random.choice(string.ascii_letters), random.randint(0, 100)) + for _ in range(random.randint(0, 10))]) for _ in range(100)] class TestJSONDtype(BaseDtypeTests): @@ -120,30 +130,42 @@ def na_cmp(self): def test_value_counts(self, all_data, dropna): pass - @pytest.mark.xfail(reason="Difficulty setting sized objects.") - def test_set_scalar(self): - pass + # @pytest.mark.xfail(reason="Difficulty setting sized objects.") + # def test_set_scalar(self): + # pass + # @pytest.mark.xfail(reason="Difficulty setting sized objects.") def test_set_loc_scalar_mixed(self): + # This fails on an np.ndarary(dict) call in _setitem_with_indexer pass - @pytest.mark.xfail(reason="Difficulty setting sized objects.") - def test_set_loc_scalar_single(self): - pass + # @pytest.mark.xfail(reason="Difficulty setting sized objects.") + # def test_set_loc_scalar_single(self): + # pass + # @pytest.mark.xfail(reason="Difficulty setting sized objects.") def test_set_loc_scalar_multiple_homogoneous(self): + # This fails in _setitem_with_indexer with a + # ValueError: Must have equal len keys and value when setting with + # and iterable pass @pytest.mark.xfail(reason="Difficulty setting sized objects.") def test_set_iloc_scalar_mixed(self): + # This fails in _setitem_with_indexer with a + # ValueError: Must have equal len keys and value when setting with an + # iterable pass - @pytest.mark.xfail(reason="Difficulty setting sized objects.") - def test_set_iloc_scalar_single(self): - pass - + # @pytest.mark.xfail(reason="Difficulty setting sized objects.") + # def test_set_iloc_scalar_single(self): + # pass + # @pytest.mark.xfail(reason="Difficulty setting sized objects.") def test_set_iloc_scalar_multiple_homogoneous(self): + # this fails in _setitem_with_indexer with a + # ValueError: Must have equal len keys and value when setting with an + # iterable pass From 27ab045e3d83871a9b28d532c8d44b0adc238fff Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 22:15:01 -0600 Subject: [PATCH 029/119] Dropna works. --- pandas/core/frame.py | 2 +- pandas/core/internals.py | 7 ++++++ pandas/tests/extension_arrays/base.py | 32 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ec0b805b590fe..f565154d5c678 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5622,7 +5622,7 @@ def count(self, axis=0, level=None, numeric_only=False): if len(frame._get_axis(axis)) == 0: result = Series(0, index=frame._get_agg_axis(axis)) else: - if frame._is_mixed_type: + if frame._is_mixed_type or frame._data.any_extension_types: result = notna(frame).sum(axis=axis) else: counts = notna(frame.values).sum(axis=axis) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4d328bc8af189..fe59c0fe6ee2b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -103,6 +103,7 @@ class Block(PandasObject): is_object = False is_categorical = False is_sparse = False + is_extension = False _box_to_block_values = True _can_hold_na = False _can_consolidate = True @@ -1858,6 +1859,7 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): ExtensionArrays are limited to 1-D. 
""" + is_extension = True def __init__(self, values, placement, ndim=None): values = self._maybe_coerce_values(values) @@ -3727,6 +3729,11 @@ def is_datelike_mixed_type(self): self._consolidate_inplace() return any(block.is_datelike for block in self.blocks) + @property + def any_extension_types(self): + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + @property def is_view(self): """ return a boolean if we are a single block and are a view """ diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 939fb6fd05bc2..a2f87d3bfb302 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -405,3 +405,35 @@ def test_value_counts(self, all_data, dropna): expected = pd.Series(other).value_counts(dropna=dropna).sort_index() tm.assert_series_equal(result, expected) + + def test_count(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + result = df.count(axis='columns') + expected = pd.Series([0, 1]) + tm.assert_series_equal(result, expected) + + def test_dropna_series(self, data_missing): + ser = pd.Series(data_missing) + result = ser.dropna() + expected = ser.iloc[[1]] + tm.assert_series_equal(result, expected) + + def test_dropna_frame(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + + # defaults + result = df.dropna() + expected = df.iloc[[1]] + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.dropna(axis='columns') + expected = pd.DataFrame(index=[0, 1]) + tm.assert_frame_equal(result, expected) + + # multiple + df = pd.DataFrame({"A": data_missing, + "B": [1, np.nan]}) + result = df.dropna() + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) From 8358fb10de6bc49d7f435e1332aabe9d5a31b85b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 08:15:50 -0600 Subject: [PATCH 030/119] Restore xfail test --- pandas/tests/extension_arrays/base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index a2f87d3bfb302..7a5a3e788b384 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -142,6 +142,14 @@ def dataframe_constructor(self, data, from_series): assert result.shape == (len(data), 1) assert isinstance(result._data.blocks[0], ExtensionBlock) + @pytest.mark.xfail(reason="GH-19342") + def test_series_given_mismatched_index_raises(self, data): + msg = 'Wrong number of items passed 3, placement implies 4' + with tm.assert_raises_regex(ValueError, None) as m: + pd.Series(data[:3], index=[0, 1, 2, 3, 4]) + + assert m.match(msg) + # ------------------------------------------------------------------------ # Reshaping # ------------------------------------------------------------------------ From 8ef34a96c359e2b1798803f83f1193f243d51328 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 08:16:09 -0600 Subject: [PATCH 031/119] Test Categorical --- pandas/core/arrays/categorical.py | 5 ++ .../extension_arrays/test_categorical.py | 63 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 pandas/tests/extension_arrays/test_categorical.py diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 62c6a6b16cbe9..6c5b0c9d2be98 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2137,6 +2137,10 @@ def repeat(self, repeats, *args, **kwargs): def 
_can_hold_na(self): return True + @property + def _fill_value(self): + return np.nan + @classmethod def _concat_same_type(self, to_concat): from pandas.core.dtypes.concat import _concat_categorical @@ -2146,6 +2150,7 @@ def _concat_same_type(self, to_concat): def _formatting_values(self): return self + # The Series.cat accessor diff --git a/pandas/tests/extension_arrays/test_categorical.py b/pandas/tests/extension_arrays/test_categorical.py new file mode 100644 index 0000000000000..237963bc38415 --- /dev/null +++ b/pandas/tests/extension_arrays/test_categorical.py @@ -0,0 +1,63 @@ +import string + +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas.api.types import CategoricalDtype +from pandas import Categorical +from .base import BaseArrayTests, BaseDtypeTests + + +class TestCategoricalDtype(BaseDtypeTests): + @pytest.fixture + def dtype(self): + return CategoricalDtype() + + +def make_data(): + return np.random.choice(list(string.ascii_letters), size=100) + + +class TestCategoricalArray(BaseArrayTests): + + @pytest.fixture + def data(self): + """Length-100 PeriodArray for semantics test.""" + return Categorical(make_data()) + + @pytest.fixture + def data_missing(self): + """Length 2 array with [NA, Valid]""" + return Categorical([np.nan, 'A']) + + @pytest.mark.skip(reason="Memory usage doesn't match") + def test_memory_usage(self): + # Is this deliberate? + pass + + @pytest.mark.skip(reason="Backwards compatability") + def test_getitem_scalar(self): + # CategoricalDtype.type isn't "correct" since it should + # be a parent of the elements (object). But don't want + # to break things by changing. + pass + + def test_align(self, data): + # Override to pass through dtype + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # TODO: assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(type(data)(list(a) + [data._fill_value], + dtype=data.dtype)) + e2 = pd.Series(type(data)([data._fill_value] + list(b), + dtype=data.dtype)) + tm.assert_series_equal(r1, e1) + tm.assert_series_equal(r2, e2) + + @pytest.mark.skip(reason="Different value_counts semantics.") + def test_value_counts(self, all_data, dropna): + pass From 340d11be7b4415238a7a89fea539abee7c07e338 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 08:30:20 -0600 Subject: [PATCH 032/119] Xfail setitem tests --- pandas/core/indexing.py | 5 +++-- pandas/core/internals.py | 12 +++--------- pandas/tests/extension_arrays/base.py | 13 +++++++++++++ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8f0f88a24552d..515fe7d3b1d9d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -618,8 +618,9 @@ def can_do_equal_len(): return if isinstance(value, (ABCSeries, dict)): - # TODO (maybe) this causes issues with setting for - # extensionarrays that store dicts. + # TODO: ExtensionBlock.setitem this causes issues with setting for + # extensionarrays that store dicts. Need to decide if it's worth + # supporting that case. 
value = self._align_series(indexer, Series(value)) elif isinstance(value, ABCDataFrame): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index fe59c0fe6ee2b..b778900157743 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1941,12 +1941,6 @@ def _slice(self, slicer): return self.values[slicer] - def setitem(self, indexer, value, mgr=None): - if isinstance(indexer, tuple): - indexer = indexer[0] - self.values[indexer] = value - return self - def formatting_values(self): return self.values._formatting_values() @@ -3482,11 +3476,11 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, else: align_keys = [] + # TODO: may interfere with ExtensionBlock.setitem for blocks + # with a .values attribute. aligned_args = dict((k, kwargs[k]) for k in align_keys - if hasattr(kwargs[k], 'values') - # eww - and not isinstance(kwargs[k], ExtensionArray)) + if hasattr(kwargs[k], 'values')) for b in self.blocks: if filter is not None: diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension_arrays/base.py index 7a5a3e788b384..dc9bca653e6f3 100644 --- a/pandas/tests/extension_arrays/base.py +++ b/pandas/tests/extension_arrays/base.py @@ -288,11 +288,13 @@ def test_take_sequence(self, data): # Indexing - Setting # ------------------------------------------------------------------------ + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_setitem_scalar(self, data): arr = pd.Series(data) arr[0] = data[1] assert arr[0] == data[1] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_setitem_sequence(self, data): arr = pd.Series(data) original = data.copy() @@ -301,6 +303,7 @@ def test_setitem_sequence(self, data): assert arr[0] == original[1] assert arr[1] == original[0] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_setitem_sequence_broadcasts(self, data): arr = pd.Series(data) @@ -308,6 +311,7 @@ def test_setitem_sequence_broadcasts(self, data): assert arr[0] == data[2] assert arr[1] == data[2] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") @pytest.mark.parametrize('setter', ['loc', 'iloc']) def test_set_scalar(self, data, setter): arr = pd.Series(data) @@ -315,36 +319,43 @@ def test_set_scalar(self, data, setter): operator.setitem(setter, 0, data[1]) assert arr[0] == data[1] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_set_loc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) df.loc[0, 'B'] = data[1] assert df.loc[0, 'B'] == data[1] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_set_loc_scalar_single(self, data): df = pd.DataFrame({"B": data}) df.loc[10, 'B'] = data[1] assert df.loc[10, 'B'] == data[1] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_set_loc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) df.loc[10, 'B'] = data[1] assert df.loc[10, 'B'] == data[1] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_set_iloc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) df.iloc[0, 1] = data[1] assert df.loc[0, 'B'] == data[1] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_set_iloc_scalar_single(self, data): df = pd.DataFrame({"B": data}) df.iloc[10, 0] = data[1] assert df.loc[10, 'B'] == data[1] + 
@pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_set_iloc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) df.iloc[10, 1] = data[1] assert df.loc[10, 'B'] == data[1] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_set_mask_aligned(self, data): ser = pd.Series(data) mask = np.zeros(len(data), dtype=bool) @@ -354,6 +365,7 @@ def test_set_mask_aligned(self, data): assert ser[0] == data[5] assert ser[1] == data[6] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_set_mask_broadcast(self, data): ser = pd.Series(data) mask = np.zeros(len(data), dtype=bool) @@ -363,6 +375,7 @@ def test_set_mask_broadcast(self, data): assert ser[0] == data[10] assert ser[1] == data[10] + @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") def test_setitem_expand_columns(self, data): df = pd.DataFrame({"A": data}) df['B'] = 1 From 82978886564fb299462b4a5752ff9ca9a47a48c3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 08:34:55 -0600 Subject: [PATCH 033/119] TST: Skip JSON tests on py2 --- pandas/tests/extension_arrays/test_json.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension_arrays/test_json.py index b4c23de6852b1..515272a4850f9 100644 --- a/pandas/tests/extension_arrays/test_json.py +++ b/pandas/tests/extension_arrays/test_json.py @@ -16,6 +16,10 @@ from .base import BaseArrayTests, BaseDtypeTests +pytestmark = pytest.mark.skipif(sys.version_info[0] == 2, + reason="Py2 doesn't have a UserDict") + + class JSONDtype(ExtensionDtype): type = collections.Mapping name = 'json' From 9b8d2a51857a4d8c78ce09c6e54097ab9eddbb08 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 08:54:19 -0600 Subject: [PATCH 034/119] Additional testing --- pandas/tests/test_base.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 5a67aa3f989ae..0dbced114ce51 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1211,16 +1211,17 @@ def test_unique_datetime_series(arr, expected): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('array, expected_type', [ - (np.array([0, 1]), np.ndarray), - (np.array(['a', 'b']), np.ndarray), - (pd.Categorical(['a', 'b']), pd.Categorical), - (pd.DatetimeIndex(['2017', '2018']), np.ndarray), - (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray), - (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray), - (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex), +@pytest.mark.parametrize('array, expected_type, dtype', [ + (np.array([0, 1]), np.ndarray, 'int64'), + (np.array(['a', 'b']), np.ndarray, 'object'), + (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), + (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), + (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), + (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex, + 'datetime64[ns, US/Central]'), ]) -def test_values_consistent(array, expected_type): +def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values r_values = pd.Index(array)._values assert type(l_values) is expected_type @@ -1234,3 +1235,13 @@ def test_values_consistent(array, expected_type): 
tm.assert_categorical_equal(l_values, r_values) else: raise TypeError("Unexpected type {}".format(type(l_values))) + + assert l_values.dtype == dtype + assert r_values.dtype == dtype + + +def test_values_periodindex(): + arr = pd.period_range("2017", periods=4, freq='D') + result = arr._values + expected = np.array(arr.astype(object)) + tm.assert_numpy_array_equal(result, expected) From 9fbac2959dc34f64133b44fa8274189abcc07655 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 13:55:34 -0600 Subject: [PATCH 035/119] More tests --- pandas/tests/test_base.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 0dbced114ce51..94449663b580b 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1216,10 +1216,11 @@ def test_unique_datetime_series(arr, expected): (np.array(['a', 'b']), np.ndarray, 'object'), (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), - (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), - (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex, 'datetime64[ns, US/Central]'), + (pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]'), + (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), ]) def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values @@ -1245,3 +1246,24 @@ def test_values_periodindex(): result = arr._values expected = np.array(arr.astype(object)) tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize('array, expected', [ + (np.array([0, 1]), np.array([0, 1])), + (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), + (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')), + (pd.DatetimeIndex(['2017-01-01T00:00:00']), + np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), + (pd.DatetimeIndex(['2017-01-01T00:00:00'], tz="US/Eastern"), + np.array(['2017-01-01T05:00:00'], dtype='M8[ns]')), + (pd.TimedeltaIndex([10**10]), np.array([10**10], dtype='m8[ns]')), + pytest.mark.xfail(reason='PeriodArray not implemented')(( + pd.PeriodIndex(['2017', '2018'], freq='D'), + np.array([17167, 17532]), + )), +]) +def test_ndarray_values(array, expected): + l_values = pd.Series(array)._ndarray_values + r_values = pd.Index(array)._ndarray_values + tm.assert_numpy_array_equal(l_values, r_values) + tm.assert_numpy_array_equal(l_values, expected) From 55305dc197cf7444aa50eab3ba426d5b7244672a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 14:29:08 -0600 Subject: [PATCH 036/119] ndarray_values --- pandas/core/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/base.py b/pandas/core/base.py index 62f237e253c96..dd950a7b8ff00 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -772,6 +772,11 @@ def base(self): def _ndarray_values(self): """The data as an ndarray. 
See '_values' for more.""" # type: () -> np.ndarray + from pandas.core.dtypes.common import is_categorical_dtype + + if is_categorical_dtype(self): + return self._values.codes + return self.values @property From 0e637086e1e89ed7c580e5b731b030d524431a34 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 15:01:28 -0600 Subject: [PATCH 037/119] API: Default ExtensionArray.astype (cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a) (cherry picked from commit fbf0a0672380e210d3cb3c527fa8045a204d81be) --- pandas/core/arrays/base.py | 30 +++++++++++++++++ pandas/tests/extension_arrays/test_common.py | 34 ++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 pandas/tests/extension_arrays/test_common.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1556b653819a6..8c3d033dffba7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1,4 +1,6 @@ """An interface for extending pandas with custom arrays.""" +import numpy as np + from pandas.errors import AbstractMethodError _not_implemented_message = "{} does not implement {}." @@ -138,6 +140,34 @@ def nbytes(self): # ------------------------------------------------------------------------ # Additional Methods # ------------------------------------------------------------------------ + def astype(self, dtype, copy=True): + """Cast to a NumPy array with 'dtype'. + + The default implementation only allows casting to 'object' dtype. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray + NumPy ndarray with 'dtype' for its dtype. + """ + np_dtype = np.dtype(dtype) + + if np_dtype != 'object': + msg = ("{} can only be coerced to 'object' dtype, " + "not '{}'.").format(type(self).__name__, dtype) + raise ValueError(msg) + + return np.array(self, dtype=np_dtype, copy=copy) + def isna(self): # type: () -> np.ndarray """Boolean NumPy array indicating if each value is missing. 
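A short usage sketch of the default `astype` added above (illustrative only, assuming the patched tree; it reuses the `DecimalArray` from `test_decimal.py` earlier in this series, which does not override `astype` and so falls back to this default):

    import decimal
    from pandas.tests.extension_arrays.test_decimal import DecimalArray

    arr = DecimalArray([decimal.Decimal('1.0'), decimal.Decimal('2.0')])

    arr.astype(object)     # object-dtype ndarray holding the Decimal objects
    arr.astype('float64')  # raises ValueError: the default only allows 'object'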
diff --git a/pandas/tests/extension_arrays/test_common.py b/pandas/tests/extension_arrays/test_common.py new file mode 100644 index 0000000000000..7feb7fdf09ec6 --- /dev/null +++ b/pandas/tests/extension_arrays/test_common.py @@ -0,0 +1,34 @@ +import numpy as np + +import pandas.util.testing as tm +from pandas.core.arrays import ExtensionArray + + +class DummyArray(ExtensionArray): + + def __init__(self, data): + self.data = data + + def __array__(self, dtype): + return self.data + + +def test_astype(): + arr = DummyArray(np.array([1, 2, 3])) + expected = np.array([1, 2, 3], dtype=object) + + result = arr.astype(object) + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype('object') + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_raises(): + arr = DummyArray(np.array([1, 2, 3])) + + xpr = ("DummyArray can only be coerced to 'object' dtype, not " + "''") + + with tm.assert_raises_regex(ValueError, xpr): + arr.astype(int) From fbbbc8a08b9bfe66cbe06621795163d65dbd3c77 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 15:22:43 -0600 Subject: [PATCH 038/119] Simplify concat_as_object --- pandas/core/dtypes/concat.py | 10 +++++++--- pandas/tests/indexes/test_category.py | 8 ++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index d6b55d03ebccd..b36dc03bbc82b 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -488,8 +488,11 @@ def _concat_index_asobject(to_concat, name=None): concat all inputs as object. DatetimeIndex, TimedeltaIndex and PeriodIndex are converted to object dtype before concatenation """ + from pandas import Index + from pandas.core.arrays import ExtensionArray - klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex + klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, + ExtensionArray) to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] @@ -497,8 +500,9 @@ def _concat_index_asobject(to_concat, name=None): attribs = self._get_attributes_dict() attribs['name'] = name - arrays = [np.array(x, copy=False, dtype=object) for x in to_concat] - return self._shallow_copy_with_infer(np.concatenate(arrays), **attribs) + to_concat = [x._values if isinstance(x, Index) else x + for x in to_concat] + return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) def _concat_sparse(to_concat, axis=0, typs=None): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c2e40c79f8914..e9fddfde90348 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -353,6 +353,14 @@ def test_append(self): expected = Index(list('caaabbca')) tm.assert_index_equal(result, expected, exact=True) + def test_append_to_another(self): + # hits _concat_index_asobject + fst = Index(['a', 'b']) + snd = CategoricalIndex(['d', 'e']) + result = fst.append(snd) + expected = Index(['a', 'b', 'd', 'e']) + tm.assert_index_equal(result, expected) + def test_insert(self): ci = self.create_index() From 46a0a49352a1242077e616056f802b0ce35eb8d9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 15:46:36 -0600 Subject: [PATCH 039/119] Py2 compat (cherry picked from commit b20e12cae68dd86ff51597464045656763d369f7) --- pandas/tests/extension_arrays/test_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension_arrays/test_common.py 
b/pandas/tests/extension_arrays/test_common.py index 7feb7fdf09ec6..f19754482b04f 100644 --- a/pandas/tests/extension_arrays/test_common.py +++ b/pandas/tests/extension_arrays/test_common.py @@ -27,8 +27,10 @@ def test_astype(): def test_astype_raises(): arr = DummyArray(np.array([1, 2, 3])) + # type int for py2 + # class int for py3 xpr = ("DummyArray can only be coerced to 'object' dtype, not " - "''") + "'<.* 'int'>'") with tm.assert_raises_regex(ValueError, xpr): arr.astype(int) From 2c4445a365d19979b400295ce6a7c671396cb0da Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 16:30:11 -0600 Subject: [PATCH 040/119] Set-ops ugliness --- pandas/core/indexes/base.py | 52 +++++++++++++-------------------- pandas/core/indexes/category.py | 6 ---- 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 260016661a735..3ce3ecce1c140 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,12 +31,14 @@ is_object_dtype, is_categorical_dtype, is_interval_dtype, + is_period_dtype, is_bool, is_bool_dtype, is_signed_integer_dtype, is_unsigned_integer_dtype, is_integer_dtype, is_float_dtype, is_datetime64_any_dtype, + is_datetime64tz_dtype, is_timedelta64_dtype, needs_i8_conversion, is_iterator, is_list_like, @@ -2252,15 +2254,15 @@ def union(self, other): other = other.astype('O') return this.union(other) - if is_categorical_dtype(self): - lvals = self.values - else: + # TODO: setops-refactor, clean all this up + if is_period_dtype(self) or is_datetime64tz_dtype(self): lvals = self._ndarray_values - - if is_categorical_dtype(other): - rvals = other.values else: + lvals = self._values + if is_period_dtype(other) or is_datetime64tz_dtype(other): rvals = other._ndarray_values + else: + rvals = other._values if self.is_monotonic and other.is_monotonic: try: @@ -2310,24 +2312,6 @@ def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None return self.__class__(result, name=name) - def _ensure_join(self, values): - """Ensure that the 'values' are ready for our join indexer. - - The default join indexers are object, so this just returns 'values'. - This is called before calling those. - - - Parameters - ---------- - values : array-like - - Returns - ------- - values : ndarray - Expected to have the correct type for self.inner_indexer - """ - return values - def intersection(self, other): """ Form the intersection of two Index objects. 
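For reference, a hedged illustration (values borrowed from the `test_values_consistent` / `test_ndarray_values` parameters added earlier in this series) of why the tz-aware and period cases are routed through `_ndarray_values` here: their `_values` is not a plain ndarray, whereas `_ndarray_values` exposes a datetime64[ns] (UTC-based) ndarray that the low-level join helpers can consume.

    import pandas as pd

    idx = pd.DatetimeIndex(['2017-01-01T00:00:00'], tz='US/Eastern')

    idx._values          # the tz-aware DatetimeIndex itself
    idx._ndarray_values  # array(['2017-01-01T05:00:00'], dtype='datetime64[ns]')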
@@ -2363,24 +2347,30 @@ def intersection(self, other): other = other.astype('O') return this.intersection(other) + # TODO: setops-refactor, clean all this up + if is_period_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values + if self.is_monotonic and other.is_monotonic: try: - lvals = self._ensure_join(self._ndarray_values) - rvals = self._ensure_join(other._ndarray_values) result = self._inner_indexer(lvals, rvals)[0] return self._wrap_union_result(other, result) except TypeError: pass try: - indexer = Index(other._ndarray_values).get_indexer( - self._ndarray_values) + indexer = Index(rvals).get_indexer(lvals) indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: - # duplicates + # duplicateters indexer = algos.unique1d( - Index(other._ndarray_values).get_indexer_non_unique( - self._ndarray_values)[0]) + Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 4381b35f6cb86..93ed2507cb829 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -805,12 +805,6 @@ def _delegate_method(self, name, *args, **kwargs): return res return CategoricalIndex(res, name=self.name) - def _ensure_join(self, values): - if self.codes.dtype.itemsize <= 4: - return _ensure_int32(values) - else: - return _ensure_int64(values) - @classmethod def _add_accessors(cls): """ add in Categorical accessor methods """ From 5612cda29f77b5865df92bb97c6e7a2abde6bcb6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 16:46:02 -0600 Subject: [PATCH 041/119] better docstrings --- pandas/core/base.py | 9 ++++++++- pandas/core/indexes/base.py | 27 ++++++++++++++++++--------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index dd950a7b8ff00..744d448b16682 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -770,7 +770,14 @@ def base(self): @property def _ndarray_values(self): - """The data as an ndarray. See '_values' for more.""" + """The data as an ndarray, possibly losing information. + + The expectation is that this is cheap to compute. + + - categorical -> codes + + See '_values' for more. + """ # type: () -> np.ndarray from pandas.core.dtypes.common import is_categorical_dtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3ce3ecce1c140..afefa5de2477e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -603,15 +603,24 @@ def _values(self): """The best array representation. This is an ndarray, ExtensionArray, or Index subclass. This differs - from '._ndarray_values', which always returns an ndarray. It may differ - from the public '.values' - - index | values | _values - ----------------- | -------------- -| ---------- - CategoricalIndex | Categorical | Categorical - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] - PeriodIndex | ndarray[Period] | ndarray[Pd] (soon PeriodArray) - IntervalIndex | ndarray[IV] | ndarray[IV] (soon IntervalArray) + from ``_ndarray_values``, which always returns an ndarray. + + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. + + It may differ from the public '.values' method. 
+ + index | values | _values | _ndarray_values | + ----------------- | -------------- -| ----------- | --------------- | + CategoricalIndex | Categorical | Categorical | codes | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | datetime@UTC | + + In the near-future, we'll implement two more. + + index | values | _values | _ndarray_values | + ----------------- | --------------- | ----------- | --------------- | + PeriodIndex | ndarray[object] | PeriodArray | ordinals | + IntervalIndex | ndarray[object] | IVArray | ndarray[object] | See Also -------- From b012c1967b6de548b999514fe4b560ba9b7ee635 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 17:03:15 -0600 Subject: [PATCH 042/119] tolist --- pandas/core/base.py | 3 +++ pandas/core/indexes/base.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 744d448b16682..f3b0fb9dbe142 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -14,6 +14,7 @@ is_list_like, is_scalar, is_datetimelike, + is_categorical_dtype, is_extension_type) from pandas.util._validators import validate_bool_kwarg @@ -833,6 +834,8 @@ def tolist(self): if is_datetimelike(self): return [com._maybe_box_datetimelike(x) for x in self._values] + elif is_categorical_dtype(self): + return self.values.tolist() else: return self._ndarray_values.tolist() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index afefa5de2477e..9eb0ac1276280 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -617,10 +617,10 @@ def _values(self): In the near-future, we'll implement two more. - index | values | _values | _ndarray_values | - ----------------- | --------------- | ----------- | --------------- | - PeriodIndex | ndarray[object] | PeriodArray | ordinals | - IntervalIndex | ndarray[object] | IVArray | ndarray[object] | + index | values | _values | ndarray_values | + ----------------- | --------------- | ----------- | -------------- | + PeriodIndex | ndarray[object] | PeriodArray + IntervalIndex | IntervalArray | ndarray[Interval] See Also -------- From d49e6aa649a0b02ce612b9d18b663668ade6485a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 17:05:46 -0600 Subject: [PATCH 043/119] linting --- pandas/core/indexes/datetimes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 788005531efe1..22ce690b3d420 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1105,7 +1105,8 @@ def unique(self, level=None): else: naive = self result = super(DatetimeIndex, naive).unique(level=level) - return self._simple_new(result, name=self.name, tz=self.tz, freq=self.freq) + return self._simple_new(result, name=self.name, tz=self.tz, + freq=self.freq) def union(self, other): """ From d7d31eecc1411f9d68755bd86f80b2a97a34776e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 08:21:51 -0600 Subject: [PATCH 044/119] Moved dtypes (cherry picked from commit d1362271bca8a7b183f3241e5c2f040c422118b8) --- pandas/tests/dtypes/test_dtypes.py | 32 +------------------- pandas/tests/extension_arrays/test_common.py | 29 ++++++++++++++++++ 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index eca4dd4cf2106..d800a7b92b559 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,14 +10,12 @@ Series, 
Categorical, CategoricalIndex, IntervalIndex, date_range) from pandas.compat import string_types -from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, - IntervalDtype, CategoricalDtype, ExtensionDtype) + IntervalDtype, CategoricalDtype) from pandas.core.dtypes.common import ( is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, - is_extension_array_dtype, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype, is_interval_dtype, @@ -744,31 +742,3 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) - - -class DummyArray(ExtensionArray): - pass - - -class DummyDtype(ExtensionDtype): - pass - - -class TestExtensionArrayDtype(object): - - @pytest.mark.parametrize('values', [ - pd.Categorical([]), - pd.Categorical([]).dtype, - pd.Series(pd.Categorical([])), - DummyDtype(), - DummyArray(), - ]) - def test_is_extension_array_dtype(self, values): - assert is_extension_array_dtype(values) - - @pytest.mark.parametrize('values', [ - np.array([]), - pd.Series(np.array([])), - ]) - def test_is_not_extension_array_dtype(self, values): - assert not is_extension_array_dtype(values) diff --git a/pandas/tests/extension_arrays/test_common.py b/pandas/tests/extension_arrays/test_common.py index f19754482b04f..1fc4526aff951 100644 --- a/pandas/tests/extension_arrays/test_common.py +++ b/pandas/tests/extension_arrays/test_common.py @@ -1,7 +1,15 @@ import numpy as np +import pytest +import pandas as pd import pandas.util.testing as tm from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + + +class DummyDtype(ExtensionDtype): + pass class DummyArray(ExtensionArray): @@ -13,7 +21,28 @@ def __array__(self, dtype): return self.data +class TestExtensionArrayDtype(object): + + @pytest.mark.parametrize('values', [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(np.array([1, 2])), + ]) + def test_is_extension_array_dtype(self, values): + assert is_extension_array_dtype(values) + + @pytest.mark.parametrize('values', [ + np.array([]), + pd.Series(np.array([])), + ]) + def test_is_not_extension_array_dtype(self, values): + assert not is_extension_array_dtype(values) + + def test_astype(): + arr = DummyArray(np.array([1, 2, 3])) expected = np.array([1, 2, 3], dtype=object) From 7b89f1b3dc80c23d02c8b57c9c5d94cd491082c8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 08:36:44 -0600 Subject: [PATCH 045/119] clean --- pandas/tests/extension_arrays/test_common.py | 65 -------------------- 1 file changed, 65 deletions(-) delete mode 100644 pandas/tests/extension_arrays/test_common.py diff --git a/pandas/tests/extension_arrays/test_common.py b/pandas/tests/extension_arrays/test_common.py deleted file mode 100644 index 1fc4526aff951..0000000000000 --- a/pandas/tests/extension_arrays/test_common.py +++ /dev/null @@ -1,65 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -import pandas.util.testing as tm -from pandas.core.arrays import ExtensionArray -from pandas.core.dtypes.common import is_extension_array_dtype -from pandas.core.dtypes.dtypes import ExtensionDtype - - -class DummyDtype(ExtensionDtype): - pass - - -class DummyArray(ExtensionArray): - 
- def __init__(self, data): - self.data = data - - def __array__(self, dtype): - return self.data - - -class TestExtensionArrayDtype(object): - - @pytest.mark.parametrize('values', [ - pd.Categorical([]), - pd.Categorical([]).dtype, - pd.Series(pd.Categorical([])), - DummyDtype(), - DummyArray(np.array([1, 2])), - ]) - def test_is_extension_array_dtype(self, values): - assert is_extension_array_dtype(values) - - @pytest.mark.parametrize('values', [ - np.array([]), - pd.Series(np.array([])), - ]) - def test_is_not_extension_array_dtype(self, values): - assert not is_extension_array_dtype(values) - - -def test_astype(): - - arr = DummyArray(np.array([1, 2, 3])) - expected = np.array([1, 2, 3], dtype=object) - - result = arr.astype(object) - tm.assert_numpy_array_equal(result, expected) - - result = arr.astype('object') - tm.assert_numpy_array_equal(result, expected) - - -def test_astype_raises(): - arr = DummyArray(np.array([1, 2, 3])) - - # type int for py2 - # class int for py3 - xpr = ("DummyArray can only be coerced to 'object' dtype, not " - "'<.* 'int'>'") - - with tm.assert_raises_regex(ValueError, xpr): - arr.astype(int) From b0dbffd72376d88bfc1dd8d4d89c890978686d4e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 09:34:39 -0600 Subject: [PATCH 046/119] cleanup --- pandas/core/indexes/base.py | 10 +++--- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/multi.py | 2 +- pandas/tests/test_base.py | 56 ++++++++++++++++++++++++++++----- 4 files changed, 56 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9eb0ac1276280..d8b4a65a91ecc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -613,14 +613,14 @@ def _values(self): index | values | _values | _ndarray_values | ----------------- | -------------- -| ----------- | --------------- | CategoricalIndex | Categorical | Categorical | codes | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | datetime@UTC | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | In the near-future, we'll implement two more. - index | values | _values | ndarray_values | - ----------------- | --------------- | ----------- | -------------- | - PeriodIndex | ndarray[object] | PeriodArray - IntervalIndex | IntervalArray | ndarray[Interval] + index | values | _values | _ndarray_values | + ----------------- | --------------- | ----------- | --------------- | + PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | + IntervalIndex | ndarray[object] | PeriodArray | ndarray[object] | See Also -------- diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 93ed2507cb829..166832cbe6bb1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -797,7 +797,7 @@ def _evaluate_compare(self, other): def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ - method = getattr(self.values, name) + method = getattr(self._values, name) if 'inplace' in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1478012aa9dbe..a257a1ba26128 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -799,7 +799,7 @@ def values(self): box = hasattr(lev, '_box_values') # Try to minimize boxing. 
if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._values, + taken = lev._box_values(algos.take_1d(lev._ndarray_values, lab)) elif box: taken = algos.take_1d(lev._box_values(lev._ndarray_values), diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 94449663b580b..66ec2d37c680e 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1241,13 +1241,6 @@ def test_values_consistent(array, expected_type, dtype): assert r_values.dtype == dtype -def test_values_periodindex(): - arr = pd.period_range("2017", periods=4, freq='D') - result = arr._values - expected = np.array(arr.astype(object)) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize('array, expected', [ (np.array([0, 1]), np.array([0, 1])), (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), @@ -1267,3 +1260,52 @@ def test_ndarray_values(array, expected): r_values = pd.Index(array)._ndarray_values tm.assert_numpy_array_equal(l_values, r_values) tm.assert_numpy_array_equal(l_values, expected) + + +def test_values_multiindex_datetimesindex(): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(10**18, 10**18 + 5) + naive = pd.DatetimeIndex(ints) + aware = pd.DatetimeIndex(ints, tz='US/Central') + + idx = pd.MultiIndex.from_arrays([naive, aware]) + result = idx.values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive[:2]) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware[:2]) + + +def test_values_multiindex_datetimesindex(): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(2007, 2012) + pidx = pd.PeriodIndex(ints, freq='D') + + idx = pd.MultiIndex.from_arrays([ints, pidx]) + result = idx.values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints)) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints[:2])) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx[:2]) From 66b936f00b72e3152df807e6e5913f1111084cef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 09:42:37 -0600 Subject: [PATCH 047/119] NumPy compat --- pandas/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 66ec2d37c680e..e649667e3dda1 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1196,7 +1196,7 @@ def test_unique_datetime_index(arr, expected): @pytest.mark.parametrize('arr, expected', [ (pd.Series(pd.DatetimeIndex(['2017', '2017'])), - np.array(['2017'], dtype='M8[ns]')), + np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), (pd.Series(pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern')), np.array([pd.Timestamp('2017', tz="US/Eastern")], dtype=object)), ]) From 32ee0eff6893bd02ed1469330054b0c37914306e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 10:10:15 -0600 Subject: [PATCH 048/119] Use base _values for CategoricalIndex --- pandas/core/indexes/category.py | 4 ---- 1 file changed, 4 deletions(-) 
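With this change, ``CategoricalIndex`` picks up ``_values`` from the base ``Index``
implementation, so ``_values`` stays the ``Categorical`` itself while
``_ndarray_values`` drops down to the integer codes. A rough, illustrative sketch of
that convention (not part of the patch itself; behavior as described by this series,
and the dev-doc update that follows):

.. code-block:: python

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'b', 'a'])

    ci.values           # the Categorical: ['a', 'b', 'a']
    ci._values          # the same Categorical, now via the base Index property
    ci._ndarray_values  # the codes, e.g. array([0, 1, 0], dtype=int8)
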
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 166832cbe6bb1..f03f8571121f0 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -304,10 +304,6 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def _values(self): - return self._data - @property def _ndarray_values(self): return self._data.codes From a9882e23defc47272f941932c4ce53af9b5ba0e6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 10:10:34 -0600 Subject: [PATCH 049/119] Update dev docs --- doc/source/internals.rst | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 29aaed318b802..957f82fd9eba7 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -92,16 +92,20 @@ if you compute the levels and labels yourself, please be careful. Values ~~~~~~ -Pandas extends NumPy's type system in a few places, so we have multiple notions of "values" floating around. -For 1-D containers (``Index`` classes and ``Series``) we have the following convention: - -* ``cls._ndarray_values`` is *always* and ``ndarray`` -* ``cls._values`` refers is the "best possible" array. This could be an ``ndarray``, ``ExtensionArray``, or - in ``Index`` subclass (note: we're in the process of removing the index subclasses here so that it's - always an ``ndarray`` or ``ExtensionArray``). - -So, for example, ``Series[category]._values`` is a ``Categorical``, while ``Series[category]._ndarray_values`` is -the underlying ndarray. +Pandas extends NumPy's type system with custom types, like ``Categorical`` or +datetimes with a timezone, so we have multiple notions of "values". For 1-D +containers (``Index`` classes and ``Series``) we have the following convention: + +* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally, + ``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``, + this returns the codes, not the array of objects. +* ``cls._values`` refers is the "best possible" array. This could be an + ``ndarray``, ``ExtensionArray``, or in ``Index`` subclass (note: we're in the + process of removing the index subclasses here so that it's always an + ``ndarray`` or ``ExtensionArray``). + +So, for example, ``Series[category]._values`` is a ``Categorical``, while +``Series[category]._ndarray_values`` is the underlying codes. .. 
_ref-subclassing-pandas: From 242562108b099b4e7a205541ee15b9272dcb5265 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 10:59:22 -0600 Subject: [PATCH 050/119] cleanup --- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexes/category.py | 13 +------------ pandas/core/indexes/multi.py | 8 +++----- pandas/core/indexes/period.py | 5 ----- 4 files changed, 5 insertions(+), 23 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b2816343fc8eb..55919fb2bea0d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -927,7 +927,7 @@ def try_timedelta(v): # will try first with a string & object conversion from pandas import to_timedelta try: - return to_timedelta(v)._values.reshape(shape) + return to_timedelta(v)._ndarray_values.reshape(shape) except Exception: return v.reshape(shape) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f03f8571121f0..5aa940499a368 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,6 +1,5 @@ import numpy as np from pandas._libs import index as libindex -from pandas._libs import join as libjoin from pandas import compat from pandas.compat.numpy import function as nv @@ -9,8 +8,6 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, _ensure_platform_int, - _ensure_int32, - _ensure_int64, is_list_like, is_interval_dtype, is_scalar) @@ -217,14 +214,6 @@ def _shallow_copy(self, values=None, categories=None, ordered=None, values=values, categories=categories, ordered=ordered, **kwargs) - @cache_readonly - def _inner_indexer(self): - if self.codes.dtype.itemsize <= 4: - # int8, int16, int32 - return libjoin.inner_join_indexer_int32 - else: - return libjoin.inner_join_indexer_int64 - def _is_dtype_compat(self, other): """ *this is an internal non-public method* @@ -238,7 +227,7 @@ def _is_dtype_compat(self, other): """ if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): - other = other.values + other = other._values if not other.is_dtype_equal(self): raise TypeError("categories must match existing categories " "when appending") diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a257a1ba26128..907bbb2e8762e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2507,6 +2507,7 @@ def get_locs(self, seq): MultiIndex.slice_locs : Get slice location given start label(s) and end label(s). 
""" + from .numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -2532,7 +2533,6 @@ def _convert_to_indexer(r): "that is not the same length as the " "index") r = r.nonzero()[0] - from .numeric import Int64Index return Int64Index(r) def _update_indexer(idxr, indexer=indexer): @@ -2569,7 +2569,6 @@ def _update_indexer(idxr, indexer=indexer): if indexers is not None: indexer = _update_indexer(indexers, indexer=indexer) else: - from .numeric import Int64Index # no matches we are done return Int64Index([])._ndarray_values @@ -2652,9 +2651,8 @@ def equals(self, other): for i in range(self.nlevels): slabels = self.labels[i] slabels = slabels[slabels != -1] - svalues = algos.take_nd( - np.asarray(self.levels[i]._values), - slabels, allow_fill=False) + svalues = algos.take_nd(np.asarray(self.levels[i]._values), + slabels, allow_fill=False) olabels = other.labels[i] olabels = olabels[olabels != -1] diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index c8b7d6063e378..e90d3827fe84e 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -418,11 +418,6 @@ def _int64index(self): def values(self): return self.astype(object).values - @property - def _values(self): - # TODO: return PeriodArray - return self.values - @property def _ndarray_values(self): # Ordinals From 6abe9da01ee0be4bb2d87f649b2c6066d4ea3835 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 10:59:22 -0600 Subject: [PATCH 051/119] cleanup (cherry picked from commit 242562108b099b4e7a205541ee15b9272dcb5265) --- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexes/multi.py | 3 +-- pandas/core/indexes/period.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b2816343fc8eb..55919fb2bea0d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -927,7 +927,7 @@ def try_timedelta(v): # will try first with a string & object conversion from pandas import to_timedelta try: - return to_timedelta(v)._values.reshape(shape) + return to_timedelta(v)._ndarray_values.reshape(shape) except Exception: return v.reshape(shape) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 510f7245cebd8..aca81aed29c62 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2505,6 +2505,7 @@ def get_locs(self, seq): MultiIndex.slice_locs : Get slice location given start label(s) and end label(s). 
""" + from .numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -2530,7 +2531,6 @@ def _convert_to_indexer(r): "that is not the same length as the " "index") r = r.nonzero()[0] - from .numeric import Int64Index return Int64Index(r) def _update_indexer(idxr, indexer=indexer): @@ -2567,7 +2567,6 @@ def _update_indexer(idxr, indexer=indexer): if indexers is not None: indexer = _update_indexer(indexers, indexer=indexer) else: - from .numeric import Int64Index # no matches we are done return Int64Index([])._values diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 1f8542ed5ee60..b797d3734380c 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -418,7 +418,8 @@ def values(self): return self.astype(object).values @property - def _values(self): + def _ndarray_values(self): + # Ordinals return self._data def __array__(self, dtype=None): From 0b112f21a80818d3ad9e7bb6f00c351edd9d1713 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 13:26:52 -0600 Subject: [PATCH 052/119] cleanup --- pandas/core/arrays/categorical.py | 1 - pandas/core/dtypes/common.py | 5 ++--- pandas/core/dtypes/missing.py | 4 +--- pandas/core/frame.py | 3 +-- pandas/core/indexes/base.py | 25 ------------------------- pandas/core/indexes/category.py | 3 --- pandas/core/indexes/datetimes.py | 7 ------- pandas/core/internals.py | 7 ++----- pandas/core/series.py | 9 +++++---- pandas/tests/indexes/datetimelike.py | 13 ------------- pandas/tests/indexes/test_base.py | 5 ----- pandas/tests/indexes/test_category.py | 5 ----- 12 files changed, 11 insertions(+), 76 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6c5b0c9d2be98..d5e8fc5e0b190 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2150,7 +2150,6 @@ def _concat_same_type(self, to_concat): def _formatting_values(self): return self - # The Series.cat accessor diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2344091f85a88..197b35de88896 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1708,10 +1708,9 @@ def is_extension_array_dtype(arr_or_dtype): """ from pandas.core.arrays import ExtensionArray - if isinstance(arr_or_dtype, ABCSeries): + if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)): arr_or_dtype = arr_or_dtype._values - elif isinstance(arr_or_dtype, ABCIndexClass): - arr_or_dtype = arr_or_dtype._as_best_array() + return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index c7cd97d5ceb87..002839af6daf2 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -132,9 +132,7 @@ def _isna_ndarraylike(obj): dtype = values.dtype if is_extension_array_dtype(obj): - if isinstance(obj, ABCIndexClass): - values = obj._as_best_array() - elif isinstance(obj, ABCSeries): + if isinstance(obj, (ABCIndexClass, ABCSeries)): values = obj._values else: values = obj diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7059495ed6467..e91a93827fc84 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3370,8 +3370,7 @@ class max type new_obj = self.copy() def _maybe_casted_values(index, labels=None): - values = index._as_best_array() - # TODO: Check if nescessary... 
+ values = index._values if not isinstance(index, (PeriodIndex, DatetimeIndex)): if values.dtype == np.object_: values = lib.maybe_convert_objects(values) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6c2b2dc1eb67a..fdb20995805fd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1095,31 +1095,6 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.values.copy() - def _as_best_array(self): - # type: () -> Union[ExtensionArray, ndarary] - """Return the underlying values as the best array type. - - Indexes backed by ExtensionArrays will return the ExtensionArray. - Otherwise, an ndarray is returned. - - Examples - -------- - >>> pd.Index([0, 1, 2])._as_best_array() - array([0, 1, 2]) - - >>> pd.CategoricalIndex(['a', 'a', 'b'])._as_best_array() - [a, a, b] - Categories (2, object): [a, b] - - >>> pd.IntervalIndex.from_breaks([0, 1, 2])._as_best_array() - IntervalArray([(0, 1], (1, 2]]) - """ - # We need this since CategoricalIndex.values -> Categorical - # but IntervalIndex.values -> ndarray[object] - # TODO: IntervalIndex defines _array_values. Would be nice to - # have an unambiguous way of getting an ndarray (or just use asarray?) - return self.values - _index_shared_docs['astype'] = """ Create an Index with values cast to dtypes. The class of a new Index is determined by dtype. When conversion is impossible, a ValueError diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f211d41ac2f4c..5aa940499a368 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -310,9 +310,6 @@ def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() - def _as_best_array(self): - return self._data - def tolist(self): return self._data.tolist() diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index db66dac67bbea..22ce690b3d420 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1043,13 +1043,6 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.values.copy() - def _as_best_array(self): - # no-tz -> ndarray - # tz -> DatetimeIndex (for now) - if self.tz is not None: - return self - return self.values - def to_pydatetime(self): """ Return DatetimeIndex as object ndarray of datetime.datetime objects diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b778900157743..c69ce53fbf53f 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1868,10 +1868,8 @@ def __init__(self, values, placement, ndim=None): def _maybe_coerce_values(self, values): # Unboxes Series / Index # Doesn't change any underlying dtypes. 
- if isinstance(values, ABCSeries): + if isinstance(values, (ABCIndexClass, ABCSeries)): values = values.values - elif isinstance(values, ABCIndexClass): - values = values._as_best_array() return values @property @@ -4133,8 +4131,7 @@ def set(self, item, value, check=False): # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical - value_is_extension_type = (is_extension_type(value) or - is_extension_array_dtype(value)) + value_is_extension_type = is_extension_type(value) # categorical/spares/datetimetz if value_is_extension_type: diff --git a/pandas/core/series.py b/pandas/core/series.py index 41240dcbda0d1..7884794c6b5f4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -181,7 +181,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, data = data.astype(dtype) # need to copy to avoid aliasing issues - data = data._as_best_array().copy() + data = data._values.copy() + copy = False elif isinstance(data, np.ndarray): pass @@ -3137,7 +3138,9 @@ def _sanitize_index(data, index, copy=False): raise ValueError('Length of values does not match length of ' 'index') if isinstance(data, ABCIndexClass) and not copy: - data = data._as_best_array() + pass + elif isinstance(data, (PeriodIndex, DatetimeIndex)): + data = data._values elif isinstance(data, np.ndarray): @@ -3216,7 +3219,6 @@ def _try_cast(arr, take_fast_path): if copy: subarr = data.copy() - # XXX: This is the only early return. See if it can be avoided. return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: @@ -3239,7 +3241,6 @@ def _try_cast(arr, take_fast_path): start, stop, step = get_range_parameters(data) arr = np.arange(start, stop, step, dtype='int64') subarr = _try_cast(arr, False) - else: subarr = _try_cast(data, False) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 64fc1ee8c9680..7d01a2a70145d 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -83,16 +83,3 @@ def test_asobject_deprecated(self): with tm.assert_produces_warning(FutureWarning): i = d.asobject assert isinstance(i, pd.Index) - - def test_as_best_array(self): - result = pd.DatetimeIndex(['2017-01-01T00:00:00', - '2017-01-02T00:00:00'])._as_best_array() - expected = np.array(['2017-01-01T00:00:00', - '2017-01-02T00:00:00'], dtype='M8[ns]') - tm.assert_numpy_array_equal(result, expected) - - def test_as_best_array_tz(self): - arr = pd.DatetimeIndex(['2017-01-01T00:00:00', - '2017-01-02T00:00:00'], tz='US/Central') - result = arr._as_best_array() - tm.assert_index_equal(arr, result) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 900e413b2c2db..90edcb526bb2e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2284,11 +2284,6 @@ def test_comparison_tzawareness_compat(self, op): # TODO: implement _assert_tzawareness_compat for the reverse # comparison with the Series on the left-hand side - def test_as_best_array(self): - result = pd.Index([0, 1, 2])._as_best_array() - expected = np.array([0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(result, expected) - class TestIndexUtils(object): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 0fda05252c74e..e9fddfde90348 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -1088,8 +1088,3 @@ def test_take_invalid_kwargs(self): msg = "the 
'mode' parameter is not supported" tm.assert_raises_regex(ValueError, msg, idx.take, indices, mode='clip') - - def test_as_best_array(self): - result = pd.CategoricalIndex([0, 1, 2])._as_best_array() - expected = pd.Categorical([0, 1, 2]) - tm.assert_categorical_equal(result, expected) From 170d0c7959a54276fff730b002195f46ec64de63 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 13:33:49 -0600 Subject: [PATCH 053/119] Linting --- pandas/core/base.py | 3 +-- pandas/tests/test_base.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index f3b0fb9dbe142..01dba132e00c5 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,8 +7,7 @@ import numpy as np from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCSeries, ABCIndexClass, ABCDatetimeIndex) +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass from pandas.core.dtypes.common import ( is_object_dtype, is_list_like, diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index e649667e3dda1..31fa278f906f5 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1262,7 +1262,7 @@ def test_ndarray_values(array, expected): tm.assert_numpy_array_equal(l_values, expected) -def test_values_multiindex_datetimesindex(): +def test_values_multiindex_datetimeindex(): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(10**18, 10**18 + 5) naive = pd.DatetimeIndex(ints) @@ -1287,7 +1287,7 @@ def test_values_multiindex_datetimesindex(): tm.assert_index_equal(inner, aware[:2]) -def test_values_multiindex_datetimesindex(): +def test_values_multiindex_periodindex(): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(2007, 2012) pidx = pd.PeriodIndex(ints, freq='D') From 402620f3ca75d14dd203f809226ec528113ae54c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 13:35:24 -0600 Subject: [PATCH 054/119] Precision in tests --- pandas/tests/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 31fa278f906f5..ce1e3d492741d 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1212,7 +1212,7 @@ def test_unique_datetime_series(arr, expected): @pytest.mark.parametrize('array, expected_type, dtype', [ - (np.array([0, 1]), np.ndarray, 'int64'), + (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), (np.array(['a', 'b']), np.ndarray, 'object'), (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), @@ -1242,7 +1242,7 @@ def test_values_consistent(array, expected_type, dtype): @pytest.mark.parametrize('array, expected', [ - (np.array([0, 1]), np.array([0, 1])), + (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')), (pd.DatetimeIndex(['2017-01-01T00:00:00']), From 268aabcb88f8fcb803693bd5796b0cfcf244fab2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 13:42:26 -0600 Subject: [PATCH 055/119] Linting --- pandas/core/indexing.py | 6 +++--- pandas/core/internals.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 97b7d1064e8bc..1d07900a4d5df 100755 --- a/pandas/core/indexing.py +++ 
b/pandas/core/indexing.py @@ -618,9 +618,9 @@ def can_do_equal_len(): return if isinstance(value, (ABCSeries, dict)): - # TODO: ExtensionBlock.setitem this causes issues with setting for - # extensionarrays that store dicts. Need to decide if it's worth - # supporting that case. + # TODO: ExtensionBlock.setitem this causes issues with setting + # for extensionarrays that store dicts. Need to decide if it's + # worth supporting that or now value = self._align_series(indexer, Series(value)) elif isinstance(value, ABCDataFrame): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c69ce53fbf53f..fffbe18d3008c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -15,7 +15,6 @@ from pandas.core.base import PandasObject -from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( ExtensionDtype, DatetimeTZDtype, CategoricalDtype) From d671259c25413d849ae015e13d9db195aa467876 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 13:47:22 -0600 Subject: [PATCH 056/119] Move to extension --- pandas/tests/{extension_arrays => extension}/__init__.py | 0 pandas/tests/{extension_arrays => extension}/base.py | 0 pandas/tests/{extension_arrays => extension}/test_categorical.py | 0 pandas/tests/{extension_arrays => extension}/test_decimal.py | 0 pandas/tests/{extension_arrays => extension}/test_json.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/{extension_arrays => extension}/__init__.py (100%) rename pandas/tests/{extension_arrays => extension}/base.py (100%) rename pandas/tests/{extension_arrays => extension}/test_categorical.py (100%) rename pandas/tests/{extension_arrays => extension}/test_decimal.py (100%) rename pandas/tests/{extension_arrays => extension}/test_json.py (100%) diff --git a/pandas/tests/extension_arrays/__init__.py b/pandas/tests/extension/__init__.py similarity index 100% rename from pandas/tests/extension_arrays/__init__.py rename to pandas/tests/extension/__init__.py diff --git a/pandas/tests/extension_arrays/base.py b/pandas/tests/extension/base.py similarity index 100% rename from pandas/tests/extension_arrays/base.py rename to pandas/tests/extension/base.py diff --git a/pandas/tests/extension_arrays/test_categorical.py b/pandas/tests/extension/test_categorical.py similarity index 100% rename from pandas/tests/extension_arrays/test_categorical.py rename to pandas/tests/extension/test_categorical.py diff --git a/pandas/tests/extension_arrays/test_decimal.py b/pandas/tests/extension/test_decimal.py similarity index 100% rename from pandas/tests/extension_arrays/test_decimal.py rename to pandas/tests/extension/test_decimal.py diff --git a/pandas/tests/extension_arrays/test_json.py b/pandas/tests/extension/test_json.py similarity index 100% rename from pandas/tests/extension_arrays/test_json.py rename to pandas/tests/extension/test_json.py From 815d202e96e910a64a292f6815737447ffdc1847 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Feb 2018 14:13:50 -0600 Subject: [PATCH 057/119] Push _ndarray_values to ExtensionArray Now IndexOpsMixin._ndarray_values will dispatch all the way down to the EA. Subclasses like Categorical can override it as they see fit. 
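As a rough sketch (illustrative only, not code from this patch): with the hook added
below on the base class, a third-party extension array could expose a cheap ndarray
view by overriding ``_ndarray_values``, the same way ``Categorical`` returns its
codes; otherwise the default of ``np.array(self)`` applies. The class and attribute
names here are hypothetical.

.. code-block:: python

    import numpy as np

    from pandas.core.arrays import ExtensionArray


    class MyDictEncodedArray(ExtensionArray):
        # Toy example: only the piece relevant to this commit is shown;
        # the rest of the ExtensionArray interface is omitted.
        def __init__(self, codes, categories):
            self.codes = np.asarray(codes, dtype='int8')
            self.categories = list(categories)

        @property
        def _ndarray_values(self):
            # Cheap, lossy ndarray view -- mirrors Categorical returning
            # its codes rather than materializing the objects.
            return self.codes
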
--- pandas/core/arrays/base.py | 12 ++++++++++++ pandas/core/arrays/categorical.py | 4 ++++ pandas/core/base.py | 15 ++++++--------- pandas/core/dtypes/common.py | 2 +- pandas/core/indexes/category.py | 4 ---- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 553e1e0ac2066..e618dc6b69b2d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -266,3 +266,15 @@ def _can_hold_na(self): Setting this to false will optimize some operations like fillna. """ return True + + @property + def _ndarray_values(self): + # type: () -> np.ndarray + """Internal pandas method for lossy conversion to a NumPy ndarray. + + This method is not part of the pandas interface. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + """ + return np.array(self) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 62c6a6b16cbe9..8d2cf9d2b2f92 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -410,6 +410,10 @@ def dtype(self): """The :class:`~pandas.api.types.CategoricalDtype` for this instance""" return self._dtype + @property + def _ndarray_values(self): + return self.codes + @property def _constructor(self): return Categorical diff --git a/pandas/core/base.py b/pandas/core/base.py index 01dba132e00c5..0e70e3eb64fcb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -14,7 +14,8 @@ is_scalar, is_datetimelike, is_categorical_dtype, - is_extension_type) + is_extension_type, + is_extension_array_dtype) from pandas.util._validators import validate_bool_kwarg @@ -772,18 +773,14 @@ def base(self): def _ndarray_values(self): """The data as an ndarray, possibly losing information. - The expectation is that this is cheap to compute. + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. - categorical -> codes - - See '_values' for more. """ # type: () -> np.ndarray - from pandas.core.dtypes.common import is_categorical_dtype - - if is_categorical_dtype(self): - return self._values.codes - + if is_extension_array_dtype(self): + return self.values._ndarray_values return self.values @property diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c66e7fcfc6978..c2b71bc316fe8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1709,7 +1709,7 @@ def is_extension_array_dtype(arr_or_dtype): from pandas.core.arrays import ExtensionArray # we want to unpack series, anything else? 
- if isinstance(arr_or_dtype, ABCSeries): + if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)): arr_or_dtype = arr_or_dtype._values return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 5aa940499a368..d71b7ea774f52 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -293,10 +293,6 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def _ndarray_values(self): - return self._data.codes - @property def itemsize(self): return self.values.itemsize From a727b217f42e959f9ebb355e911f3ec641db0b49 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Feb 2018 14:27:46 -0600 Subject: [PATCH 058/119] Clean up tolist --- pandas/core/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 0e70e3eb64fcb..0b4c03d6b4b25 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -830,10 +830,8 @@ def tolist(self): if is_datetimelike(self): return [com._maybe_box_datetimelike(x) for x in self._values] - elif is_categorical_dtype(self): - return self.values.tolist() else: - return self._ndarray_values.tolist() + return self._values.tolist() def __iter__(self): """ From f368c29d6a45832f95181a8a6e8b7411d87763c7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Feb 2018 14:33:46 -0600 Subject: [PATCH 059/119] Move test locations --- .../tests/indexes/datetimes/test_datetime.py | 15 ++++ pandas/tests/indexes/test_multi.py | 48 +++++++++++ pandas/tests/test_base.py | 82 ------------------- 3 files changed, 63 insertions(+), 82 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index a75ace2933b71..e9176e749564e 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -469,3 +469,18 @@ def test_factorize_dst(self): arr, res = obj.factorize() tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) tm.assert_index_equal(res, idx) + + @pytest.mark.parametrize('arr, expected', [ + (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])), + (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'), + pd.DatetimeIndex(['2017'], tz='US/Eastern')), + ]) + def test_unique(self, arr, expected): + result = arr.unique() + + if isinstance(expected, np.ndarray): + tm.assert_numpy_array_equal(result, expected) + if isinstance(expected, pd.Series): + tm.assert_series_equal(result, expected) + if isinstance(expected, pd.DatetimeIndex): + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e59456b8a2d5e..97370b279245c 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -962,6 +962,54 @@ def test_values_boxed(self): # Check that code branches for boxed values produce identical results tm.assert_numpy_array_equal(result.values[:4], result[:4].values) + def test_values_multiindex_datetimeindex(self): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(10**18, 10**18 + 5) + naive = pd.DatetimeIndex(ints) + aware = pd.DatetimeIndex(ints, tz='US/Central') + + idx = pd.MultiIndex.from_arrays([naive, aware]) + result = idx.values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive) + + inner = pd.DatetimeIndex([x[1] for x in result]) + 
tm.assert_index_equal(inner, aware) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive[:2]) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware[:2]) + + + def test_values_multiindex_periodindex(): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(2007, 2012) + pidx = pd.PeriodIndex(ints, freq='D') + + idx = pd.MultiIndex.from_arrays([ints, pidx]) + result = idx.values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints)) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints[:2])) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx[:2]) + def test_append(self): result = self.index[:3].append(self.index[3:]) assert result.equals(self.index) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index ce1e3d492741d..4b5ad336139b0 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1178,39 +1178,6 @@ def test_iter_box(self): assert res == exp -@pytest.mark.parametrize('arr, expected', [ - (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])), - (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'), - pd.DatetimeIndex(['2017'], tz='US/Eastern')), -]) -def test_unique_datetime_index(arr, expected): - result = arr.unique() - - if isinstance(expected, np.ndarray): - tm.assert_numpy_array_equal(result, expected) - if isinstance(expected, pd.Series): - tm.assert_series_equal(result, expected) - if isinstance(expected, pd.DatetimeIndex): - tm.assert_index_equal(result, expected) - - -@pytest.mark.parametrize('arr, expected', [ - (pd.Series(pd.DatetimeIndex(['2017', '2017'])), - np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), - (pd.Series(pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern')), - np.array([pd.Timestamp('2017', tz="US/Eastern")], dtype=object)), -]) -def test_unique_datetime_series(arr, expected): - result = arr.unique() - - if isinstance(expected, np.ndarray): - tm.assert_numpy_array_equal(result, expected) - if isinstance(expected, pd.Series): - tm.assert_series_equal(result, expected) - if isinstance(expected, pd.DatetimeIndex): - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize('array, expected_type, dtype', [ (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), (np.array(['a', 'b']), np.ndarray, 'object'), @@ -1260,52 +1227,3 @@ def test_ndarray_values(array, expected): r_values = pd.Index(array)._ndarray_values tm.assert_numpy_array_equal(l_values, r_values) tm.assert_numpy_array_equal(l_values, expected) - - -def test_values_multiindex_datetimeindex(): - # Test to ensure we hit the boxing / nobox part of MI.values - ints = np.arange(10**18, 10**18 + 5) - naive = pd.DatetimeIndex(ints) - aware = pd.DatetimeIndex(ints, tz='US/Central') - - idx = pd.MultiIndex.from_arrays([naive, aware]) - result = idx.values - - outer = pd.DatetimeIndex([x[0] for x in result]) - tm.assert_index_equal(outer, naive) - - inner = pd.DatetimeIndex([x[1] for x in result]) - tm.assert_index_equal(inner, aware) - - # n_lev > n_lab - result = idx[:2].values - - outer = pd.DatetimeIndex([x[0] for x in result]) - tm.assert_index_equal(outer, naive[:2]) - - inner = pd.DatetimeIndex([x[1] for x in result]) - 
tm.assert_index_equal(inner, aware[:2]) - - -def test_values_multiindex_periodindex(): - # Test to ensure we hit the boxing / nobox part of MI.values - ints = np.arange(2007, 2012) - pidx = pd.PeriodIndex(ints, freq='D') - - idx = pd.MultiIndex.from_arrays([ints, pidx]) - result = idx.values - - outer = pd.Int64Index([x[0] for x in result]) - tm.assert_index_equal(outer, pd.Int64Index(ints)) - - inner = pd.PeriodIndex([x[1] for x in result]) - tm.assert_index_equal(inner, pidx) - - # n_lev > n_lab - result = idx[:2].values - - outer = pd.Int64Index([x[0] for x in result]) - tm.assert_index_equal(outer, pd.Int64Index(ints[:2])) - - inner = pd.PeriodIndex([x[1] for x in result]) - tm.assert_index_equal(inner, pidx[:2]) From d74c5c96040882378e3598e0df27e59aff57de51 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 06:33:05 -0600 Subject: [PATCH 060/119] Fixed test --- pandas/tests/indexes/test_multi.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 97370b279245c..cd6a5c761d0c2 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -986,8 +986,7 @@ def test_values_multiindex_datetimeindex(self): inner = pd.DatetimeIndex([x[1] for x in result]) tm.assert_index_equal(inner, aware[:2]) - - def test_values_multiindex_periodindex(): + def test_values_multiindex_periodindex(self): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(2007, 2012) pidx = pd.PeriodIndex(ints, freq='D') From 8104ee5d8a887454fec6869eb1f4e63fe74d72e6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 08:40:56 -0600 Subject: [PATCH 061/119] REF: Update per comments --- pandas/core/base.py | 2 +- pandas/core/dtypes/concat.py | 2 +- pandas/core/indexes/category.py | 6 +----- pandas/core/indexes/multi.py | 2 +- pandas/io/formats/format.py | 2 +- pandas/tests/indexes/datetimes/test_datetime.py | 8 +------- 6 files changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 0b4c03d6b4b25..8081e20faaeb3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -745,7 +745,7 @@ def itemsize(self): @property def nbytes(self): """ return the number of bytes in the underlying data """ - return self._ndarray_values.nbytes + return self.values.nbytes @property def strides(self): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b36dc03bbc82b..d306d0d78f1f4 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -480,7 +480,7 @@ def _concat_datetimetz(to_concat, name=None): def _concat_index_same_dtype(indexes, klass=None): klass = klass if klass is not None else indexes[0].__class__ - return klass(np.concatenate([x._ndarray_values for x in indexes])) + return klass(np.concatenate([x._values for x in indexes])) def _concat_index_asobject(to_concat, name=None): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d71b7ea774f52..7d4a864b465e8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -295,13 +295,9 @@ def values(self): @property def itemsize(self): + # Size of the items in categories, not codes. 
return self.values.itemsize - @property - def nbytes(self): - """ return the number of bytes in the underlying data """ - return self.values.nbytes - def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 907bbb2e8762e..94dbd8b884e47 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1319,7 +1319,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): - tuples = tuples._ndarray_values + tuples = tuples._values arrays = list(lib.tuples_to_object_array(tuples).T) elif isinstance(tuples, list): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d590499faa65e..621641747f376 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1897,7 +1897,7 @@ def _format(x): vals = self.values if isinstance(vals, Index): - vals = vals._ndarray_values + vals = vals._values elif isinstance(vals, ABCSparseArray): vals = vals.values diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index e9176e749564e..05678b0c8dd45 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -477,10 +477,4 @@ def test_factorize_dst(self): ]) def test_unique(self, arr, expected): result = arr.unique() - - if isinstance(expected, np.ndarray): - tm.assert_numpy_array_equal(result, expected) - if isinstance(expected, pd.Series): - tm.assert_series_equal(result, expected) - if isinstance(expected, pd.DatetimeIndex): - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) From f8e29b918f7b4cc306ff7b18efa549e17aedbbe9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 09:53:55 -0600 Subject: [PATCH 062/119] lint --- pandas/core/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 8081e20faaeb3..cf48b419b7df1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -13,7 +13,6 @@ is_list_like, is_scalar, is_datetimelike, - is_categorical_dtype, is_extension_type, is_extension_array_dtype) From 0cd9faa5b42df01c96a8dddb7f7a73cea32d0a91 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 13:04:50 -0600 Subject: [PATCH 063/119] REF: Use _values for size and shape --- pandas/core/base.py | 4 ++-- pandas/core/indexes/datetimes.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index cf48b419b7df1..f6f1ba982e1d9 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -711,7 +711,7 @@ def transpose(self, *args, **kwargs): @property def shape(self): """ return a tuple of the shape of the underlying data """ - return self._ndarray_values.shape + return self._values.shape @property def ndim(self): @@ -754,7 +754,7 @@ def strides(self): @property def size(self): """ return the number of elements in the underlying data """ - return self._ndarray_values.size + return self._values.size @property def flags(self): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 22ce690b3d420..689610af7603f 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -694,6 +694,20 @@ def tzinfo(self): """ return self.tz + @property + def size(self): + # TODO: Remove this when we 
have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.size + + @property + def shape(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.shape + @cache_readonly def _timezone(self): """ Comparable timezone both for pytz / dateutil""" From 8fcdb7040345e1d0017367695354d9c858c71e09 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 13:09:13 -0600 Subject: [PATCH 064/119] PERF: Implement size, shape for IntervalIndex --- pandas/core/indexes/interval.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3bf783b5a2faa..d431ea1e51e31 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -680,6 +680,16 @@ def length(self): 'e.g. Intervals with string endpoints') raise TypeError(msg) + @property + def size(self): + # Avoid materializing self.values + return self.left.size + + @property + def shape(self): + # Avoid materializing self.values + return self.left.shape + def __len__(self): return len(self.left) From 34a6a22e2255eb11e5c6b6c5478350fb84ce656e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 13:11:00 -0600 Subject: [PATCH 065/119] PERF: Avoid materializing values for PeriodIndex shape, size --- pandas/core/indexes/period.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index e90d3827fe84e..8f2d7d382a16e 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -477,6 +477,16 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.astype(object).values + @property + def size(self): + # Avoid materializing self._values + return self._ndarray_values.size + + @property + def shape(self): + # Avoid materializing self._values + return self._ndarray_values.shape + @property def _formatter_func(self): return lambda x: "'%s'" % x From d6e8051d1ebab7cf99bd7ac23eea348d0e3a0d4c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 20:55:19 -0600 Subject: [PATCH 066/119] Cleanup --- pandas/core/base.py | 3 +-- pandas/core/indexes/base.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index f6f1ba982e1d9..0ca029ffd4c25 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -744,7 +744,7 @@ def itemsize(self): @property def nbytes(self): """ return the number of bytes in the underlying data """ - return self.values.nbytes + return self._values.nbytes @property def strides(self): @@ -988,7 +988,6 @@ def value_counts(self, normalize=False, sort=True, ascending=False, def unique(self): values = self._values - # TODO: Make unique part of the ExtensionArray interface. if hasattr(values, 'unique'): result = values.unique() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a09a4c59a819a..be7c1624936bf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -599,7 +599,7 @@ def values(self): @property def _values(self): # type: () -> Union[ExtensionArray, Index] - # TODO: remove index types as they become is extension arrays + # TODO(EA): remove index types as they become extension arrays """The best array representation. This is an ndarray, ExtensionArray, or Index subclass. 
This differs @@ -2264,7 +2264,7 @@ def union(self, other): other = other.astype('O') return this.union(other) - # TODO: setops-refactor, clean all this up + # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self) or is_datetime64tz_dtype(self): lvals = self._ndarray_values else: @@ -2357,7 +2357,7 @@ def intersection(self, other): other = other.astype('O') return this.intersection(other) - # TODO: setops-refactor, clean all this up + # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self): lvals = self._ndarray_values else: From 3af8a21ea0e13ba5fc73db464f6e327552c71b0e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 05:54:27 -0600 Subject: [PATCH 067/119] Override nbytes --- pandas/core/indexes/datetimes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 689610af7603f..cc9ce1f3fd5eb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -708,6 +708,13 @@ def shape(self): # for TZ-aware return self._ndarray_values.shape + @property + def nbytes(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.nbytes + @cache_readonly def _timezone(self): """ Comparable timezone both for pytz / dateutil""" From 1e8e87e7ed20d07f422fd7b518b33f3c0fbc0512 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 06:42:01 -0600 Subject: [PATCH 068/119] Remove unused change --- pandas/core/indexes/base.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 47ded9c6f4cd2..281618ffefef4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2004,11 +2004,6 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): if is_categorical_dtype(values.dtype): values = np.array(values) - elif isinstance(values, ExtensionArray): - # This is still un-exercised within pandas, since all our - # extension dtypes have custom indexes. - values = values._formatting_values() - elif is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) From 0f5e4f06478a1ed5a956a33220a2114399551377 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 06:49:35 -0600 Subject: [PATCH 069/119] Docs --- pandas/core/arrays/base.py | 36 +++++++++++++++++++++++++----------- pandas/core/indexing.py | 6 +++--- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3646a045fa465..e9d56a0e95461 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -204,6 +204,25 @@ def isna(self): """ raise AbstractMethodError(self) + def value_counts(self, dropna=True): + """Compute a histogram of the counts of non-null values. 
+ + Parameters + ---------- + dropna : bool, default True + Don't include counts of NaN + + Returns + ------- + value_counts : Series + """ + from pandas import value_counts + + if dropna: + self = self[~self.isna()] + + return value_counts(np.array(self)) + # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ @@ -235,9 +254,8 @@ def take(self, indexer, allow_fill=True, fill_value=None): Examples -------- - Suppose the extension array somehow backed by a NumPy array and that - the underlying structured array is stored as ``self.data``. Then - ``take`` may be written as + Suppose the extension array is backed by a NumPy array stored as + ``self.data``. Then ``take`` may be written as .. code-block:: python @@ -246,6 +264,10 @@ def take(self, indexer, allow_fill=True, fill_value=None): result = self.data.take(indexer) result[mask] = self._fill_value return type(self)(result) + + See Also + -------- + numpy.take """ raise AbstractMethodError(self) @@ -305,14 +327,6 @@ def _can_hold_na(self): """ return True - def value_counts(self, dropna=True): - from pandas import value_counts - - if dropna: - self = self[~self.isna()] - - return value_counts(np.array(self)) - @property def _ndarray_values(self): # type: () -> np.ndarray diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1d07900a4d5df..50f3c7a6b3d3d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -618,9 +618,9 @@ def can_do_equal_len(): return if isinstance(value, (ABCSeries, dict)): - # TODO: ExtensionBlock.setitem this causes issues with setting - # for extensionarrays that store dicts. Need to decide if it's - # worth supporting that or now + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. value = self._align_series(indexer, Series(value)) elif isinstance(value, ABCDataFrame): From c4dab88c29b72a4efb3a4cee7df210cdf9555361 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 08:56:33 -0600 Subject: [PATCH 070/119] Test cleanpu --- pandas/core/internals.py | 2 +- pandas/tests/extension/base.py | 7 +--- pandas/tests/extension/test_categorical.py | 1 - pandas/tests/extension/test_json.py | 40 ---------------------- 4 files changed, 2 insertions(+), 48 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index fffbe18d3008c..e48c4202a7da8 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3473,7 +3473,7 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, else: align_keys = [] - # TODO: may interfere with ExtensionBlock.setitem for blocks + # TODO(EA): may interfere with ExtensionBlock.setitem for blocks # with a .values attribute. 
aligned_args = dict((k, kwargs[k]) for k in align_keys diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index dc9bca653e6f3..51d9da1fe8bab 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -398,17 +398,12 @@ def test_isna(self, data_missing): expected = pd.Series(expected) tm.assert_series_equal(result, expected) - def test_dropna(self, data_missing): - result = pd.Series(data_missing).dropna() - expected = pd.Series(data_missing).iloc[[1]] - tm.assert_series_equal(result, expected) - def test_align(self, data): a = data[:3] b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) - # TODO: assumes that the ctor can take a list of scalars of the type + # Assumes that the ctor can take a list of scalars of the type e1 = pd.Series(type(data)(list(a) + [data._fill_value])) e2 = pd.Series(type(data)([data._fill_value] + list(b))) tm.assert_series_equal(r1, e1) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 237963bc38415..402c53706294b 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -50,7 +50,6 @@ def test_align(self, data): b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) - # TODO: assumes that the ctor can take a list of scalars of the type e1 = pd.Series(type(data)(list(a) + [data._fill_value], dtype=data.dtype)) e2 = pd.Series(type(data)([data._fill_value] + list(b), diff --git a/pandas/tests/extension/test_json.py b/pandas/tests/extension/test_json.py index 515272a4850f9..6d2d227a709fe 100644 --- a/pandas/tests/extension/test_json.py +++ b/pandas/tests/extension/test_json.py @@ -133,43 +133,3 @@ def na_cmp(self): @pytest.mark.skip(reason="Unhashable") def test_value_counts(self, all_data, dropna): pass - - # @pytest.mark.xfail(reason="Difficulty setting sized objects.") - # def test_set_scalar(self): - # pass - # - - @pytest.mark.xfail(reason="Difficulty setting sized objects.") - def test_set_loc_scalar_mixed(self): - # This fails on an np.ndarary(dict) call in _setitem_with_indexer - pass - - # @pytest.mark.xfail(reason="Difficulty setting sized objects.") - # def test_set_loc_scalar_single(self): - # pass - # - - @pytest.mark.xfail(reason="Difficulty setting sized objects.") - def test_set_loc_scalar_multiple_homogoneous(self): - # This fails in _setitem_with_indexer with a - # ValueError: Must have equal len keys and value when setting with - # and iterable - pass - - @pytest.mark.xfail(reason="Difficulty setting sized objects.") - def test_set_iloc_scalar_mixed(self): - # This fails in _setitem_with_indexer with a - # ValueError: Must have equal len keys and value when setting with an - # iterable - pass - - # @pytest.mark.xfail(reason="Difficulty setting sized objects.") - # def test_set_iloc_scalar_single(self): - # pass - # - @pytest.mark.xfail(reason="Difficulty setting sized objects.") - def test_set_iloc_scalar_multiple_homogoneous(self): - # this fails in _setitem_with_indexer with a - # ValueError: Must have equal len keys and value when setting with an - # iterable - pass From a312ba5c59c2e96854a286bde74d7fd4562afbf8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 11:10:22 -0600 Subject: [PATCH 071/119] Always set PANDAS_TESTING_MODE --- .travis.yml | 7 ++++--- circle.yml | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4cbe7f86bd2fa..0129582acdefa 100644 --- a/.travis.yml +++ 
b/.travis.yml @@ -20,6 +20,7 @@ env: # cd pandas-dev/pandas # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas - secure: "EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA=" + - PANDAS_TESTING_MODE: "deprecate" git: # for cloning @@ -55,7 +56,7 @@ matrix: - JOB="3.5_CONDA_BUILD_TEST" TEST_ARGS="--skip-slow --skip-network" CONDA_BUILD_TEST=true - dist: trusty env: - - JOB="3.6" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" CONDA_FORGE=true COVERAGE=true + - JOB="3.6" TEST_ARGS="--skip-slow --skip-network" CONDA_FORGE=true COVERAGE=true # In allow_failures - dist: trusty env: @@ -71,7 +72,7 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" + - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" # In allow_failures - dist: trusty env: @@ -96,7 +97,7 @@ matrix: - xsel - dist: trusty env: - - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" + - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" - dist: trusty env: - JOB="3.6_ASV" ASV=true diff --git a/circle.yml b/circle.yml index 9d49145af54e3..dd322c80d73a0 100644 --- a/circle.yml +++ b/circle.yml @@ -2,6 +2,7 @@ machine: environment: # these are globally set MINICONDA_DIR: /home/ubuntu/miniconda3 + PANDAS_TESTING_MODE: deprecate database: From 758689feb26851a0cdddef61f7b0227c4b23ad20 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 15:44:20 -0600 Subject: [PATCH 072/119] Revert "Always set PANDAS_TESTING_MODE" This reverts commit a312ba5c59c2e96854a286bde74d7fd4562afbf8. 
--- .travis.yml | 7 +++---- circle.yml | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0129582acdefa..4cbe7f86bd2fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,6 @@ env: # cd pandas-dev/pandas # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas - secure: "EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA=" - - PANDAS_TESTING_MODE: "deprecate" git: # for cloning @@ -56,7 +55,7 @@ matrix: - JOB="3.5_CONDA_BUILD_TEST" TEST_ARGS="--skip-slow --skip-network" CONDA_BUILD_TEST=true - dist: trusty env: - - JOB="3.6" TEST_ARGS="--skip-slow --skip-network" CONDA_FORGE=true COVERAGE=true + - JOB="3.6" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" CONDA_FORGE=true COVERAGE=true # In allow_failures - dist: trusty env: @@ -72,7 +71,7 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" + - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" # In allow_failures - dist: trusty env: @@ -97,7 +96,7 @@ matrix: - xsel - dist: trusty env: - - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" + - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" - dist: trusty env: - JOB="3.6_ASV" ASV=true diff --git a/circle.yml b/circle.yml index dd322c80d73a0..9d49145af54e3 100644 --- a/circle.yml +++ b/circle.yml @@ -2,7 +2,6 @@ machine: environment: # these are globally set MINICONDA_DIR: /home/ubuntu/miniconda3 - PANDAS_TESTING_MODE: deprecate database: From 02c3d401771a308b88b1b5d98827c1bb489f223b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 16:31:18 -0600 Subject: [PATCH 073/119] Explicitly catch warnings or not --- pandas/tests/io/test_parquet.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 11cbea8ce6331..7434e58610a34 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -154,10 +154,21 @@ def check_round_trip(df, engine=None, path=None, write_kwargs['engine'] = engine read_kwargs['engine'] = engine + should_warn = (engine == 'pyarrow' and + pyarrow.__version__ <= LooseVersion("0.8.0") and + any(pd.api.types.is_datetime64tz_dtype(dtype) + for dtype in df.dtypes)) + + if should_warn: + warning_type = DeprecationWarning + else: + warning_type = None + def compare(repeat): for _ in range(repeat): df.to_parquet(path, **write_kwargs) - with catch_warnings(record=True): + with tm.assert_produces_warning(warning_type, + check_stacklevel=False): actual = read_parquet(path, **read_kwargs) tm.assert_frame_equal(expected, actual, check_names=check_names) From 9e17037cfb914f715a136df19995e98aa4449ede Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 17:08:50 -0600 Subject: [PATCH 074/119] fastparquet warnings --- pandas/tests/io/test_parquet.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 7434e58610a34..01446962dccef 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -154,12 +154,19 @@ def check_round_trip(df, engine=None, path=None, write_kwargs['engine'] = engine read_kwargs['engine'] = engine - should_warn = (engine == 
'pyarrow' and - pyarrow.__version__ <= LooseVersion("0.8.0") and - any(pd.api.types.is_datetime64tz_dtype(dtype) - for dtype in df.dtypes)) - - if should_warn: + if (engine == 'pyarrow' and + pyarrow.__version__ <= LooseVersion("0.8.0") and + any(pd.api.types.is_datetime64tz_dtype(dtype) + for dtype in df.dtypes)): + # Use of deprecated fastpath in make_block + warning_type = DeprecationWarning + elif (engine == 'fastparquet' and + fastparquet.__version__ <= LooseVersion("0.1.4") and + df.select_dtypes(['bool', 'object']) + .isin([True, False]).any().any() + and (path is None or not path.startswith('s3://'))): + # use of deprecated np.fromstring for boolean columns + # https://github.com/dask/fastparquet/issues/302 warning_type = DeprecationWarning else: warning_type = None From 4599db453346e09ffdb84c5289e343d79213aed0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 14 Feb 2018 06:31:45 -0600 Subject: [PATCH 075/119] Unicode literals strikes again. Only catch fp warning for newer numpy --- pandas/tests/io/test_parquet.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 01446962dccef..b7ee42d6d66f5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -155,17 +155,23 @@ def check_round_trip(df, engine=None, path=None, read_kwargs['engine'] = engine if (engine == 'pyarrow' and - pyarrow.__version__ <= LooseVersion("0.8.0") and + LooseVersion(pyarrow.__version__) <= LooseVersion("0.8.0") and any(pd.api.types.is_datetime64tz_dtype(dtype) for dtype in df.dtypes)): # Use of deprecated fastpath in make_block + # Deprecated in pandas 0.23 and removed in pyarrow 0.9 + # Remove this when all pyarrow builds >= 0.9 warning_type = DeprecationWarning elif (engine == 'fastparquet' and - fastparquet.__version__ <= LooseVersion("0.1.4") and + LooseVersion(fastparquet.__version__) <= LooseVersion("0.1.4") and + LooseVersion(np.__version__) >= LooseVersion("1.14.0") and df.select_dtypes(['bool', 'object']) - .isin([True, False]).any().any() - and (path is None or not path.startswith('s3://'))): + .isin([True, False]).any().any() and + (path is None or not path.startswith('s3://'))): # use of deprecated np.fromstring for boolean columns + # Deprecated in numpy 1.14 + # Used in fastparquet <= 0.1.4 + # Remove when all fastparquet builds >= 0.1.5 # https://github.com/dask/fastparquet/issues/302 warning_type = DeprecationWarning else: From d34d9cadd8526adf06dda9ff53b2104a13530d4e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 14 Feb 2018 13:33:10 -0600 Subject: [PATCH 076/119] Restore circle env var --- circle.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/circle.yml b/circle.yml index 9d49145af54e3..dd322c80d73a0 100644 --- a/circle.yml +++ b/circle.yml @@ -2,6 +2,7 @@ machine: environment: # these are globally set MINICONDA_DIR: /home/ubuntu/miniconda3 + PANDAS_TESTING_MODE: deprecate database: From 29d252827514f5c14433f8b874a5a41d5a22372f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 14 Feb 2018 14:55:54 -0600 Subject: [PATCH 077/119] More parquet test catching --- pandas/tests/io/test_parquet.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b7ee42d6d66f5..4ba7336a986e4 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -166,14 +166,19 @@ def check_round_trip(df, engine=None, path=None, 
LooseVersion(fastparquet.__version__) <= LooseVersion("0.1.4") and LooseVersion(np.__version__) >= LooseVersion("1.14.0") and df.select_dtypes(['bool', 'object']) - .isin([True, False]).any().any() and - (path is None or not path.startswith('s3://'))): + .isin([True, False]).any().any()): # use of deprecated np.fromstring for boolean columns # Deprecated in numpy 1.14 # Used in fastparquet <= 0.1.4 # Remove when all fastparquet builds >= 0.1.5 # https://github.com/dask/fastparquet/issues/302 warning_type = DeprecationWarning + elif (engine == 'fastparquet' and + LooseVersion(fastparquet.__version__) <= LooseVersion("0.1.4") and + any(pd.api.types.is_bool_dtype(df[col]) for col in df.columns)): + # Use of deprecated `dtype` in `make_block` that's hit only for + # bool dtypes with no Nones. + warning_type = DeprecationWarning else: warning_type = None @@ -248,7 +253,16 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): with tm.ensure_clean() as path: df.to_parquet(path, engine=pa, compression=None) - result = read_parquet(path, engine=fp) + if (LooseVersion(fastparquet.__version__) <= LooseVersion('0.1.4') and + LooseVersion(np.__version__) >= LooseVersion('1.14.0')): + # fastparquet used np.fromstring, deprecated in numpy 1.14.0 + expected_warning = DeprecationWarning + else: + expected_warning = None + + with tm.assert_produces_warning(expected_warning): + result = read_parquet(path, engine=fp) + tm.assert_frame_equal(result, df) result = read_parquet(path, engine=fp, columns=['a', 'd']) From 412c951c6e9e2af49866da2ce5f3cf9015bf88a7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 14 Feb 2018 16:55:31 -0600 Subject: [PATCH 078/119] No stacklevel --- pandas/tests/io/test_parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4ba7336a986e4..9ba2c92844995 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -260,7 +260,8 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): else: expected_warning = None - with tm.assert_produces_warning(expected_warning): + with tm.assert_produces_warning(expected_warning, + check_stacklevel=False): result = read_parquet(path, engine=fp) tm.assert_frame_equal(result, df) From 78834f1c165a2a7ffec5a06abc8972bb5631390c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Feb 2018 06:58:51 -0600 Subject: [PATCH 079/119] Lower bound on FP --- pandas/tests/io/test_parquet.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 9ba2c92844995..5c2553979cc6d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -154,6 +154,15 @@ def check_round_trip(df, engine=None, path=None, write_kwargs['engine'] = engine read_kwargs['engine'] = engine + fastparquet_make_block_dtype = ( + # Use of deprecated `dtype` in `make_block` that's hit only for + # bool dtypes with no Nones. 
+ engine == 'fastparquet' and + LooseVersion("0.1.1") < LooseVersion(fastparquet.__version__) <= + LooseVersion("0.1.4") and + any(pd.api.types.is_bool_dtype(df[col]) for col in df.columns) + ) + if (engine == 'pyarrow' and LooseVersion(pyarrow.__version__) <= LooseVersion("0.8.0") and any(pd.api.types.is_datetime64tz_dtype(dtype) @@ -173,11 +182,7 @@ def check_round_trip(df, engine=None, path=None, # Remove when all fastparquet builds >= 0.1.5 # https://github.com/dask/fastparquet/issues/302 warning_type = DeprecationWarning - elif (engine == 'fastparquet' and - LooseVersion(fastparquet.__version__) <= LooseVersion("0.1.4") and - any(pd.api.types.is_bool_dtype(df[col]) for col in df.columns)): - # Use of deprecated `dtype` in `make_block` that's hit only for - # bool dtypes with no Nones. + elif fastparquet_make_block_dtype: warning_type = DeprecationWarning else: warning_type = None From f8eac55e3f1ece94bc4a173cd84874faeb73fc5a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Feb 2018 08:27:22 -0600 Subject: [PATCH 080/119] Exact bound for FP --- pandas/tests/io/test_parquet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 5c2553979cc6d..69b651839f80a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -158,8 +158,7 @@ def check_round_trip(df, engine=None, path=None, # Use of deprecated `dtype` in `make_block` that's hit only for # bool dtypes with no Nones. engine == 'fastparquet' and - LooseVersion("0.1.1") < LooseVersion(fastparquet.__version__) <= - LooseVersion("0.1.4") and + LooseVersion(fastparquet.__version__) == LooseVersion("0.1.4") and any(pd.api.types.is_bool_dtype(df[col]) for col in df.columns) ) @@ -171,6 +170,9 @@ def check_round_trip(df, engine=None, path=None, # Deprecated in pandas 0.23 and removed in pyarrow 0.9 # Remove this when all pyarrow builds >= 0.9 warning_type = DeprecationWarning + # elif (engine == 'fastparquet' and + # LooseVersion(fastparquet.__version__) == LooseVersion('0.1.3')): + # warning_type = DeprecationWarning elif (engine == 'fastparquet' and LooseVersion(fastparquet.__version__) <= LooseVersion("0.1.4") and LooseVersion(np.__version__) >= LooseVersion("1.14.0") and From f09c86334493cfe57b994547f3fdacb2afbc9f4c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Feb 2018 10:37:44 -0600 Subject: [PATCH 081/119] Don't use fastpath for ExtensionBlock make_block --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e48c4202a7da8..b42138343de19 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4867,7 +4867,7 @@ def form_blocks(arrays, names, axes): for i, _, array in items_dict['ExtensionBlock']: external_blocks.append( make_block(array, klass=ExtensionBlock, - fastpath=True, placement=[i])) + placement=[i])) blocks.extend(external_blocks) if len(extra_locs): From cedb63d5b53c80c13d712dd6152acaefb5622801 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 09:12:04 -0600 Subject: [PATCH 082/119] Consistently use _values --- pandas/core/algorithms.py | 2 +- pandas/core/series.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 427ec5af270bb..099a1411ebae7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -546,7 +546,7 @@ def value_counts(values, sort=True, 
ascending=False, normalize=False, if is_extension_array_dtype(values) or is_sparse(values): # handle Categorical and sparse, - result = Series(values).values.value_counts(dropna=dropna) + result = Series(values)._values.value_counts(dropna=dropna) result.name = name counts = result.values diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e36b95c95120..b8338dcfdec63 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2568,8 +2568,8 @@ def _reindex_indexer(self, new_index, indexer, copy): return self # be subclass-friendly - if isinstance(self.values, ExtensionArray): - new_values = self.values.take(indexer) + if isinstance(self._values, ExtensionArray): + new_values = self._values.take(indexer) else: new_values = algorithms.take_1d(self.get_values(), indexer) From cae2c26c0c17b91c466d94d0ce7e0483598ad8fc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 09:30:24 -0600 Subject: [PATCH 083/119] TST: Additional constructor tests --- pandas/tests/extension/base.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index 51d9da1fe8bab..3639c042a9336 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -133,8 +133,13 @@ def test_series_constructor(self, data): assert isinstance(result._data.blocks[0], ExtensionBlock) assert result._data.blocks[0].values is data + # Series[EA] is unboxed / boxed correctly + result2 = pd.Series(result) + assert result2.dtype == data.dtype + assert isinstance(result2._data.blocks[0], ExtensionBlock) + @pytest.mark.parametrize("from_series", [True, False]) - def dataframe_constructor(self, data, from_series): + def test_dataframe_constructor_from_dict(self, data, from_series): if from_series: data = pd.Series(data) result = pd.DataFrame({"A": data}) @@ -142,6 +147,12 @@ def dataframe_constructor(self, data, from_series): assert result.shape == (len(data), 1) assert isinstance(result._data.blocks[0], ExtensionBlock) + def test_dataframe_from_series(self, data): + result = pd.DataFrame(pd.Series(data)) + assert result.dtypes[0] == data.dtype + assert result.shape == (len(data), 1) + assert isinstance(result._data.blocks[0], ExtensionBlock) + @pytest.mark.xfail(reason="GH-19342") def test_series_given_mismatched_index_raises(self, data): msg = 'Wrong number of items passed 3, placement implies 4' From 808809612512fcfe960f081be05096ab5f679486 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 09:30:54 -0600 Subject: [PATCH 084/119] CLN: de-nested a bit --- pandas/core/dtypes/missing.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 002839af6daf2..170fd518f55a6 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -137,22 +137,20 @@ def _isna_ndarraylike(obj): else: values = obj result = values.isna() + elif is_interval_dtype(values): + # TODO(IntervalArray): remove this if block + from pandas import IntervalIndex + result = IntervalIndex(obj).isna() elif is_string_dtype(dtype): - if is_interval_dtype(values): - # TODO(IntervalArray): remove this if block - from pandas import IntervalIndex - result = IntervalIndex(obj).isna() - else: - - # Working around NumPy ticket 1542 - shape = values.shape + # Working around NumPy ticket 1542 + shape = values.shape - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - 
result = np.empty(shape, dtype=bool) - vec = libmissing.isnaobj(values.ravel()) - result[...] = vec.reshape(shape) + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = libmissing.isnaobj(values.ravel()) + result[...] = vec.reshape(shape) elif needs_i8_conversion(obj): # this is the NaT pattern From 8aed325b43e3f2c11cf58504b9755c7a836923a2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 13:38:46 -0600 Subject: [PATCH 085/119] _fill_value handling --- pandas/core/arrays/base.py | 24 ++++++++++++----------- pandas/core/arrays/categorical.py | 2 +- pandas/core/internals.py | 11 +++++------ pandas/tests/categorical/test_missing.py | 25 ++++++++++++++++++++++-- 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e9d56a0e95461..9e416eb8d064c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -27,7 +27,7 @@ class ExtensionArray(object): * copy * _concat_same_type - Some additional methods are required to satisfy pandas' internal, private + Some additional methods are available to satisfy pandas' internal, private block API. * _can_hold_na @@ -98,16 +98,17 @@ def __setitem__(self, key, value): When called from, e.g. ``Series.__setitem__``, ``key`` will always be an ndarray of integers. value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object - ExtensionArrays may + value or values to be set of ``key``. Notes ----- This method is not required to satisfy the interface. If an ExtensionArray chooses to implement __setitem__, then some semantics - should be observed. + should be observed: * Setting multiple values : ExtensionArrays should support setting - multiple values at once, ``key`` will be a sequence of integers. + multiple values at once, ``key`` will be a sequence of integers and + ``value`` will be a same-length sequence. * Broadcasting : For a sequence ``key`` and a scalar ``value``, each position in ``key`` should be set to ``value``. @@ -116,9 +117,6 @@ def __setitem__(self, key, value): example, a string like ``'2018-01-01'`` is coerced to a datetime when setting on a datetime64ns array. In general, if the ``__init__`` method coerces that value, then so should ``__setitem__``. - - When called from, e.g. ``Series.__setitem__``, ``key`` will always - be an ndarray of positions. """ raise NotImplementedError(_not_implemented_message.format( type(self), '__setitem__') @@ -240,8 +238,8 @@ def take(self, indexer, allow_fill=True, fill_value=None): will be done. This short-circuits computation of a mask. Result is undefined if allow_fill == False and -1 is present in indexer. fill_value : any, default None - Fill value to replace -1 values with. By default, this uses - the missing value sentinel for this type, ``self._fill_value``. + Fill value to replace -1 values with. If applicable, this should + use the sentinel missing value for this type. Notes ----- @@ -262,7 +260,7 @@ def take(self, indexer, allow_fill=True, fill_value=None): def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 result = self.data.take(indexer) - result[mask] = self._fill_value + result[mask] = self._fill_value # NA for this type return type(self)(result) See Also @@ -292,7 +290,11 @@ def copy(self, deep=False): @property def _fill_value(self): # type: () -> Any - """The missing value for this type, e.g. np.nan""" + """The missing value for this type, e.g. np.nan. Default None. 
+ + This is not currently used by pandas directly. It is used in the + provided test suite for extension arrays. + """ return None def _formatting_values(self): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d1b231b21f496..784844256d79d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2143,7 +2143,7 @@ def _can_hold_na(self): @property def _fill_value(self): - return np.nan + return self.categories._na_value @classmethod def _concat_same_type(self, to_concat): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b42138343de19..7127da1ace5cb 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1878,7 +1878,7 @@ def _holder(self): @property def _can_hold_na(self): - # The default ExtensionBlock._can_hold_na is True + # The default ExtensionArray._can_hold_na is True return self._holder._can_hold_na @property @@ -4862,12 +4862,11 @@ def form_blocks(arrays, names, axes): if len(items_dict['ExtensionBlock']): - external_blocks = [] + external_blocks = [ + make_block(array, klass=ExtensionBlock, placement=[i]) + for i, _, array in items_dict['ExtensionBlock'] + ] - for i, _, array in items_dict['ExtensionBlock']: - external_blocks.append( - make_block(array, klass=ExtensionBlock, - placement=[i])) blocks.extend(external_blocks) if len(extra_locs): diff --git a/pandas/tests/categorical/test_missing.py b/pandas/tests/categorical/test_missing.py index 79758dee5cfda..061f7f91faae9 100644 --- a/pandas/tests/categorical/test_missing.py +++ b/pandas/tests/categorical/test_missing.py @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- - import numpy as np +import pytest import pandas.util.testing as tm -from pandas import (Categorical, Index, isna) +from pandas import (Categorical, Index, DatetimeIndex, isna, NaT, + TimedeltaIndex) from pandas.compat import lrange from pandas.core.dtypes.dtypes import CategoricalDtype @@ -53,3 +54,23 @@ def test_set_item_nan(self): exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(cat, exp) + + @pytest.mark.parametrize('arr', [ + DatetimeIndex(['2017', '2018']), + DatetimeIndex(['2017', '2018'], tz='US/Central'), + DatetimeIndex(['2017', '2018'], tz='US/Central'), + TimedeltaIndex(['10s', '201s']), + ]) + def test_fill_value_nat(self, arr): + cat = Categorical(arr) + assert cat._fill_value is NaT + + @pytest.mark.parametrize('arr', [ + [0, 1], + [True, False], + ['a', 'b'], + [0.0, 1.0], + ]) + def test_fill_value_nan(self, arr): + cat = Categorical(arr) + assert isna(cat._fill_value) From 453728a09e38380dfc9cecc1b9a6c68a8d4f1384 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 14:10:42 -0600 Subject: [PATCH 086/119] Handle user provided dtype in constructors. When the dtype matches, we allow it to proceed. When the dtype would require coercion, we raise. --- pandas/core/series.py | 21 ++++++++++---- pandas/tests/extension/test_decimal.py | 38 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index b8338dcfdec63..9e98908f601c8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -21,6 +21,7 @@ is_integer, is_integer_dtype, is_float_dtype, is_extension_type, + is_extension_array_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, @@ -208,13 +209,15 @@ def __init__(self, data=None, index=None, dtype=None, name=None, '`data` argument and a different ' '`index` argument. 
`copy` must ' 'be False.') - elif isinstance(data, Categorical): + + elif is_extension_array_dtype(data) and dtype is not None: # GH12574: Allow dtype=category only, otherwise error - if ((dtype is not None) and - not is_categorical_dtype(dtype)): - raise ValueError("cannot specify a dtype with a " - "Categorical unless " - "dtype='category'") + if not data.dtype.is_dtype(dtype): + raise ValueError("Cannot specify a dtype '{}' with an " + "extension array of a different " + "dtype ('{}').".format(dtype, + data.dtype)) + elif (isinstance(data, types.GeneratorType) or (compat.PY3 and isinstance(data, map))): data = list(data) @@ -3206,6 +3209,12 @@ def _try_cast(arr, take_fast_path): elif isinstance(data, ExtensionArray): subarr = data + if dtype is not None and not data.dtype.is_dtype(dtype): + msg = ("Cannot coerce extension array to dtype '{typ}'. " + "Do the coercion before passing to the constructor " + "instead.".format(typ=dtype)) + raise ValueError(msg) + if copy: subarr = data.copy() return subarr diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/test_decimal.py index 687e645825a75..62f7966455b48 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/test_decimal.py @@ -141,3 +141,41 @@ def test_value_counts(self, all_data, dropna): expected = pd.Series(other).value_counts(dropna=dropna).sort_index() tm.assert_series_equal(result, expected) + + +def test_series_constructor_with_dtype_coercion_raises(): + xpr = ("Cannot coerce data to extension dtype 'decimal'. Pass the " + "extension array for 'decimal' directly instead.") + with tm.assert_raises_regex(ValueError, xpr): + pd.Series([0, 1, 2], dtype=DecimalDtype()) + + +def test_series_constructor_with_same_dtype_ok(): + arr = DecimalArray([decimal.Decimal('10.0')]) + result = pd.Series(arr, dtype=DecimalDtype()) + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) + + +def test_series_constructor_with_different_dtype_raises(): + arr = DecimalArray([decimal.Decimal('10.0')]) + xpr = "Cannot specify a dtype 'int64' .* \('decimal'\)." + + with tm.assert_raises_regex(ValueError, xpr): + pd.Series(arr, dtype='int64') + + +def test_dataframe_constructor_with_same_dtype_ok(): + arr = DecimalArray([decimal.Decimal('10.0')]) + + result = pd.DataFrame({"A": arr}, dtype=DecimalDtype()) + expected = pd.DataFrame({"A": arr}) + tm.assert_frame_equal(result, expected) + + +def test_dataframe_constructor_with_different_dtype_raises(): + arr = DecimalArray([decimal.Decimal('10.0')]) + + xpr = "Cannot coerce extension array to dtype 'int64'. " + with tm.assert_raises_regex(ValueError, xpr): + pd.DataFrame({"A": arr}, dtype='int64') From cc13c8dd66513dce6cee9e69bb9623579c120cee Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 14:22:25 -0600 Subject: [PATCH 087/119] Document ExtensionBlock._maybe_coerce_values Also changes to use _values as we should --- pandas/core/internals.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7127da1ace5cb..986a6674128be 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1865,10 +1865,21 @@ def __init__(self, values, placement, ndim=None): super(ExtensionBlock, self).__init__(values, placement, ndim) def _maybe_coerce_values(self, values): - # Unboxes Series / Index - # Doesn't change any underlying dtypes. + """Unbox to an extension array. + + This will unbox an ExtensionArray stored in an Index or Series. 
+ ExtensionArrays pass through. No dtype coercion is done. + + Parameters + ---------- + values : Index, Series, ExtensionArray + + Returns + ------- + ExtensionArray + """ if isinstance(values, (ABCIndexClass, ABCSeries)): - values = values.values + values = values._values return values @property From f90ac0732915d40f213f7a9571606acfafec355b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 14:56:21 -0600 Subject: [PATCH 088/119] Created ABCExtensionArray --- pandas/core/arrays/base.py | 1 + pandas/core/dtypes/base.py | 2 +- pandas/core/dtypes/generic.py | 2 ++ pandas/core/dtypes/missing.py | 7 +++---- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9e416eb8d064c..8db518552559f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -54,6 +54,7 @@ class ExtensionArray(object): the class, i.e. ``ExtensionArray(extension_array)`` should return an instance, not error. """ + _typ = 'extension' # For pandas.core.dtypes.generic.ABCExtensionArray # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 2f071a3b3cf71..17d375e67808b 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -122,7 +122,7 @@ def is_dtype(cls, dtype): 1. ``cls.construct_from_string(dtype)`` is an instance of ``cls``. 2. ``dtype`` is an object and is an instance of ``cls`` - 3. 'dtype' is a class and is ``cls`` or a subclass of ``cls``. + 3. ``dtype`` is a class and is ``cls`` or a subclass of ``cls``. """ if isinstance(dtype, str): try: diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index b032cb6f14d4c..b841322bf93e1 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -57,6 +57,8 @@ def _check(cls, inst): ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", )) +ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ", + ("extension",)) class _ABCGeneric(type): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 170fd518f55a6..b4a05a24aabc9 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -5,7 +5,8 @@ from pandas._libs import lib, missing as libmissing from pandas._libs.tslib import NaT, iNaT from .generic import (ABCMultiIndex, ABCSeries, - ABCIndexClass, ABCGeneric) + ABCIndexClass, ABCGeneric, + ABCExtensionArray) from .common import (is_string_dtype, is_datetimelike, is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, @@ -53,15 +54,13 @@ def isna(obj): def _isna_new(obj): - from ..arrays import ExtensionArray - if is_scalar(obj): return libmissing.checknull(obj) # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, - ExtensionArray)): + ABCExtensionArray)): return _isna_ndarraylike(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isna(func=isna)) From 4a03b26170b39b41727a64d6cadd43652685ce01 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 14:56:44 -0600 Subject: [PATCH 089/119] TST: Tests for is_object_dtype and is_string_dtype and EAs 
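
(Illustrative sketch, not part of the patch below: the new test names encode the
expectation that a third-party extension dtype registers as neither object nor
string. The snippet reuses the DecimalDtype defined in
pandas/tests/extension/test_decimal.py earlier in this series; the import path
and the asserts are assumed from those diffs and from the test names, not taken
from this patch itself.)

    from pandas.api.types import is_object_dtype, is_string_dtype
    from pandas.tests.extension.test_decimal import DecimalDtype

    dtype = DecimalDtype()
    # The property the new tests are named for: an ExtensionDtype should not
    # be mistaken for an object or string dtype by these predicates.
    assert not is_object_dtype(dtype)
    assert not is_string_dtype(dtype)
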
--- pandas/tests/extension/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index 3639c042a9336..eb55c2c8c28f6 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -43,6 +43,12 @@ def test_is_dtype_from_self(self, dtype): result = type(dtype).is_dtype(dtype) assert result is True + def test_is_not_string_type(self, dtype): + return not pd.api.types.is_string_dtype(dtype) + + def test_is_not_object_type(self, dtype): + return not pd.api.types.is_object_dtype(dtype) + class BaseArrayTests(object): """Base class for extension array classes. From 635223fc006a3ade4a3e8cdb01b51043ebb74983 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 21:03:49 -0600 Subject: [PATCH 090/119] fixup! Handle user provided dtype in constructors. --- pandas/core/series.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e98908f601c8..7e98ff4deb5d7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3152,6 +3152,13 @@ def _sanitize_array(data, index, dtype=None, copy=False, if dtype is not None: dtype = pandas_dtype(dtype) + if is_extension_array_dtype(dtype) and not is_extension_array_dtype(data): + # Just check for any extension dtype data here. We validatate that + # the exact types match later. + raise ValueError("Cannot coerce data to extension dtype '{type}'. " + "Pass the extension array for '{type}' " + "directly instead.".format(type=dtype)) + if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): From cf423a72b9d345d69c22bc8e91a3fe20845ea0d8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 21:34:55 -0600 Subject: [PATCH 091/119] Doc for setitem --- pandas/core/arrays/base.py | 10 ++++++++-- pandas/core/frame.py | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8db518552559f..3d32adbc35f53 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -96,8 +96,14 @@ def __setitem__(self, key, value): Parameters ---------- key : int or ndarray - When called from, e.g. ``Series.__setitem__``, ``key`` will - always be an ndarray of integers. + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object value or values to be set of ``key``. 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 86b6405a2617a..5eb729ffd77b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5638,6 +5638,8 @@ def count(self, axis=0, level=None, numeric_only=False): result = Series(0, index=frame._get_agg_axis(axis)) else: if frame._is_mixed_type or frame._data.any_extension_types: + # the or any_extension_types is really only hit for single- + # column frames with an extension array result = notna(frame).sum(axis=axis) else: counts = notna(frame.values).sum(axis=axis) From 2d1a66c4ff44ba88210533e0e281c6fb411ef9fe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 22:07:20 -0600 Subject: [PATCH 092/119] Split base tests --- pandas/tests/extension/base.py | 243 +++++---------------- pandas/tests/extension/conftest.py | 41 ++++ pandas/tests/extension/test_categorical.py | 64 ++++-- pandas/tests/extension/test_decimal.py | 54 +++-- pandas/tests/extension/test_json.py | 60 +++-- 5 files changed, 214 insertions(+), 248 deletions(-) create mode 100644 pandas/tests/extension/conftest.py diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index eb55c2c8c28f6..fb60b79e75ac5 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -1,5 +1,3 @@ -import operator - import numpy as np import pytest @@ -14,11 +12,6 @@ class BaseDtypeTests(object): """Base class for ExtensionDtype classes""" - @pytest.fixture - def dtype(self): - """A fixture providing the ExtensionDtype to validate.""" - raise NotImplementedError - def test_name(self, dtype): assert isinstance(dtype.name, str) @@ -50,46 +43,8 @@ def test_is_not_object_type(self, dtype): return not pd.api.types.is_object_dtype(dtype) -class BaseArrayTests(object): - """Base class for extension array classes. - - Subclasses should implement the following fixtures - - * data - * data_missing - """ - - # ------------------------------------------------------------------------ - # Fixtures - # ------------------------------------------------------------------------ - @pytest.fixture - def data(self): - """Length-100 array for this type.""" - raise NotImplementedError - - @pytest.fixture - def data_missing(self): - """Length-2 array with [NA, Valid]""" - raise NotImplementedError - - @pytest.fixture(params=['data', 'data_missing']) - def all_data(self, request, data, data_missing): - if request.param == 'data': - return data - elif request.param == 'data_missing': - return data_missing - - @pytest.fixture - def na_cmp(self): - """Binary operator for comparing NA values. - - Should return a function of two arguments that returns - True if both arguments are (scalar) NA for your type. 
- - By defult, uses ``operator.or`` - """ - return operator.is_ - +class BaseInterfaceTests(object): + """Tests that the basic interface is satisfied.""" # ------------------------------------------------------------------------ # Interface # ------------------------------------------------------------------------ @@ -128,9 +83,14 @@ def test_dtype_name_in_info(self, data): result = buf.getvalue() assert data.dtype.name in result - # ------------------------------------------------------------------------ - # Constructors - # ------------------------------------------------------------------------ + def test_is_extension_array_dtype(self, data): + assert is_extension_array_dtype(data) + assert is_extension_array_dtype(data.dtype) + assert is_extension_array_dtype(pd.Series(data)) + assert isinstance(data.dtype, ExtensionDtype) + + +class BaseConstructorsTests(object): def test_series_constructor(self, data): result = pd.Series(data) @@ -167,10 +127,9 @@ def test_series_given_mismatched_index_raises(self, data): assert m.match(msg) - # ------------------------------------------------------------------------ - # Reshaping - # ------------------------------------------------------------------------ +class BaseReshapingTests(object): + """Tests for reshaping and concatenation.""" def test_concat(self, data): result = pd.concat([ pd.Series(data), @@ -180,9 +139,20 @@ def test_concat(self, data): assert result.dtype == data.dtype assert isinstance(result._data.blocks[0], ExtensionBlock) - # ------------------------------------------------------------------------ - # Indexing - getting - # ------------------------------------------------------------------------ + def test_align(self, data): + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(type(data)(list(a) + [data._fill_value])) + e2 = pd.Series(type(data)([data._fill_value] + list(b))) + tm.assert_series_equal(r1, e1) + tm.assert_series_equal(r2, e2) + + +class BaseGetitemTests(object): + """Tests for ExtensionArray.__getitem__.""" def test_iloc_series(self, data): ser = pd.Series(data) @@ -246,12 +216,6 @@ def test_loc_frame(self, data): result = df.loc[:3, 'A'] tm.assert_series_equal(result, expected) - def test_is_extension_array_dtype(self, data): - assert is_extension_array_dtype(data) - assert is_extension_array_dtype(data.dtype) - assert is_extension_array_dtype(pd.Series(data)) - assert isinstance(data.dtype, ExtensionDtype) - def test_getitem_scalar(self, data): result = data[0] assert isinstance(result, data.dtype.type) @@ -301,107 +265,8 @@ def test_take_sequence(self, data): assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] - # ------------------------------------------------------------------------ - # Indexing - Setting - # ------------------------------------------------------------------------ - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_setitem_scalar(self, data): - arr = pd.Series(data) - arr[0] = data[1] - assert arr[0] == data[1] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_setitem_sequence(self, data): - arr = pd.Series(data) - original = data.copy() - - arr[[0, 1]] = [data[1], data[0]] - assert arr[0] == original[1] - assert arr[1] == original[0] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_setitem_sequence_broadcasts(self, data): - arr = pd.Series(data) - 
- arr[[0, 1]] = data[2] - assert arr[0] == data[2] - assert arr[1] == data[2] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - @pytest.mark.parametrize('setter', ['loc', 'iloc']) - def test_set_scalar(self, data, setter): - arr = pd.Series(data) - setter = getattr(arr, setter) - operator.setitem(setter, 0, data[1]) - assert arr[0] == data[1] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_set_loc_scalar_mixed(self, data): - df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - df.loc[0, 'B'] = data[1] - assert df.loc[0, 'B'] == data[1] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_set_loc_scalar_single(self, data): - df = pd.DataFrame({"B": data}) - df.loc[10, 'B'] = data[1] - assert df.loc[10, 'B'] == data[1] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_set_loc_scalar_multiple_homogoneous(self, data): - df = pd.DataFrame({"A": data, "B": data}) - df.loc[10, 'B'] = data[1] - assert df.loc[10, 'B'] == data[1] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_set_iloc_scalar_mixed(self, data): - df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - df.iloc[0, 1] = data[1] - assert df.loc[0, 'B'] == data[1] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_set_iloc_scalar_single(self, data): - df = pd.DataFrame({"B": data}) - df.iloc[10, 0] = data[1] - assert df.loc[10, 'B'] == data[1] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_set_iloc_scalar_multiple_homogoneous(self, data): - df = pd.DataFrame({"A": data, "B": data}) - df.iloc[10, 1] = data[1] - assert df.loc[10, 'B'] == data[1] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_set_mask_aligned(self, data): - ser = pd.Series(data) - mask = np.zeros(len(data), dtype=bool) - mask[:2] = True - - ser[mask] = data[5:7] - assert ser[0] == data[5] - assert ser[1] == data[6] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_set_mask_broadcast(self, data): - ser = pd.Series(data) - mask = np.zeros(len(data), dtype=bool) - mask[:2] = True - - ser[mask] = data[10] - assert ser[0] == data[10] - assert ser[1] == data[10] - - @pytest.mark.xfail(reason="ExtensionBlock.__setitem__ not implemented.") - def test_setitem_expand_columns(self, data): - df = pd.DataFrame({"A": data}) - df['B'] = 1 - assert len(df.columns) == 2 - - # ------------------------------------------------------------------------ - # Methods - # ------------------------------------------------------------------------ +class BaseMissingTests(object): def test_isna(self, data_missing): if data_missing._can_hold_na: expected = np.array([True, False]) @@ -415,36 +280,6 @@ def test_isna(self, data_missing): expected = pd.Series(expected) tm.assert_series_equal(result, expected) - def test_align(self, data): - a = data[:3] - b = data[2:5] - r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) - - # Assumes that the ctor can take a list of scalars of the type - e1 = pd.Series(type(data)(list(a) + [data._fill_value])) - e2 = pd.Series(type(data)([data._fill_value] + list(b))) - tm.assert_series_equal(r1, e1) - tm.assert_series_equal(r2, e2) - - @pytest.mark.parametrize('dropna', [True, False]) - def test_value_counts(self, all_data, dropna): - all_data = all_data[:10] - if dropna: - other = np.array(all_data[~all_data.isna()]) - 
else: - other = all_data - - result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() - expected = pd.Series(other).value_counts(dropna=dropna).sort_index() - - tm.assert_series_equal(result, expected) - - def test_count(self, data_missing): - df = pd.DataFrame({"A": data_missing}) - result = df.count(axis='columns') - expected = pd.Series([0, 1]) - tm.assert_series_equal(result, expected) - def test_dropna_series(self, data_missing): ser = pd.Series(data_missing) result = ser.dropna() @@ -470,3 +305,27 @@ def test_dropna_frame(self, data_missing): result = df.dropna() expected = df.iloc[:0] tm.assert_frame_equal(result, expected) + + +class BaseMethodsTests(object): + """Various Series and DataFrame methods.""" + + @pytest.mark.parametrize('dropna', [True, False]) + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + def test_count(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + result = df.count(axis='columns') + expected = pd.Series([0, 1]) + tm.assert_series_equal(result, expected) + diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py new file mode 100644 index 0000000000000..d49c6184c5494 --- /dev/null +++ b/pandas/tests/extension/conftest.py @@ -0,0 +1,41 @@ +import operator + +import pytest + + +@pytest.fixture +def dtype(): + """A fixture providing the ExtensionDtype to validate.""" + raise NotImplementedError + + +@pytest.fixture +def data(): + """Length-100 array for this type.""" + raise NotImplementedError + + +@pytest.fixture +def data_missing(): + """Length-2 array with [NA, Valid]""" + raise NotImplementedError + + +@pytest.fixture(params=['data', 'data_missing']) +def all_data(request, data, data_missing): + if request.param == 'data': + return data + elif request.param == 'data_missing': + return data_missing + + +@pytest.fixture +def na_cmp(): + """Binary operator for comparing NA values. + + Should return a function of two arguments that returns + True if both arguments are (scalar) NA for your type. + + By defult, uses ``operator.or`` + """ + return operator.is_ diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 402c53706294b..cc22fdc953859 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -7,43 +7,46 @@ import pandas.util.testing as tm from pandas.api.types import CategoricalDtype from pandas import Categorical -from .base import BaseArrayTests, BaseDtypeTests - - -class TestCategoricalDtype(BaseDtypeTests): - @pytest.fixture - def dtype(self): - return CategoricalDtype() +from . 
import base def make_data(): return np.random.choice(list(string.ascii_letters), size=100) -class TestCategoricalArray(BaseArrayTests): +@pytest.fixture +def dtype(): + return CategoricalDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + return Categorical(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return Categorical([np.nan, 'A']) + - @pytest.fixture - def data(self): - """Length-100 PeriodArray for semantics test.""" - return Categorical(make_data()) +class TestDtype(base.BaseDtypeTests): + pass - @pytest.fixture - def data_missing(self): - """Length 2 array with [NA, Valid]""" - return Categorical([np.nan, 'A']) +class TestInterface(base.BaseInterfaceTests): @pytest.mark.skip(reason="Memory usage doesn't match") def test_memory_usage(self): # Is this deliberate? pass - @pytest.mark.skip(reason="Backwards compatability") - def test_getitem_scalar(self): - # CategoricalDtype.type isn't "correct" since it should - # be a parent of the elements (object). But don't want - # to break things by changing. - pass +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): def test_align(self, data): # Override to pass through dtype a = data[:3] @@ -57,6 +60,23 @@ def test_align(self, data): tm.assert_series_equal(r1, e1) tm.assert_series_equal(r2, e2) + +class TestGetitem(base.BaseGetitemTests): + @pytest.mark.skip(reason="Backwards compatability") + def test_getitem_scalar(self): + # CategoricalDtype.type isn't "correct" since it should + # be a parent of the elements (object). But don't want + # to break things by changing. + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + pass + @pytest.mark.skip(reason="Different value_counts semantics.") def test_value_counts(self, all_data, dropna): pass diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/test_decimal.py index 62f7966455b48..c574c9556f508 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/test_decimal.py @@ -11,7 +11,7 @@ from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype -from .base import BaseDtypeTests, BaseArrayTests +from . 
import base class DecimalDtype(ExtensionDtype): @@ -92,27 +92,39 @@ def make_data(): return [decimal.Decimal(random.random()) for _ in range(100)] -class TestDecimalDtype(BaseDtypeTests): +@pytest.fixture +def dtype(): + return DecimalDtype() - @pytest.fixture - def dtype(self): - return DecimalDtype() +@pytest.fixture +def data(): + return DecimalArray(make_data()) -class TestDecimalArray(BaseArrayTests): - @pytest.fixture - def data(self): - return DecimalArray(make_data()) +@pytest.fixture +def data_missing(): + return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) - @pytest.fixture - def data_missing(self): - return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) - @pytest.fixture - def na_cmp(self): - return lambda x, y: x.is_nan() and y.is_nan() +@pytest.fixture +def na_cmp(): + return lambda x, y: x.is_nan() and y.is_nan() + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): def test_align(self, data): a = data[:3] b = data[2:5] @@ -129,7 +141,17 @@ def test_align(self, data): assert r2[0].is_nan() assert e2[0].is_nan() - @pytest.mark.skip(reason="NaN Sorting") + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.xfail(reason="NaN Sorting") def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: diff --git a/pandas/tests/extension/test_json.py b/pandas/tests/extension/test_json.py index 6d2d227a709fe..466b8c191b533 100644 --- a/pandas/tests/extension/test_json.py +++ b/pandas/tests/extension/test_json.py @@ -13,8 +13,7 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.arrays import ExtensionArray -from .base import BaseArrayTests, BaseDtypeTests - +from . 
import base pytestmark = pytest.mark.skipif(sys.version_info[0] == 2, reason="Py2 doesn't have a UserDict") @@ -108,28 +107,53 @@ def make_data(): for _ in range(random.randint(0, 10))]) for _ in range(100)] -class TestJSONDtype(BaseDtypeTests): - @pytest.fixture - def dtype(self): - return JSONDtype() +@pytest.fixture +def dtype(): + return JSONDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + return JSONArray(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {'a': 10}]) + + +@pytest.fixture +def na_cmp(): + return operator.eq + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass -class TestJSON(BaseArrayTests): +class TestGetitem(base.BaseGetitemTests): + pass - @pytest.fixture - def data(self): - """Length-100 PeriodArray for semantics test.""" - return JSONArray(make_data()) - @pytest.fixture - def data_missing(self): - """Length 2 array with [NA, Valid]""" - return JSONArray([{}, {'a': 10}]) +class TestMissing(base.BaseMissingTests): + pass - @pytest.fixture - def na_cmp(self): - return operator.eq +class TestMethods(base.BaseMethodsTests): @pytest.mark.skip(reason="Unhashable") def test_value_counts(self, all_data, dropna): pass From c849865fccbd9f35dbe91fac6c583c6d920d9990 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 22:14:46 -0600 Subject: [PATCH 093/119] Revert test_parquet changes --- circle.yml | 1 - pandas/tests/io/test_parquet.py | 50 ++------------------------------- 2 files changed, 2 insertions(+), 49 deletions(-) diff --git a/circle.yml b/circle.yml index dd322c80d73a0..9d49145af54e3 100644 --- a/circle.yml +++ b/circle.yml @@ -2,7 +2,6 @@ machine: environment: # these are globally set MINICONDA_DIR: /home/ubuntu/miniconda3 - PANDAS_TESTING_MODE: deprecate database: diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 69b651839f80a..11cbea8ce6331 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -154,46 +154,10 @@ def check_round_trip(df, engine=None, path=None, write_kwargs['engine'] = engine read_kwargs['engine'] = engine - fastparquet_make_block_dtype = ( - # Use of deprecated `dtype` in `make_block` that's hit only for - # bool dtypes with no Nones. 
- engine == 'fastparquet' and - LooseVersion(fastparquet.__version__) == LooseVersion("0.1.4") and - any(pd.api.types.is_bool_dtype(df[col]) for col in df.columns) - ) - - if (engine == 'pyarrow' and - LooseVersion(pyarrow.__version__) <= LooseVersion("0.8.0") and - any(pd.api.types.is_datetime64tz_dtype(dtype) - for dtype in df.dtypes)): - # Use of deprecated fastpath in make_block - # Deprecated in pandas 0.23 and removed in pyarrow 0.9 - # Remove this when all pyarrow builds >= 0.9 - warning_type = DeprecationWarning - # elif (engine == 'fastparquet' and - # LooseVersion(fastparquet.__version__) == LooseVersion('0.1.3')): - # warning_type = DeprecationWarning - elif (engine == 'fastparquet' and - LooseVersion(fastparquet.__version__) <= LooseVersion("0.1.4") and - LooseVersion(np.__version__) >= LooseVersion("1.14.0") and - df.select_dtypes(['bool', 'object']) - .isin([True, False]).any().any()): - # use of deprecated np.fromstring for boolean columns - # Deprecated in numpy 1.14 - # Used in fastparquet <= 0.1.4 - # Remove when all fastparquet builds >= 0.1.5 - # https://github.com/dask/fastparquet/issues/302 - warning_type = DeprecationWarning - elif fastparquet_make_block_dtype: - warning_type = DeprecationWarning - else: - warning_type = None - def compare(repeat): for _ in range(repeat): df.to_parquet(path, **write_kwargs) - with tm.assert_produces_warning(warning_type, - check_stacklevel=False): + with catch_warnings(record=True): actual = read_parquet(path, **read_kwargs) tm.assert_frame_equal(expected, actual, check_names=check_names) @@ -260,17 +224,7 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): with tm.ensure_clean() as path: df.to_parquet(path, engine=pa, compression=None) - if (LooseVersion(fastparquet.__version__) <= LooseVersion('0.1.4') and - LooseVersion(np.__version__) >= LooseVersion('1.14.0')): - # fastparquet used np.fromstring, deprecated in numpy 1.14.0 - expected_warning = DeprecationWarning - else: - expected_warning = None - - with tm.assert_produces_warning(expected_warning, - check_stacklevel=False): - result = read_parquet(path, engine=fp) - + result = read_parquet(path, engine=fp) tm.assert_frame_equal(result, df) result = read_parquet(path, engine=fp, columns=['a', 'd']) From c3ec8226ed3bf37361f45b5760cc3a3f51ea9fc5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 22:32:24 -0600 Subject: [PATCH 094/119] API: Removed _fill_value from the interface --- pandas/core/arrays/base.py | 10 --------- pandas/core/arrays/categorical.py | 4 ---- pandas/tests/categorical/test_missing.py | 24 +--------------------- pandas/tests/extension/base.py | 10 ++++----- pandas/tests/extension/conftest.py | 6 ++++++ pandas/tests/extension/test_categorical.py | 11 +++++++--- pandas/tests/extension/test_decimal.py | 15 +++++++++----- pandas/tests/extension/test_json.py | 11 +++++++--- 8 files changed, 38 insertions(+), 53 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3d32adbc35f53..9c7c395ef994f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -32,7 +32,6 @@ class ExtensionArray(object): * _can_hold_na * _formatting_values - * _fill_value This class does not inherit from 'abc.ABCMeta' for performance reasons. 
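    A very rough sketch of a minimal subclass, backed by a plain NumPy object
    array, is shown below. ``MyDtype``, ``MyArray`` and ``self.data`` are
    illustrative names only, and private block-API pieces such as
    ``_can_hold_na`` are omitted, so treat this as a starting point rather
    than a complete implementation.

    .. code-block:: python

        import numpy as np

        from pandas.core.arrays import ExtensionArray
        from pandas.core.dtypes.base import ExtensionDtype


        class MyDtype(ExtensionDtype):
            name = 'my_dtype'
            type = object

            @classmethod
            def construct_from_string(cls, string):
                if string == cls.name:
                    return cls()
                raise TypeError("Cannot construct a '{}' from "
                                "'{}'".format(cls.__name__, string))


        class MyArray(ExtensionArray):
            dtype = MyDtype()

            def __init__(self, values):
                # store the data in a plain object-dtype ndarray
                self.data = np.asarray(values, dtype=object)

            def __getitem__(self, item):
                if isinstance(item, (int, np.integer)):
                    # scalar key -> scalar value
                    return self.data[item]
                # slices and boolean masks -> a new MyArray
                return type(self)(self.data[item])

            def __len__(self):
                return len(self.data)

            @property
            def nbytes(self):
                return self.data.nbytes

            def isna(self):
                return np.array([v is None for v in self.data], dtype=bool)

            def take(self, indexer, allow_fill=True, fill_value=None):
                indexer = np.asarray(indexer)
                out = self.data.take(indexer)
                if allow_fill:
                    # a -1 in the indexer means "missing" in this sketch
                    out[indexer == -1] = None
                return type(self)(out)

            def copy(self, deep=False):
                return type(self)(self.data.copy())

            def _formatting_values(self):
                return self.data

            @classmethod
            def _concat_same_type(cls, to_concat):
                return cls(np.concatenate([array.data for array in to_concat]))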
Methods and properties required by the interface raise @@ -294,15 +293,6 @@ def copy(self, deep=False): # ------------------------------------------------------------------------ # Block-related methods # ------------------------------------------------------------------------ - @property - def _fill_value(self): - # type: () -> Any - """The missing value for this type, e.g. np.nan. Default None. - - This is not currently used by pandas directly. It is used in the - provided test suite for extension arrays. - """ - return None def _formatting_values(self): # type: () -> np.ndarray diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 784844256d79d..bcf9cb7646704 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2141,10 +2141,6 @@ def repeat(self, repeats, *args, **kwargs): def _can_hold_na(self): return True - @property - def _fill_value(self): - return self.categories._na_value - @classmethod def _concat_same_type(self, to_concat): from pandas.core.dtypes.concat import _concat_categorical diff --git a/pandas/tests/categorical/test_missing.py b/pandas/tests/categorical/test_missing.py index 061f7f91faae9..c8ac6a6ef14f8 100644 --- a/pandas/tests/categorical/test_missing.py +++ b/pandas/tests/categorical/test_missing.py @@ -1,10 +1,8 @@ # -*- coding: utf-8 -*- import numpy as np -import pytest import pandas.util.testing as tm -from pandas import (Categorical, Index, DatetimeIndex, isna, NaT, - TimedeltaIndex) +from pandas import Categorical, Index, isna from pandas.compat import lrange from pandas.core.dtypes.dtypes import CategoricalDtype @@ -54,23 +52,3 @@ def test_set_item_nan(self): exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(cat, exp) - - @pytest.mark.parametrize('arr', [ - DatetimeIndex(['2017', '2018']), - DatetimeIndex(['2017', '2018'], tz='US/Central'), - DatetimeIndex(['2017', '2018'], tz='US/Central'), - TimedeltaIndex(['10s', '201s']), - ]) - def test_fill_value_nat(self, arr): - cat = Categorical(arr) - assert cat._fill_value is NaT - - @pytest.mark.parametrize('arr', [ - [0, 1], - [True, False], - ['a', 'b'], - [0.0, 1.0], - ]) - def test_fill_value_nan(self, arr): - cat = Categorical(arr) - assert isna(cat._fill_value) diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index fb60b79e75ac5..815f4a61dea7e 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -139,14 +139,14 @@ def test_concat(self, data): assert result.dtype == data.dtype assert isinstance(result._data.blocks[0], ExtensionBlock) - def test_align(self, data): + def test_align(self, data, na_value): a = data[:3] b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) # Assumes that the ctor can take a list of scalars of the type - e1 = pd.Series(type(data)(list(a) + [data._fill_value])) - e2 = pd.Series(type(data)([data._fill_value] + list(b))) + e1 = pd.Series(type(data)(list(a) + [na_value])) + e2 = pd.Series(type(data)([na_value] + list(b))) tm.assert_series_equal(r1, e1) tm.assert_series_equal(r2, e2) @@ -223,9 +223,9 @@ def test_getitem_scalar(self, data): result = pd.Series(data)[0] assert isinstance(result, data.dtype.type) - def test_getitem_scalar_na(self, data_missing, na_cmp): + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): result = data_missing[0] - assert na_cmp(result, data_missing._fill_value) + assert na_cmp(result, na_value) def test_getitem_mask(self, data): # Empty mask, raw array diff 
--git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index d49c6184c5494..3e57dde385f6c 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -39,3 +39,9 @@ def na_cmp(): By defult, uses ``operator.or`` """ return operator.is_ + + +@pytest.fixture +def na_value(self): + """The scalar missing value for this type.""" + return None diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index cc22fdc953859..af56afed07f4d 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -31,6 +31,11 @@ def data_missing(): return Categorical([np.nan, 'A']) +@pytest.fixture +def na_value(): + return np.nan + + class TestDtype(base.BaseDtypeTests): pass @@ -47,15 +52,15 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - def test_align(self, data): + def test_align(self, data, na_value): # Override to pass through dtype a = data[:3] b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) - e1 = pd.Series(type(data)(list(a) + [data._fill_value], + e1 = pd.Series(type(data)(list(a) + [na_value], dtype=data.dtype)) - e2 = pd.Series(type(data)([data._fill_value] + list(b), + e2 = pd.Series(type(data)([na_value] + list(b), dtype=data.dtype)) tm.assert_series_equal(r1, e1) tm.assert_series_equal(r2, e2) diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/test_decimal.py index c574c9556f508..1758d0ed89d49 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/test_decimal.py @@ -75,12 +75,12 @@ def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 out = self.values.take(indexer) - out[mask] = self._fill_value + out[mask] = self._na_value return type(self)(out) @property - def _fill_value(self): + def _na_value(self): return decimal.Decimal('NaN') @classmethod @@ -112,6 +112,11 @@ def na_cmp(): return lambda x, y: x.is_nan() and y.is_nan() +@pytest.fixture +def na_value(): + return decimal.Decimal("NaN") + + class TestDtype(base.BaseDtypeTests): pass @@ -125,14 +130,14 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - def test_align(self, data): + def test_align(self, data, na_value): a = data[:3] b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) # NaN handling - e1 = pd.Series(type(data)(list(a) + [data._fill_value])) - e2 = pd.Series(type(data)([data._fill_value] + list(b))) + e1 = pd.Series(type(data)(list(a) + [na_value])) + e2 = pd.Series(type(data)([na_value] + list(b))) tm.assert_series_equal(r1.iloc[:3], e1.iloc[:3]) assert r1[3].is_nan() assert e1[3].is_nan() diff --git a/pandas/tests/extension/test_json.py b/pandas/tests/extension/test_json.py index 466b8c191b533..547ed31de8d72 100644 --- a/pandas/tests/extension/test_json.py +++ b/pandas/tests/extension/test_json.py @@ -80,10 +80,10 @@ def nbytes(self): return sys.getsizeof(self.data) def isna(self): - return np.array([x == self._fill_value for x in self.data]) + return np.array([x == self._na_value for x in self.data]) def take(self, indexer, allow_fill=True, fill_value=None): - output = [self.data[loc] if loc != -1 else self._fill_value + output = [self.data[loc] if loc != -1 else self._na_value for loc in indexer] return type(self)(output) @@ -91,7 +91,7 @@ def copy(self, deep=False): return type(self)(self.data[:]) @property - def _fill_value(self): + def _na_value(self): 
return {} @classmethod @@ -124,6 +124,11 @@ def data_missing(): return JSONArray([{}, {'a': 10}]) +@pytest.fixture +def na_value(): + return {} + + @pytest.fixture def na_cmp(): return operator.eq From f4cf45c2f122c4c81e40da7a864dbd69b70512eb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Feb 2018 22:40:09 -0600 Subject: [PATCH 095/119] Push coercion to extension dtype till later --- pandas/core/series.py | 7 ------- pandas/tests/extension/test_decimal.py | 7 ------- 2 files changed, 14 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7e98ff4deb5d7..9e98908f601c8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3152,13 +3152,6 @@ def _sanitize_array(data, index, dtype=None, copy=False, if dtype is not None: dtype = pandas_dtype(dtype) - if is_extension_array_dtype(dtype) and not is_extension_array_dtype(data): - # Just check for any extension dtype data here. We validatate that - # the exact types match later. - raise ValueError("Cannot coerce data to extension dtype '{type}'. " - "Pass the extension array for '{type}' " - "directly instead.".format(type=dtype)) - if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/test_decimal.py index 1758d0ed89d49..dad2bc45c5e08 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/test_decimal.py @@ -170,13 +170,6 @@ def test_value_counts(self, all_data, dropna): tm.assert_series_equal(result, expected) -def test_series_constructor_with_dtype_coercion_raises(): - xpr = ("Cannot coerce data to extension dtype 'decimal'. Pass the " - "extension array for 'decimal' directly instead.") - with tm.assert_raises_regex(ValueError, xpr): - pd.Series([0, 1, 2], dtype=DecimalDtype()) - - def test_series_constructor_with_same_dtype_ok(): arr = DecimalArray([decimal.Decimal('10.0')]) result = pd.Series(arr, dtype=DecimalDtype()) From 9c5d47953184a21049eaa076fabbd74e36c28ce7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Feb 2018 06:03:52 -0600 Subject: [PATCH 096/119] Linting --- pandas/core/arrays/base.py | 1 + pandas/core/dtypes/missing.py | 6 +++--- pandas/tests/extension/base.py | 1 - 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9c7c395ef994f..ed3cb99e8a998 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -57,6 +57,7 @@ class ExtensionArray(object): # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ + def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. 
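As a point of reference, the contract exercised by ``BaseGetitemTests`` is roughly: an integer key returns a scalar of ``dtype.type``, while slices and boolean masks return another array of the same class. A condensed illustration, assuming the ``DecimalArray`` defined in these tests:

.. code-block:: python

    import decimal

    import numpy as np

    arr = DecimalArray([decimal.Decimal('1.0'), decimal.Decimal('2.0')])

    assert isinstance(arr[0], decimal.Decimal)    # scalar key -> scalar
    assert isinstance(arr[:1], DecimalArray)      # slice -> array
    assert isinstance(arr[np.array([True, False])], DecimalArray)  # mask -> array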
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index b4a05a24aabc9..36dbb0ee4b98f 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -137,9 +137,9 @@ def _isna_ndarraylike(obj): values = obj result = values.isna() elif is_interval_dtype(values): - # TODO(IntervalArray): remove this if block - from pandas import IntervalIndex - result = IntervalIndex(obj).isna() + # TODO(IntervalArray): remove this if block + from pandas import IntervalIndex + result = IntervalIndex(obj).isna() elif is_string_dtype(dtype): # Working around NumPy ticket 1542 shape = values.shape diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index 815f4a61dea7e..40d619d1516a7 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -328,4 +328,3 @@ def test_count(self, data_missing): result = df.count(axis='columns') expected = pd.Series([0, 1]) tm.assert_series_equal(result, expected) - From 1175c0dc1abc6af4b62bddfdc32f4237e736f5ad Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Feb 2018 06:19:38 -0600 Subject: [PATCH 097/119] ERR: Better error message for coercion to 3rd party dtypes --- pandas/core/series.py | 9 +++++++++ pandas/tests/extension/test_decimal.py | 9 ++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e98908f601c8..70f7444fae69d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3173,8 +3173,17 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): + # We *do* allow casting to categorical, since we know + # that Categorical is the only array type for 'category'. subarr = Categorical(arr, dtype.categories, ordered=dtype.ordered) + elif is_extension_array_dtype(dtype): + # We don't allow casting to third party dtypes, since we don't + # know what array belongs to which type. + msg = ("Cannot cast data to extension dtype '{}'. " + "Pass the extension array directly.".format(dtype)) + raise ValueError(msg) + elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/test_decimal.py index dad2bc45c5e08..d6ddd09d1f356 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/test_decimal.py @@ -170,6 +170,13 @@ def test_value_counts(self, all_data, dropna): tm.assert_series_equal(result, expected) +def test_series_constructor_coerce_data_to_extension_dtype_raises(): + xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the " + "extension array directly.") + with tm.assert_raises_regex(ValueError, xpr): + pd.Series([0, 1, 2], dtype=DecimalDtype()) + + def test_series_constructor_with_same_dtype_ok(): arr = DecimalArray([decimal.Decimal('10.0')]) result = pd.Series(arr, dtype=DecimalDtype()) @@ -177,7 +184,7 @@ def test_series_constructor_with_same_dtype_ok(): tm.assert_series_equal(result, expected) -def test_series_constructor_with_different_dtype_raises(): +def test_series_constructor_coerce_extension_array_to_dtype_raises(): arr = DecimalArray([decimal.Decimal('10.0')]) xpr = "Cannot specify a dtype 'int64' .* \('decimal'\)." 
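Taken together, the constructor tests above pin down the intended behaviour at the user level. A rough illustration, reusing the ``DecimalArray`` and ``DecimalDtype`` defined in this test module:

.. code-block:: python

    import decimal

    import pandas as pd
    import pandas.util.testing as tm

    arr = DecimalArray([decimal.Decimal('1.0'), decimal.Decimal('2.0')])

    # Passing the extension array itself is fine, with or without its dtype.
    pd.Series(arr)
    pd.Series(arr, dtype=DecimalDtype())

    # Asking the constructor to coerce plain data to a third-party dtype
    # raises, since pandas cannot know which array class backs that dtype.
    with tm.assert_raises_regex(ValueError, "Cannot cast data"):
        pd.Series([0, 1, 2], dtype=DecimalDtype())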
From c816d99b83f05abd37094f720fe0a2b04473df79 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Feb 2018 06:34:13 -0600 Subject: [PATCH 098/119] CLN: Make take_nd EA aware --- pandas/core/algorithms.py | 19 ++++++++++++++----- pandas/core/series.py | 7 +------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 099a1411ebae7..20ca7eaaf2a53 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1291,10 +1291,12 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, """ Specialized Cython take which sets NaN values in one pass + This dispatches to ``take`` defined on ExtensionArrays. + Parameters ---------- - arr : ndarray - Input array + arr : ndarray, ExtensionArray, DatetimeIndex, IntervalIndex, SparseArray + Input array. SparseArray is densified with ``get_values`` indexer : ndarray 1-D array of indices to take, subarrays corresponding to -1 value indicies are filed with fill_value @@ -1314,16 +1316,23 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, If False, indexer is assumed to contain no -1 values so no filling will be done. This short-circuits computation of a mask. Result is undefined if allow_fill == False and -1 is present in indexer. + + Returns + ------- + subarray : object + May be the same type as the input, or cast to an ndarray. """ + # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs # dispatch to internal type takes - if is_categorical(arr): - return arr.take_nd(indexer, fill_value=fill_value, - allow_fill=allow_fill) + if is_extension_array_dtype(arr): + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + elif is_sparse(arr): + arr = arr.get_values() if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) diff --git a/pandas/core/series.py b/pandas/core/series.py index 70f7444fae69d..0a078401fd4e9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2570,12 +2570,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - # be subclass-friendly - if isinstance(self._values, ExtensionArray): - new_values = self._values.take(indexer) - else: - new_values = algorithms.take_1d(self.get_values(), indexer) - + new_values = algorithms.take_1d(self._values, indexer) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): From 9c9f59ec125fa5d7aeda0be63529807daf928a7f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Feb 2018 13:15:52 -0600 Subject: [PATCH 099/119] Revert sparse changes --- pandas/core/algorithms.py | 9 ++++----- pandas/core/series.py | 8 +++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 20ca7eaaf2a53..d22fe1e3bcb47 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -20,7 +20,7 @@ is_period_dtype, is_numeric_dtype, is_float_dtype, is_bool_dtype, needs_i8_conversion, - is_categorical, is_datetimetz, + is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, is_scalar, is_list_like, @@ -1291,12 +1291,13 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, """ Specialized 
Cython take which sets NaN values in one pass - This dispatches to ``take`` defined on ExtensionArrays. + This dispatches to ``take`` defined on ExtensionArrays. It does not + currently dispatch to ``SparseArray.take`` for sparse ``arr``. Parameters ---------- arr : ndarray, ExtensionArray, DatetimeIndex, IntervalIndex, SparseArray - Input array. SparseArray is densified with ``get_values`` + Input array. indexer : ndarray 1-D array of indices to take, subarrays corresponding to -1 value indicies are filed with fill_value @@ -1331,8 +1332,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif is_sparse(arr): - arr = arr.get_values() if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0a078401fd4e9..ea7bf36dc4a0a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,6 +29,7 @@ is_iterator, is_dict_like, is_scalar, + is_sparse, _is_unorderable_exception, _ensure_platform_int, pandas_dtype) @@ -2570,7 +2571,12 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - new_values = algorithms.take_1d(self._values, indexer) + if is_sparse(self): + arr = self.get_values() + else: + arr = self._values + + new_values = algorithms.take_1d(arr, indexer) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): From 08af9a36e8700e7f1aefec4f514a7dde6a04f16c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Feb 2018 14:12:58 -0600 Subject: [PATCH 100/119] Other _typ for ABCExtensionArray --- pandas/core/arrays/base.py | 6 ++++-- pandas/core/dtypes/generic.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ed3cb99e8a998..887326625e2ad 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -53,7 +53,9 @@ class ExtensionArray(object): the class, i.e. ``ExtensionArray(extension_array)`` should return an instance, not error. """ - _typ = 'extension' # For pandas.core.dtypes.generic.ABCExtensionArray + # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. + # Don't override this. 
+ _typ = 'extension' # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ @@ -267,7 +269,7 @@ def take(self, indexer, allow_fill=True, fill_value=None): def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 result = self.data.take(indexer) - result[mask] = self._fill_value # NA for this type + result[mask] = np.nan # NA for this type return type(self)(result) See Also diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index b841322bf93e1..cb54c94d29205 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -58,7 +58,7 @@ def _check(cls, inst): ("dateoffset",)) ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", )) ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ", - ("extension",)) + ("extension", "categorical",)) class _ABCGeneric(type): From 2e992f7fe6780db48054c9b425448d95138174a9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Feb 2018 14:14:06 -0600 Subject: [PATCH 101/119] Test cleanup and expansion. Tests for concating and aligning frames --- pandas/tests/extension/base.py | 67 ++++++++++++++++++++-- pandas/tests/extension/conftest.py | 3 +- pandas/tests/extension/test_categorical.py | 19 ++---- pandas/tests/extension/test_decimal.py | 25 +++++++- 4 files changed, 92 insertions(+), 22 deletions(-) diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index 40d619d1516a7..4e69123aa3cbc 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -1,3 +1,37 @@ +"""Base test suite for extension arrays. + +These tests are intended for third-party libraries to subclass to validate +that their extension arrays and dtypes satisfy the interface. Moving or +renaming the tests should not be done lightly. + +Libraries are expected to implement a few pytest fixtures to provide data +for the tests. The fixtures may be located in either + +* The same module as your test class. +* A ``conftest.py`` in the same directory as your test class. + +The full list of fixtures may be found in the ``conftest.py`` next to this +file. + +.. code-block:: python + + import pytest + from pandas.tests.extension.base import BaseDtypeTests + + + @pytest.fixture + def dtype(): + return MyDtype() + + + class TestMyDtype(BaseDtypeTests): + pass + + +Your class ``TestDtype`` will inherit all the tests defined on +``BaseDtypeTests``. pytest's fixture discover will supply your ``dtype`` +wherever the test requires it. You're free to implement additional tests. 
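Most of the other test classes also rely on the data fixtures declared in
``conftest.py`` next to this file. A sketch of those fixtures for a
hypothetical decimal-backed array (``DecimalDtype`` and ``DecimalArray``
stand in for your own classes):

.. code-block:: python

    import decimal

    import pytest


    @pytest.fixture
    def dtype():
        return DecimalDtype()


    @pytest.fixture
    def data():
        # Length-100 array of valid values for the "happy path" tests.
        return DecimalArray([decimal.Decimal(i) for i in range(100)])


    @pytest.fixture
    def data_missing():
        # Length-2 array with [NA, valid].
        return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)])


    @pytest.fixture
    def na_cmp():
        # How two scalar NA values of this type should be compared.
        return lambda x, y: x.is_nan() and y.is_nan()


    @pytest.fixture
    def na_value():
        # The scalar missing value for this type.
        return decimal.Decimal('NaN')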
+""" import numpy as np import pytest @@ -130,13 +164,21 @@ def test_series_given_mismatched_index_raises(self, data): class BaseReshapingTests(object): """Tests for reshaping and concatenation.""" - def test_concat(self, data): - result = pd.concat([ - pd.Series(data), - pd.Series(data), - ], ignore_index=True) + @pytest.mark.parametrize('in_frame', [True, False]) + def test_concat(self, data, in_frame): + wrapped = pd.Series(data) + if in_frame: + wrapped = pd.DataFrame(wrapped) + result = pd.concat([wrapped, wrapped], ignore_index=True) + assert len(result) == len(data) * 2 - assert result.dtype == data.dtype + + if in_frame: + dtype = result.dtypes[0] + else: + dtype = result.dtype + + assert dtype == data.dtype assert isinstance(result._data.blocks[0], ExtensionBlock) def test_align(self, data, na_value): @@ -150,6 +192,19 @@ def test_align(self, data, na_value): tm.assert_series_equal(r1, e1) tm.assert_series_equal(r2, e2) + def test_align_frame(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({'A': a}).align( + pd.DataFrame({'A': b}, index=[1, 2, 3]) + ) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) + e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) + tm.assert_frame_equal(r1, e1) + tm.assert_frame_equal(r2, e2) + class BaseGetitemTests(object): """Tests for ExtensionArray.__getitem__.""" diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 3e57dde385f6c..9664d237afcb6 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -23,6 +23,7 @@ def data_missing(): @pytest.fixture(params=['data', 'data_missing']) def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" if request.param == 'data': return data elif request.param == 'data_missing': @@ -43,5 +44,5 @@ def na_cmp(): @pytest.fixture def na_value(self): - """The scalar missing value for this type.""" + """The scalar missing value for this type. Default 'None'""" return None diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index af56afed07f4d..c1732653e993a 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -3,8 +3,6 @@ import pytest import numpy as np -import pandas as pd -import pandas.util.testing as tm from pandas.api.types import CategoricalDtype from pandas import Categorical from . 
import base @@ -52,18 +50,13 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): + @pytest.mark.skip(reason="Unobserved categories preseved in concat.") def test_align(self, data, na_value): - # Override to pass through dtype - a = data[:3] - b = data[2:5] - r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + pass - e1 = pd.Series(type(data)(list(a) + [na_value], - dtype=data.dtype)) - e2 = pd.Series(type(data)([na_value] + list(b), - dtype=data.dtype)) - tm.assert_series_equal(r1, e1) - tm.assert_series_equal(r2, e2) + @pytest.mark.skip(reason="Unobserved categories preseved in concat.") + def test_align_frame(self, data, na_value): + pass class TestGetitem(base.BaseGetitemTests): @@ -82,6 +75,6 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): pass - @pytest.mark.skip(reason="Different value_counts semantics.") + @pytest.mark.skip(reason="Unobserved categories included") def test_value_counts(self, all_data, dropna): pass diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/test_decimal.py index d6ddd09d1f356..cb0e96b59b919 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/test_decimal.py @@ -38,8 +38,6 @@ def __init__(self, values): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.values[item] - elif isinstance(item, np.ndarray) and item.dtype == 'bool': - return type(self)([x for x, m in zip(self, item) if m]) else: return type(self)(self.values[item]) @@ -130,7 +128,10 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): + def test_align(self, data, na_value): + # Have to override since assert_series_equal doesn't + # compare Decimal(NaN) properly. 
a = data[:3] b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) @@ -146,6 +147,26 @@ def test_align(self, data, na_value): assert r2[0].is_nan() assert e2[0].is_nan() + def test_align_frame(self, data, na_value): + # Override for Decimal(NaN) comparison + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({'A': a}).align( + pd.DataFrame({'A': b}, index=[1, 2, 3]) + ) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) + e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) + + tm.assert_frame_equal(r1.iloc[:3], e1.iloc[:3]) + assert r1.loc[3, 'A'].is_nan() + assert e1.loc[3, 'A'].is_nan() + + tm.assert_frame_equal(r2.iloc[1:], e2.iloc[1:]) + assert r2.loc[0, 'A'].is_nan() + assert e2.loc[0, 'A'].is_nan() + class TestGetitem(base.BaseGetitemTests): pass From cc5cc3e2e2d8e9c98a78752fd4f83e54f327c708 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Feb 2018 14:14:31 -0600 Subject: [PATCH 102/119] Copy if copy --- pandas/core/series.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index ea7bf36dc4a0a..47ea5743d2a52 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3134,6 +3134,8 @@ def _sanitize_index(data, index, copy=False): pass elif isinstance(data, (PeriodIndex, DatetimeIndex)): data = data._values + if copy: + data = data.copy() elif isinstance(data, np.ndarray): From 704ee67378a747fd454ba8e751b5a5802c36b520 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Feb 2018 19:26:48 -0600 Subject: [PATCH 103/119] TST: remove self param for fixture --- pandas/tests/extension/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 9664d237afcb6..f86849b9cbd61 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -43,6 +43,6 @@ def na_cmp(): @pytest.fixture -def na_value(self): +def na_value(): """The scalar missing value for this type. 
Default 'None'""" return None From 8bf0334eaafd949756b52711a60961e6adf5e57f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Feb 2018 09:15:42 -0600 Subject: [PATCH 104/119] Remove unnescessary EA handling in Series ctor --- pandas/core/series.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index d18cd4baa297b..ec4d914f10b0c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -243,10 +243,6 @@ def __init__(self, data=None, index=None, dtype=None, name=None, copy=copy) elif copy: data = data.copy() - elif isinstance(data, ExtensionArray): - if copy: - data = data.copy() - data = SingleBlockManager(data, index, fastpath=True) else: data = _sanitize_array(data, index, dtype, copy, raise_cast_failure=True) From c8d88da0d68ee70ea7f68462e8e29da4a7c22d1e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Feb 2018 09:24:48 -0600 Subject: [PATCH 105/119] API: Removed value_counts Moved setitem notes to comment --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/base.py | 60 ++++++++++---------------- pandas/tests/extension/test_decimal.py | 3 +- 3 files changed, 26 insertions(+), 39 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d22fe1e3bcb47..2ea714fa2738c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -543,7 +543,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, else: - if is_extension_array_dtype(values) or is_sparse(values): + if is_categorical_dtype(values) or is_sparse(values): # handle Categorical and sparse, result = Series(values)._values.value_counts(dropna=dropna) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 887326625e2ad..0df256fa0cddd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -95,9 +95,12 @@ def __setitem__(self, key, value): # type: (Union[int, np.ndarray], Any) -> None """Set one or more values inplace. + This method is not required to satisfy the pandas extension array + interface. + Parameters ---------- - key : int or ndarray + key : int, ndarray, or slice When called from, e.g. ``Series.__setitem__``, ``key`` will be one of @@ -109,24 +112,26 @@ def __setitem__(self, key, value): value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object value or values to be set of ``key``. - Notes - ----- - This method is not required to satisfy the interface. If an - ExtensionArray chooses to implement __setitem__, then some semantics - should be observed: - - * Setting multiple values : ExtensionArrays should support setting - multiple values at once, ``key`` will be a sequence of integers and - ``value`` will be a same-length sequence. - - * Broadcasting : For a sequence ``key`` and a scalar ``value``, - each position in ``key`` should be set to ``value``. - - * Coercion : Most users will expect basic coercion to work. For - example, a string like ``'2018-01-01'`` is coerced to a datetime - when setting on a datetime64ns array. In general, if the - ``__init__`` method coerces that value, then so should ``__setitem__``. + Returns + ------- + None """ + # Some notes to the ExtensionArray implementor who may have ended up + # here. 
While this method is not required for the interface, if you + # *do* choose to implement __setitem__, then some semantics should be + # observed: + # + # * Setting multiple values : ExtensionArrays should support setting + # multiple values at once, 'key' will be a sequence of integers and + # 'value' will be a same-length sequence. + # + # * Broadcasting : For a sequence 'key' and a scalar 'value', + # each position in 'key' should be set to 'value'. + # + # * Coercion : Most users will expect basic coercion to work. For + # example, a string like '2018-01-01' is coerced to a datetime + # when setting on a datetime64ns array. In general, if the + # __init__ method coerces that value, then so should __setitem__ raise NotImplementedError(_not_implemented_message.format( type(self), '__setitem__') ) @@ -211,25 +216,6 @@ def isna(self): """ raise AbstractMethodError(self) - def value_counts(self, dropna=True): - """Compute a histogram of the counts of non-null values. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN - - Returns - ------- - value_counts : Series - """ - from pandas import value_counts - - if dropna: - self = self[~self.isna()] - - return value_counts(np.array(self)) - # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/test_decimal.py index cb0e96b59b919..9cd19840bb274 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/test_decimal.py @@ -177,7 +177,8 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - @pytest.mark.xfail(reason="NaN Sorting") + @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.xfail(reason="value_counts not implemented yet.") def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: From 24f3b6095b9da19e0a56f93e12fed2d1909c8527 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Feb 2018 09:41:41 -0600 Subject: [PATCH 106/119] More doc notes --- pandas/core/arrays/base.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0df256fa0cddd..b89080349058c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -147,12 +147,12 @@ def __len__(self): raise AbstractMethodError(self) def __iter__(self): - """Iterate over elements. + """Iterate over elements of the array. - This needs to be implemented so that pandas recognizes extension arrays - as list-like. The default implementation makes successive calls to - ``__getitem__``, which may be slower than necessary. """ + # This needs to be implemented so that pandas recognizes extension arrays + # as list-like. The default implementation makes successive calls to + # ``__getitem__``, which may be slower than necessary. for i in range(len(self)): yield self[i] @@ -181,9 +181,9 @@ def nbytes(self): # type: () -> int """The number of bytes needed to store this object in memory. - If this is expensive to compute, return an approximate lower bound - on the number of bytes needed. """ + # If this is expensive to compute, return an approximate lower bound + # on the number of bytes needed. 
raise AbstractMethodError(self) # ------------------------------------------------------------------------ From 50bd5dd89ee586847f317a773742730c3320ed5e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Feb 2018 10:08:37 -0600 Subject: [PATCH 107/119] Handle expanding a DataFrame with an EA --- pandas/core/internals.py | 8 ++++++-- pandas/tests/extension/base.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1dccbf2e7ff96..bad0626206e80 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -59,6 +59,7 @@ from pandas.core.dtypes.generic import ( ABCSeries, ABCDatetimeIndex, + ABCExtensionArray, ABCIndexClass) import pandas.core.common as com import pandas.core.algorithms as algos @@ -4141,7 +4142,10 @@ def set(self, item, value, check=False): # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical - value_is_extension_type = is_extension_type(value) + # TODO(EA): Remove an is_extension_ when all extension types satisfy + # the interface + value_is_extension_type = (is_extension_type(value) or + is_extension_array_dtype(value)) # categorical/spares/datetimetz if value_is_extension_type: @@ -5198,7 +5202,7 @@ def _safe_reshape(arr, new_shape): """ if isinstance(arr, ABCSeries): arr = arr._values - if not isinstance(arr, Categorical): + if not isinstance(arr, ABCExtensionArray): arr = arr.reshape(new_shape) return arr diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index 4e69123aa3cbc..a7d80e870d550 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -205,6 +205,18 @@ def test_align_frame(self, data, na_value): tm.assert_frame_equal(r1, e1) tm.assert_frame_equal(r2, e2) + def test_set_frame_expand_regular_with_extension(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + df['B'] = data + expected = pd.DataFrame({"A": [1] * len(data), "B": data}) + tm.assert_frame_equal(df, expected) + + def test_set_frame_expand_extension_with_regular(self, data): + df = pd.DataFrame({'A': data}) + df['B'] = [1] * len(data) + expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) + tm.assert_frame_equal(df, expected) + class BaseGetitemTests(object): """Tests for ExtensionArray.__getitem__.""" From 879bc8425a8f45ed5cdbc8403455b8fd6bd7f25c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Feb 2018 11:38:56 -0600 Subject: [PATCH 108/119] Added ExtensionDtype.__eq__ Support for astype --- pandas/core/dtypes/base.py | 27 ++++++++++++++++++++++ pandas/core/dtypes/dtypes.py | 7 ------ pandas/tests/extension/base.py | 23 +++++++++++++++++- pandas/tests/extension/test_categorical.py | 5 ++++ pandas/tests/extension/test_decimal.py | 4 ++++ pandas/tests/extension/test_json.py | 5 ++++ 6 files changed, 63 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 17d375e67808b..65e8cb5dd00a7 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,6 +1,7 @@ """Extend pandas with custom array types""" import inspect +from pandas import compat from pandas.errors import AbstractMethodError @@ -25,6 +26,32 @@ class ExtensionDtype(object): def __str__(self): return self.name + def __eq__(self, other): + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + + * it's a string matching 'self.name'. + * it's an instance of this type. 
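        For instance, for a hypothetical subclass named ``MyDtype`` whose
        ``name`` is ``'my_dtype'`` and which does not override ``__eq__``:

        .. code-block:: python

            dtype = MyDtype()

            dtype == 'my_dtype'           # True: the string matches dtype.name
            dtype == MyDtype()            # True: an instance of the same type
            dtype == 'my_dtype-suffix'    # False
            dtype == np.dtype('object')   # False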
+ + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, compat.string_types): + return other == self.name + elif isinstance(other, type(self)): + return True + else: + return False + + def __ne__(self, other): + return not self.__eq__(other) + @property def type(self): # type: () -> type diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 99e4033f104db..a972cb942c620 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -66,13 +66,6 @@ def __hash__(self): raise NotImplementedError("sub-classes should implement an __hash__ " "method") - def __eq__(self, other): - raise NotImplementedError("sub-classes should implement an __eq__ " - "method") - - def __ne__(self, other): - return not self.__eq__(other) - def __getstate__(self): # pickle support; we don't want to pickle the cache return {k: getattr(self, k, None) for k in self._metadata} diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index a7d80e870d550..fbf084617f252 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -38,7 +38,7 @@ class TestMyDtype(BaseDtypeTests): import pandas as pd import pandas.util.testing as tm from pandas.compat import StringIO -from pandas.core.internals import ExtensionBlock +from pandas.core.internals import ExtensionBlock, ObjectBlock from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -76,6 +76,13 @@ def test_is_not_string_type(self, dtype): def test_is_not_object_type(self, dtype): return not pd.api.types.is_object_dtype(dtype) + def test_eq_with_str(self, dtype): + assert dtype == dtype.name + assert dtype != dtype.name + '-suffix' + + def test_eq_with_numpy_object(self, dtype): + assert dtype != np.dtype('object') + class BaseInterfaceTests(object): """Tests that the basic interface is satisfied.""" @@ -395,3 +402,17 @@ def test_count(self, data_missing): result = df.count(axis='columns') expected = pd.Series([0, 1]) tm.assert_series_equal(result, expected) + + def test_apply_simple_series(self, data): + result = pd.Series(data).apply(id) + assert isinstance(result, pd.Series) + + +class BaseCastingTests(object): + """Casting to and from ExtensionDtypes""" + + def test_astype_object_series(self, all_data): + ser = pd.Series({"A": all_data}) + result = ser.astype(object) + assert isinstance(result._data.blocks[0], ObjectBlock) + diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index c1732653e993a..a7d0c02a5b2a2 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -78,3 +78,8 @@ class TestMethods(base.BaseMethodsTests): @pytest.mark.skip(reason="Unobserved categories included") def test_value_counts(self, all_data, dropna): pass + + +class TestCasting(base.BaseCastingTests): + pass + diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/test_decimal.py index 9cd19840bb274..46ca08f94c72b 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/test_decimal.py @@ -192,6 +192,10 @@ def test_value_counts(self, all_data, dropna): tm.assert_series_equal(result, expected) +class TestCasting(base.BaseCastingTests): + pass + + def test_series_constructor_coerce_data_to_extension_dtype_raises(): xpr = ("Cannot cast data to extension dtype 'decimal'. 
Pass the " "extension array directly.") diff --git a/pandas/tests/extension/test_json.py b/pandas/tests/extension/test_json.py index 547ed31de8d72..a662465425045 100644 --- a/pandas/tests/extension/test_json.py +++ b/pandas/tests/extension/test_json.py @@ -162,3 +162,8 @@ class TestMethods(base.BaseMethodsTests): @pytest.mark.skip(reason="Unhashable") def test_value_counts(self, all_data, dropna): pass + + +class TestCasting(base.BaseCastingTests): + pass + From 33c9d1f1142e91f23fc01fb4ce0ddeb7e4461e88 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Feb 2018 06:58:23 -0600 Subject: [PATCH 109/119] linting --- pandas/core/arrays/base.py | 6 +++--- pandas/tests/extension/base.py | 1 - pandas/tests/extension/test_categorical.py | 1 - pandas/tests/extension/test_json.py | 1 - 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b89080349058c..cec881394a021 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -150,9 +150,9 @@ def __iter__(self): """Iterate over elements of the array. """ - # This needs to be implemented so that pandas recognizes extension arrays - # as list-like. The default implementation makes successive calls to - # ``__getitem__``, which may be slower than necessary. + # This needs to be implemented so that pandas recognizes extension + # arrays as list-like. The default implementation makes successive + # calls to ``__getitem__``, which may be slower than necessary. for i in range(len(self)): yield self[i] diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index fbf084617f252..deaa48a6f6a26 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -415,4 +415,3 @@ def test_astype_object_series(self, all_data): ser = pd.Series({"A": all_data}) result = ser.astype(object) assert isinstance(result._data.blocks[0], ObjectBlock) - diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index a7d0c02a5b2a2..fbec835a72ce8 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -82,4 +82,3 @@ def test_value_counts(self, all_data, dropna): class TestCasting(base.BaseCastingTests): pass - diff --git a/pandas/tests/extension/test_json.py b/pandas/tests/extension/test_json.py index a662465425045..ecaa36b6db9c9 100644 --- a/pandas/tests/extension/test_json.py +++ b/pandas/tests/extension/test_json.py @@ -166,4 +166,3 @@ def test_value_counts(self, all_data, dropna): class TestCasting(base.BaseCastingTests): pass - From f07c166e0447dd927d09f624b05724f7f8914ff3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Feb 2018 09:07:36 -0600 Subject: [PATCH 110/119] REF: is_dtype_equal refactor Moved from PandasExtensionDtype to ExtensionDtype with one modification: catch TypeError explicitly. 
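After the move, the shared implementation accepts a dtype instance, a registered string name, or any object carrying a ``.dtype`` attribute. A rough illustration, using CategoricalDtype as an example of a dtype that picks up the shared logic:

.. code-block:: python

    import numpy as np
    import pandas as pd
    from pandas.api.types import CategoricalDtype

    cat = pd.Series(['a', 'b', 'a'], dtype='category')

    CategoricalDtype.is_dtype(CategoricalDtype())   # True: instance of cls
    CategoricalDtype.is_dtype('category')           # True: construct_from_string
    CategoricalDtype.is_dtype(cat)                  # True: unboxes cat.dtype
    CategoricalDtype.is_dtype('not-a-dtype')        # False: TypeError is caught
    CategoricalDtype.is_dtype(np.dtype('float64'))  # False: plain NumPy dtype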
--- pandas/core/dtypes/base.py | 29 +++++++++++++++++------------ pandas/core/dtypes/dtypes.py | 18 ------------------ pandas/tests/extension/base.py | 3 +++ 3 files changed, 20 insertions(+), 30 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 65e8cb5dd00a7..a1f20870e1124 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,5 +1,5 @@ """Extend pandas with custom array types""" -import inspect +import numpy as np from pandas import compat from pandas.errors import AbstractMethodError @@ -131,12 +131,12 @@ def construct_from_string(cls, string): @classmethod def is_dtype(cls, dtype): - """Check if we match 'dtype' + """Check if we match 'dtype'. Parameters ---------- dtype : str, object, or type - The dtype to check. + The object to check. Returns ------- @@ -150,13 +150,18 @@ def is_dtype(cls, dtype): of ``cls``. 2. ``dtype`` is an object and is an instance of ``cls`` 3. ``dtype`` is a class and is ``cls`` or a subclass of ``cls``. + 4. ``dtype`` has a ``dtype`` attribute, and any of the above + conditions is true for ``dtype.dtype``. """ - if isinstance(dtype, str): - try: - return isinstance(cls.construct_from_string(dtype), cls) - except TypeError: - return False - elif inspect.isclass(dtype): - return issubclass(dtype, cls) - else: - return isinstance(dtype, cls) + dtype = getattr(dtype, 'dtype', dtype) + + if isinstance(dtype, np.dtype): + return False + elif dtype is None: + return False + elif isinstance(dtype, cls): + return True + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a972cb942c620..d262a71933915 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -75,24 +75,6 @@ def reset_cache(cls): """ clear the cache """ cls._cache = {} - @classmethod - def is_dtype(cls, dtype): - """ Return a boolean if the passed type is an actual dtype that - we can match (via string or type) - """ - if hasattr(dtype, 'dtype'): - dtype = dtype.dtype - if isinstance(dtype, np.dtype): - return False - elif dtype is None: - return False - elif isinstance(dtype, cls): - return True - try: - return cls.construct_from_string(dtype) is not None - except: - return False - class CategoricalDtypeType(type): """ diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py index deaa48a6f6a26..6ec93734a2787 100644 --- a/pandas/tests/extension/base.py +++ b/pandas/tests/extension/base.py @@ -66,6 +66,9 @@ def test_is_dtype_from_name(self, dtype): result = type(dtype).is_dtype(dtype.name) assert result is True + def test_is_dtype_unboxes_dtype(self, data, dtype): + assert dtype.is_dtype(data) is True + def test_is_dtype_from_self(self, dtype): result = type(dtype).is_dtype(dtype) assert result is True From 79d43b1aafb3e0ec67bbf0a5c9d6d6312a76d14b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Feb 2018 09:24:13 -0600 Subject: [PATCH 111/119] Remove reference to dtype being a class --- pandas/core/dtypes/base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index a1f20870e1124..d54d980d02ffa 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -135,7 +135,7 @@ def is_dtype(cls, dtype): Parameters ---------- - dtype : str, object, or type + dtype : object The object to check. Returns @@ -149,8 +149,7 @@ def is_dtype(cls, dtype): 1. 
``cls.construct_from_string(dtype)`` is an instance of ``cls``. 2. ``dtype`` is an object and is an instance of ``cls`` - 3. ``dtype`` is a class and is ``cls`` or a subclass of ``cls``. - 4. ``dtype`` has a ``dtype`` attribute, and any of the above + 3. ``dtype`` has a ``dtype`` attribute, and any of the above conditions is true for ``dtype.dtype``. """ dtype = getattr(dtype, 'dtype', dtype) From a1ebf5301e4dc12d77a8a76f78f6cea117fa1917 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Feb 2018 19:55:07 -0600 Subject: [PATCH 112/119] move --- pandas/tests/{internals => extension}/test_external_block.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/{internals => extension}/test_external_block.py (100%) diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/extension/test_external_block.py similarity index 100% rename from pandas/tests/internals/test_external_block.py rename to pandas/tests/extension/test_external_block.py From aa57cad421cb96a559318ae72ef53eadd2d818ef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Feb 2018 08:27:20 -0600 Subject: [PATCH 113/119] Moved sparse check to take_nd --- pandas/core/algorithms.py | 3 +++ pandas/core/series.py | 7 +------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a545bb1005405..c175fe4c9ebff 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1335,6 +1335,9 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, elif is_interval_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + if is_sparse(arr): + arr = arr.get_values() + if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() diff --git a/pandas/core/series.py b/pandas/core/series.py index ec4d914f10b0c..c9df1d60895d2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2565,12 +2565,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - if is_sparse(self): - arr = self.get_values() - else: - arr = self._values - - new_values = algorithms.take_1d(arr, indexer) + new_values = algorithms.take_1d(self._values, indexer) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): From c82748cb51461460a19ce6de6c3fb3424fce5cc4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Feb 2018 08:30:54 -0600 Subject: [PATCH 114/119] Docstring --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c175fe4c9ebff..d616e3f92aa4d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1298,7 +1298,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, Parameters ---------- - arr : ndarray, ExtensionArray, DatetimeIndex, IntervalIndex, SparseArray + arr : array-like Input array. indexer : ndarray 1-D array of indices to take, subarrays corresponding to -1 value @@ -1322,7 +1322,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, Returns ------- - subarray : object + subarray : array-like May be the same type as the input, or cast to an ndarray. 
""" From e91934364cb2eb161ea6bf3623fcb49a673c3b7e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Feb 2018 08:51:05 -0600 Subject: [PATCH 115/119] Split tests --- pandas/tests/extension/base.py | 420 ------------------ pandas/tests/extension/base/__init__.py | 42 ++ pandas/tests/extension/base/casting.py | 11 + pandas/tests/extension/base/constructors.py | 43 ++ pandas/tests/extension/base/dtype.py | 46 ++ pandas/tests/extension/base/getitem.py | 119 +++++ pandas/tests/extension/base/interface.py | 53 +++ pandas/tests/extension/base/methods.py | 32 ++ pandas/tests/extension/base/missing.py | 45 ++ pandas/tests/extension/base/reshaping.py | 61 +++ pandas/tests/extension/category/__init__.py | 0 .../{ => category}/test_categorical.py | 2 +- pandas/tests/extension/decimal/__init__.py | 0 .../tests/extension/decimal/decimal_array.py | 86 ++++ .../extension/{ => decimal}/test_decimal.py | 84 +--- pandas/tests/extension/json/__init__.py | 0 .../{test_json.py => json/json_array.py} | 69 --- pandas/tests/extension/json/test_json.py | 73 +++ 18 files changed, 614 insertions(+), 572 deletions(-) delete mode 100644 pandas/tests/extension/base.py create mode 100644 pandas/tests/extension/base/__init__.py create mode 100644 pandas/tests/extension/base/casting.py create mode 100644 pandas/tests/extension/base/constructors.py create mode 100644 pandas/tests/extension/base/dtype.py create mode 100644 pandas/tests/extension/base/getitem.py create mode 100644 pandas/tests/extension/base/interface.py create mode 100644 pandas/tests/extension/base/methods.py create mode 100644 pandas/tests/extension/base/missing.py create mode 100644 pandas/tests/extension/base/reshaping.py create mode 100644 pandas/tests/extension/category/__init__.py rename pandas/tests/extension/{ => category}/test_categorical.py (97%) create mode 100644 pandas/tests/extension/decimal/__init__.py create mode 100644 pandas/tests/extension/decimal/decimal_array.py rename pandas/tests/extension/{ => decimal}/test_decimal.py (66%) create mode 100644 pandas/tests/extension/json/__init__.py rename pandas/tests/extension/{test_json.py => json/json_array.py} (72%) create mode 100644 pandas/tests/extension/json/test_json.py diff --git a/pandas/tests/extension/base.py b/pandas/tests/extension/base.py deleted file mode 100644 index 6ec93734a2787..0000000000000 --- a/pandas/tests/extension/base.py +++ /dev/null @@ -1,420 +0,0 @@ -"""Base test suite for extension arrays. - -These tests are intended for third-party libraries to subclass to validate -that their extension arrays and dtypes satisfy the interface. Moving or -renaming the tests should not be done lightly. - -Libraries are expected to implement a few pytest fixtures to provide data -for the tests. The fixtures may be located in either - -* The same module as your test class. -* A ``conftest.py`` in the same directory as your test class. - -The full list of fixtures may be found in the ``conftest.py`` next to this -file. - -.. code-block:: python - - import pytest - from pandas.tests.extension.base import BaseDtypeTests - - - @pytest.fixture - def dtype(): - return MyDtype() - - - class TestMyDtype(BaseDtypeTests): - pass - - -Your class ``TestDtype`` will inherit all the tests defined on -``BaseDtypeTests``. pytest's fixture discover will supply your ``dtype`` -wherever the test requires it. You're free to implement additional tests. 
-""" -import numpy as np -import pytest - -import pandas as pd -import pandas.util.testing as tm -from pandas.compat import StringIO -from pandas.core.internals import ExtensionBlock, ObjectBlock -from pandas.core.dtypes.common import is_extension_array_dtype -from pandas.core.dtypes.dtypes import ExtensionDtype - - -class BaseDtypeTests(object): - """Base class for ExtensionDtype classes""" - - def test_name(self, dtype): - assert isinstance(dtype.name, str) - - def test_kind(self, dtype): - valid = set('biufcmMOSUV') - if dtype.kind is not None: - assert dtype.kind in valid - - def test_construct_from_string_own_name(self, dtype): - result = dtype.construct_from_string(dtype.name) - assert type(result) is type(dtype) - - # check OK as classmethod - result = type(dtype).construct_from_string(dtype.name) - assert type(result) is type(dtype) - - def test_is_dtype_from_name(self, dtype): - result = type(dtype).is_dtype(dtype.name) - assert result is True - - def test_is_dtype_unboxes_dtype(self, data, dtype): - assert dtype.is_dtype(data) is True - - def test_is_dtype_from_self(self, dtype): - result = type(dtype).is_dtype(dtype) - assert result is True - - def test_is_not_string_type(self, dtype): - return not pd.api.types.is_string_dtype(dtype) - - def test_is_not_object_type(self, dtype): - return not pd.api.types.is_object_dtype(dtype) - - def test_eq_with_str(self, dtype): - assert dtype == dtype.name - assert dtype != dtype.name + '-suffix' - - def test_eq_with_numpy_object(self, dtype): - assert dtype != np.dtype('object') - - -class BaseInterfaceTests(object): - """Tests that the basic interface is satisfied.""" - # ------------------------------------------------------------------------ - # Interface - # ------------------------------------------------------------------------ - - def test_len(self, data): - assert len(data) == 100 - - def test_ndim(self, data): - assert data.ndim == 1 - - def test_can_hold_na_valid(self, data): - assert data._can_hold_na in {True, False} - - def test_memory_usage(self, data): - s = pd.Series(data) - result = s.memory_usage(index=False) - assert result == s.nbytes - - def test_array_interface(self, data): - result = np.array(data) - assert result[0] == data[0] - - def test_as_ndarray_with_dtype_kind(self, data): - np.array(data, dtype=data.dtype.kind) - - def test_repr(self, data): - ser = pd.Series(data) - assert data.dtype.name in repr(ser) - - df = pd.DataFrame({"A": data}) - repr(df) - - def test_dtype_name_in_info(self, data): - buf = StringIO() - pd.DataFrame({"A": data}).info(buf=buf) - result = buf.getvalue() - assert data.dtype.name in result - - def test_is_extension_array_dtype(self, data): - assert is_extension_array_dtype(data) - assert is_extension_array_dtype(data.dtype) - assert is_extension_array_dtype(pd.Series(data)) - assert isinstance(data.dtype, ExtensionDtype) - - -class BaseConstructorsTests(object): - - def test_series_constructor(self, data): - result = pd.Series(data) - assert result.dtype == data.dtype - assert len(result) == len(data) - assert isinstance(result._data.blocks[0], ExtensionBlock) - assert result._data.blocks[0].values is data - - # Series[EA] is unboxed / boxed correctly - result2 = pd.Series(result) - assert result2.dtype == data.dtype - assert isinstance(result2._data.blocks[0], ExtensionBlock) - - @pytest.mark.parametrize("from_series", [True, False]) - def test_dataframe_constructor_from_dict(self, data, from_series): - if from_series: - data = pd.Series(data) - result = pd.DataFrame({"A": data}) - 
assert result.dtypes['A'] == data.dtype - assert result.shape == (len(data), 1) - assert isinstance(result._data.blocks[0], ExtensionBlock) - - def test_dataframe_from_series(self, data): - result = pd.DataFrame(pd.Series(data)) - assert result.dtypes[0] == data.dtype - assert result.shape == (len(data), 1) - assert isinstance(result._data.blocks[0], ExtensionBlock) - - @pytest.mark.xfail(reason="GH-19342") - def test_series_given_mismatched_index_raises(self, data): - msg = 'Wrong number of items passed 3, placement implies 4' - with tm.assert_raises_regex(ValueError, None) as m: - pd.Series(data[:3], index=[0, 1, 2, 3, 4]) - - assert m.match(msg) - - -class BaseReshapingTests(object): - """Tests for reshaping and concatenation.""" - @pytest.mark.parametrize('in_frame', [True, False]) - def test_concat(self, data, in_frame): - wrapped = pd.Series(data) - if in_frame: - wrapped = pd.DataFrame(wrapped) - result = pd.concat([wrapped, wrapped], ignore_index=True) - - assert len(result) == len(data) * 2 - - if in_frame: - dtype = result.dtypes[0] - else: - dtype = result.dtype - - assert dtype == data.dtype - assert isinstance(result._data.blocks[0], ExtensionBlock) - - def test_align(self, data, na_value): - a = data[:3] - b = data[2:5] - r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) - - # Assumes that the ctor can take a list of scalars of the type - e1 = pd.Series(type(data)(list(a) + [na_value])) - e2 = pd.Series(type(data)([na_value] + list(b))) - tm.assert_series_equal(r1, e1) - tm.assert_series_equal(r2, e2) - - def test_align_frame(self, data, na_value): - a = data[:3] - b = data[2:5] - r1, r2 = pd.DataFrame({'A': a}).align( - pd.DataFrame({'A': b}, index=[1, 2, 3]) - ) - - # Assumes that the ctor can take a list of scalars of the type - e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) - e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) - tm.assert_frame_equal(r1, e1) - tm.assert_frame_equal(r2, e2) - - def test_set_frame_expand_regular_with_extension(self, data): - df = pd.DataFrame({"A": [1] * len(data)}) - df['B'] = data - expected = pd.DataFrame({"A": [1] * len(data), "B": data}) - tm.assert_frame_equal(df, expected) - - def test_set_frame_expand_extension_with_regular(self, data): - df = pd.DataFrame({'A': data}) - df['B'] = [1] * len(data) - expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) - tm.assert_frame_equal(df, expected) - - -class BaseGetitemTests(object): - """Tests for ExtensionArray.__getitem__.""" - - def test_iloc_series(self, data): - ser = pd.Series(data) - result = ser.iloc[:4] - expected = pd.Series(data[:4]) - tm.assert_series_equal(result, expected) - - result = ser.iloc[[0, 1, 2, 3]] - tm.assert_series_equal(result, expected) - - def test_iloc_frame(self, data): - df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) - expected = pd.DataFrame({"A": data[:4]}) - - # slice -> frame - result = df.iloc[:4, [0]] - tm.assert_frame_equal(result, expected) - - # sequence -> frame - result = df.iloc[[0, 1, 2, 3], [0]] - tm.assert_frame_equal(result, expected) - - expected = pd.Series(data[:4], name='A') - - # slice -> series - result = df.iloc[:4, 0] - tm.assert_series_equal(result, expected) - - # sequence -> series - result = df.iloc[:4, 0] - tm.assert_series_equal(result, expected) - - def test_loc_series(self, data): - ser = pd.Series(data) - result = ser.loc[:3] - expected = pd.Series(data[:4]) - tm.assert_series_equal(result, expected) - - result = ser.loc[[0, 1, 2, 3]] - tm.assert_series_equal(result, expected) - - 
def test_loc_frame(self, data): - df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) - expected = pd.DataFrame({"A": data[:4]}) - - # slice -> frame - result = df.loc[:3, ['A']] - tm.assert_frame_equal(result, expected) - - # sequence -> frame - result = df.loc[[0, 1, 2, 3], ['A']] - tm.assert_frame_equal(result, expected) - - expected = pd.Series(data[:4], name='A') - - # slice -> series - result = df.loc[:3, 'A'] - tm.assert_series_equal(result, expected) - - # sequence -> series - result = df.loc[:3, 'A'] - tm.assert_series_equal(result, expected) - - def test_getitem_scalar(self, data): - result = data[0] - assert isinstance(result, data.dtype.type) - - result = pd.Series(data)[0] - assert isinstance(result, data.dtype.type) - - def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): - result = data_missing[0] - assert na_cmp(result, na_value) - - def test_getitem_mask(self, data): - # Empty mask, raw array - mask = np.zeros(len(data), dtype=bool) - result = data[mask] - assert len(result) == 0 - assert isinstance(result, type(data)) - - # Empty mask, in series - mask = np.zeros(len(data), dtype=bool) - result = pd.Series(data)[mask] - assert len(result) == 0 - assert result.dtype == data.dtype - - # non-empty mask, raw array - mask[0] = True - result = data[mask] - assert len(result) == 1 - assert isinstance(result, type(data)) - - # non-empty mask, in series - result = pd.Series(data)[mask] - assert len(result) == 1 - assert result.dtype == data.dtype - - def test_getitem_slice(self, data): - # getitem[slice] should return an array - result = data[slice(0)] # empty - assert isinstance(result, type(data)) - - result = data[slice(1)] # scalar - assert isinstance(result, type(data)) - - def test_take_sequence(self, data): - result = pd.Series(data)[[0, 1, 3]] - assert result.iloc[0] == data[0] - assert result.iloc[1] == data[1] - assert result.iloc[2] == data[3] - - -class BaseMissingTests(object): - def test_isna(self, data_missing): - if data_missing._can_hold_na: - expected = np.array([True, False]) - else: - expected = np.array([False, False]) - - result = pd.isna(data_missing) - tm.assert_numpy_array_equal(result, expected) - - result = pd.Series(data_missing).isna() - expected = pd.Series(expected) - tm.assert_series_equal(result, expected) - - def test_dropna_series(self, data_missing): - ser = pd.Series(data_missing) - result = ser.dropna() - expected = ser.iloc[[1]] - tm.assert_series_equal(result, expected) - - def test_dropna_frame(self, data_missing): - df = pd.DataFrame({"A": data_missing}) - - # defaults - result = df.dropna() - expected = df.iloc[[1]] - tm.assert_frame_equal(result, expected) - - # axis = 1 - result = df.dropna(axis='columns') - expected = pd.DataFrame(index=[0, 1]) - tm.assert_frame_equal(result, expected) - - # multiple - df = pd.DataFrame({"A": data_missing, - "B": [1, np.nan]}) - result = df.dropna() - expected = df.iloc[:0] - tm.assert_frame_equal(result, expected) - - -class BaseMethodsTests(object): - """Various Series and DataFrame methods.""" - - @pytest.mark.parametrize('dropna', [True, False]) - def test_value_counts(self, all_data, dropna): - all_data = all_data[:10] - if dropna: - other = np.array(all_data[~all_data.isna()]) - else: - other = all_data - - result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() - expected = pd.Series(other).value_counts(dropna=dropna).sort_index() - - tm.assert_series_equal(result, expected) - - def test_count(self, data_missing): - df = pd.DataFrame({"A": data_missing}) - result 
= df.count(axis='columns') - expected = pd.Series([0, 1]) - tm.assert_series_equal(result, expected) - - def test_apply_simple_series(self, data): - result = pd.Series(data).apply(id) - assert isinstance(result, pd.Series) - - -class BaseCastingTests(object): - """Casting to and from ExtensionDtypes""" - - def test_astype_object_series(self, all_data): - ser = pd.Series({"A": all_data}) - result = ser.astype(object) - assert isinstance(result._data.blocks[0], ObjectBlock) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py new file mode 100644 index 0000000000000..2273ef1f3e110 --- /dev/null +++ b/pandas/tests/extension/base/__init__.py @@ -0,0 +1,42 @@ +"""Base test suite for extension arrays. + +These tests are intended for third-party libraries to subclass to validate +that their extension arrays and dtypes satisfy the interface. Moving or +renaming the tests should not be done lightly. + +Libraries are expected to implement a few pytest fixtures to provide data +for the tests. The fixtures may be located in either + +* The same module as your test class. +* A ``conftest.py`` in the same directory as your test class. + +The full list of fixtures may be found in the ``conftest.py`` next to this +file. + +.. code-block:: python + + import pytest + from pandas.tests.extension.base import BaseDtypeTests + + + @pytest.fixture + def dtype(): + return MyDtype() + + + class TestMyDtype(BaseDtypeTests): + pass + + +Your class ``TestDtype`` will inherit all the tests defined on +``BaseDtypeTests``. pytest's fixture discover will supply your ``dtype`` +wherever the test requires it. You're free to implement additional tests. +""" +from .casting import BaseCastingTests # noqa +from .constructors import BaseConstructorsTests # noqa +from .dtype import BaseDtypeTests # noqa +from .getitem import BaseGetitemTests # noqa +from .interface import BaseInterfaceTests # noqa +from .methods import BaseMethodsTests # noqa +from .missing import BaseMissingTests # noqa +from .reshaping import BaseReshapingTests # noqa diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py new file mode 100644 index 0000000000000..bcfbf0a247269 --- /dev/null +++ b/pandas/tests/extension/base/casting.py @@ -0,0 +1,11 @@ +import pandas as pd +from pandas.core.internals import ObjectBlock + + +class BaseCastingTests(object): + """Casting to and from ExtensionDtypes""" + + def test_astype_object_series(self, all_data): + ser = pd.Series({"A": all_data}) + result = ser.astype(object) + assert isinstance(result._data.blocks[0], ObjectBlock) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py new file mode 100644 index 0000000000000..7ad100e6289e9 --- /dev/null +++ b/pandas/tests/extension/base/constructors.py @@ -0,0 +1,43 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core.internals import ExtensionBlock + + +class BaseConstructorsTests(object): + + def test_series_constructor(self, data): + result = pd.Series(data) + assert result.dtype == data.dtype + assert len(result) == len(data) + assert isinstance(result._data.blocks[0], ExtensionBlock) + assert result._data.blocks[0].values is data + + # Series[EA] is unboxed / boxed correctly + result2 = pd.Series(result) + assert result2.dtype == data.dtype + assert isinstance(result2._data.blocks[0], ExtensionBlock) + + @pytest.mark.parametrize("from_series", [True, False]) + def 
test_dataframe_constructor_from_dict(self, data, from_series): + if from_series: + data = pd.Series(data) + result = pd.DataFrame({"A": data}) + assert result.dtypes['A'] == data.dtype + assert result.shape == (len(data), 1) + assert isinstance(result._data.blocks[0], ExtensionBlock) + + def test_dataframe_from_series(self, data): + result = pd.DataFrame(pd.Series(data)) + assert result.dtypes[0] == data.dtype + assert result.shape == (len(data), 1) + assert isinstance(result._data.blocks[0], ExtensionBlock) + + @pytest.mark.xfail(reason="GH-19342") + def test_series_given_mismatched_index_raises(self, data): + msg = 'Wrong number of items passed 3, placement implies 4' + with tm.assert_raises_regex(ValueError, None) as m: + pd.Series(data[:3], index=[0, 1, 2, 3, 4]) + + assert m.match(msg) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py new file mode 100644 index 0000000000000..f5015bd469f13 --- /dev/null +++ b/pandas/tests/extension/base/dtype.py @@ -0,0 +1,46 @@ +import numpy as np +import pandas as pd + + +class BaseDtypeTests(object): + """Base class for ExtensionDtype classes""" + + def test_name(self, dtype): + assert isinstance(dtype.name, str) + + def test_kind(self, dtype): + valid = set('biufcmMOSUV') + if dtype.kind is not None: + assert dtype.kind in valid + + def test_construct_from_string_own_name(self, dtype): + result = dtype.construct_from_string(dtype.name) + assert type(result) is type(dtype) + + # check OK as classmethod + result = type(dtype).construct_from_string(dtype.name) + assert type(result) is type(dtype) + + def test_is_dtype_from_name(self, dtype): + result = type(dtype).is_dtype(dtype.name) + assert result is True + + def test_is_dtype_unboxes_dtype(self, data, dtype): + assert dtype.is_dtype(data) is True + + def test_is_dtype_from_self(self, dtype): + result = type(dtype).is_dtype(dtype) + assert result is True + + def test_is_not_string_type(self, dtype): + return not pd.api.types.is_string_dtype(dtype) + + def test_is_not_object_type(self, dtype): + return not pd.api.types.is_object_dtype(dtype) + + def test_eq_with_str(self, dtype): + assert dtype == dtype.name + assert dtype != dtype.name + '-suffix' + + def test_eq_with_numpy_object(self, dtype): + assert dtype != np.dtype('object') diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py new file mode 100644 index 0000000000000..f43971e928cac --- /dev/null +++ b/pandas/tests/extension/base/getitem.py @@ -0,0 +1,119 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseGetitemTests(object): + """Tests for ExtensionArray.__getitem__.""" + + def test_iloc_series(self, data): + ser = pd.Series(data) + result = ser.iloc[:4] + expected = pd.Series(data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.iloc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_iloc_frame(self, data): + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.iloc[:4, [0]] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.iloc[[0, 1, 2, 3], [0]] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name='A') + + # slice -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + def test_loc_series(self, data): + ser = pd.Series(data) 
+ result = ser.loc[:3] + expected = pd.Series(data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.loc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_loc_frame(self, data): + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.loc[:3, ['A']] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.loc[[0, 1, 2, 3], ['A']] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name='A') + + # slice -> series + result = df.loc[:3, 'A'] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.loc[:3, 'A'] + tm.assert_series_equal(result, expected) + + def test_getitem_scalar(self, data): + result = data[0] + assert isinstance(result, data.dtype.type) + + result = pd.Series(data)[0] + assert isinstance(result, data.dtype.type) + + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): + result = data_missing[0] + assert na_cmp(result, na_value) + + def test_getitem_mask(self, data): + # Empty mask, raw array + mask = np.zeros(len(data), dtype=bool) + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + # Empty mask, in series + mask = np.zeros(len(data), dtype=bool) + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + # non-empty mask, raw array + mask[0] = True + result = data[mask] + assert len(result) == 1 + assert isinstance(result, type(data)) + + # non-empty mask, in series + result = pd.Series(data)[mask] + assert len(result) == 1 + assert result.dtype == data.dtype + + def test_getitem_slice(self, data): + # getitem[slice] should return an array + result = data[slice(0)] # empty + assert isinstance(result, type(data)) + + result = data[slice(1)] # scalar + assert isinstance(result, type(data)) + + def test_take_sequence(self, data): + result = pd.Series(data)[[0, 1, 3]] + assert result.iloc[0] == data[0] + assert result.iloc[1] == data[1] + assert result.iloc[2] == data[3] diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py new file mode 100644 index 0000000000000..8f17131a9482b --- /dev/null +++ b/pandas/tests/extension/base/interface.py @@ -0,0 +1,53 @@ +import numpy as np + +import pandas as pd +from pandas.compat import StringIO +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + + +class BaseInterfaceTests(object): + """Tests that the basic interface is satisfied.""" + # ------------------------------------------------------------------------ + # Interface + # ------------------------------------------------------------------------ + + def test_len(self, data): + assert len(data) == 100 + + def test_ndim(self, data): + assert data.ndim == 1 + + def test_can_hold_na_valid(self, data): + assert data._can_hold_na in {True, False} + + def test_memory_usage(self, data): + s = pd.Series(data) + result = s.memory_usage(index=False) + assert result == s.nbytes + + def test_array_interface(self, data): + result = np.array(data) + assert result[0] == data[0] + + def test_as_ndarray_with_dtype_kind(self, data): + np.array(data, dtype=data.dtype.kind) + + def test_repr(self, data): + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + + df = pd.DataFrame({"A": data}) + repr(df) + + def test_dtype_name_in_info(self, data): + buf = StringIO() + pd.DataFrame({"A": data}).info(buf=buf) + 
result = buf.getvalue() + assert data.dtype.name in result + + def test_is_extension_array_dtype(self, data): + assert is_extension_array_dtype(data) + assert is_extension_array_dtype(data.dtype) + assert is_extension_array_dtype(pd.Series(data)) + assert isinstance(data.dtype, ExtensionDtype) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py new file mode 100644 index 0000000000000..c77811ca63926 --- /dev/null +++ b/pandas/tests/extension/base/methods.py @@ -0,0 +1,32 @@ +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseMethodsTests(object): + """Various Series and DataFrame methods.""" + + @pytest.mark.parametrize('dropna', [True, False]) + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + def test_count(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + result = df.count(axis='columns') + expected = pd.Series([0, 1]) + tm.assert_series_equal(result, expected) + + def test_apply_simple_series(self, data): + result = pd.Series(data).apply(id) + assert isinstance(result, pd.Series) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py new file mode 100644 index 0000000000000..1d6f2eea1f1f9 --- /dev/null +++ b/pandas/tests/extension/base/missing.py @@ -0,0 +1,45 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseMissingTests(object): + def test_isna(self, data_missing): + if data_missing._can_hold_na: + expected = np.array([True, False]) + else: + expected = np.array([False, False]) + + result = pd.isna(data_missing) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(data_missing).isna() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + def test_dropna_series(self, data_missing): + ser = pd.Series(data_missing) + result = ser.dropna() + expected = ser.iloc[[1]] + tm.assert_series_equal(result, expected) + + def test_dropna_frame(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + + # defaults + result = df.dropna() + expected = df.iloc[[1]] + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.dropna(axis='columns') + expected = pd.DataFrame(index=[0, 1]) + tm.assert_frame_equal(result, expected) + + # multiple + df = pd.DataFrame({"A": data_missing, + "B": [1, np.nan]}) + result = df.dropna() + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py new file mode 100644 index 0000000000000..d8f577c6fa50d --- /dev/null +++ b/pandas/tests/extension/base/reshaping.py @@ -0,0 +1,61 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core.internals import ExtensionBlock + + +class BaseReshapingTests(object): + """Tests for reshaping and concatenation.""" + @pytest.mark.parametrize('in_frame', [True, False]) + def test_concat(self, data, in_frame): + wrapped = pd.Series(data) + if in_frame: + wrapped = pd.DataFrame(wrapped) + result = pd.concat([wrapped, wrapped], ignore_index=True) + + assert len(result) == len(data) * 2 + + if in_frame: + dtype = result.dtypes[0] + else: + dtype = 
result.dtype + + assert dtype == data.dtype + assert isinstance(result._data.blocks[0], ExtensionBlock) + + def test_align(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(type(data)(list(a) + [na_value])) + e2 = pd.Series(type(data)([na_value] + list(b))) + tm.assert_series_equal(r1, e1) + tm.assert_series_equal(r2, e2) + + def test_align_frame(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({'A': a}).align( + pd.DataFrame({'A': b}, index=[1, 2, 3]) + ) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) + e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) + tm.assert_frame_equal(r1, e1) + tm.assert_frame_equal(r2, e2) + + def test_set_frame_expand_regular_with_extension(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + df['B'] = data + expected = pd.DataFrame({"A": [1] * len(data), "B": data}) + tm.assert_frame_equal(df, expected) + + def test_set_frame_expand_extension_with_regular(self, data): + df = pd.DataFrame({'A': data}) + df['B'] = [1] * len(data) + expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/extension/category/__init__.py b/pandas/tests/extension/category/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/category/test_categorical.py similarity index 97% rename from pandas/tests/extension/test_categorical.py rename to pandas/tests/extension/category/test_categorical.py index fbec835a72ce8..ec548fca6d901 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -5,7 +5,7 @@ from pandas.api.types import CategoricalDtype from pandas import Categorical -from . 
import base +from pandas.tests.extension import base def make_data(): diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/decimal/decimal_array.py b/pandas/tests/extension/decimal/decimal_array.py new file mode 100644 index 0000000000000..f526ac5996a10 --- /dev/null +++ b/pandas/tests/extension/decimal/decimal_array.py @@ -0,0 +1,86 @@ +import decimal +import numbers +import random +import sys + +import numpy as np + +import pandas as pd +from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.base import ExtensionDtype + + +class DecimalDtype(ExtensionDtype): + type = decimal.Decimal + name = 'decimal' + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + +class DecimalArray(ExtensionArray): + dtype = DecimalDtype() + + def __init__(self, values): + values = np.asarray(values, dtype=object) + + self.values = values + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.values[item] + else: + return type(self)(self.values[item]) + + def copy(self, deep=False): + if deep: + return type(self)(self.values.copy()) + return type(self)(self) + + def __setitem__(self, key, value): + if pd.api.types.is_list_like(value): + value = [decimal.Decimal(v) for v in value] + else: + value = decimal.Decimal(value) + self.values[key] = value + + def __len__(self): + return len(self.values) + + def __repr__(self): + return repr(self.values) + + @property + def nbytes(self): + n = len(self) + if n: + return n * sys.getsizeof(self[0]) + return 0 + + def isna(self): + return np.array([x.is_nan() for x in self.values]) + + def take(self, indexer, allow_fill=True, fill_value=None): + mask = indexer == -1 + + out = self.values.take(indexer) + out[mask] = self._na_value + + return type(self)(out) + + @property + def _na_value(self): + return decimal.Decimal('NaN') + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate([x.values for x in to_concat])) + + +def make_data(): + return [decimal.Decimal(random.random()) for _ in range(100)] diff --git a/pandas/tests/extension/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py similarity index 66% rename from pandas/tests/extension/test_decimal.py rename to pandas/tests/extension/decimal/test_decimal.py index 46ca08f94c72b..9f65ae9a35b6c 100644 --- a/pandas/tests/extension/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -1,93 +1,13 @@ import decimal -import numbers -import random -import sys import numpy as np import pandas as pd import pandas.util.testing as tm import pytest -from pandas.core.arrays import ExtensionArray -from pandas.core.dtypes.base import ExtensionDtype +from pandas.tests.extension import base -from . 
import base - - -class DecimalDtype(ExtensionDtype): - type = decimal.Decimal - name = 'decimal' - - @classmethod - def construct_from_string(cls, string): - if string == cls.name: - return cls() - else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) - - -class DecimalArray(ExtensionArray): - dtype = DecimalDtype() - - def __init__(self, values): - values = np.asarray(values, dtype=object) - - self.values = values - - def __getitem__(self, item): - if isinstance(item, numbers.Integral): - return self.values[item] - else: - return type(self)(self.values[item]) - - def copy(self, deep=False): - if deep: - return type(self)(self.values.copy()) - return type(self)(self) - - def __setitem__(self, key, value): - if pd.api.types.is_list_like(value): - value = [decimal.Decimal(v) for v in value] - else: - value = decimal.Decimal(value) - self.values[key] = value - - def __len__(self): - return len(self.values) - - def __repr__(self): - return repr(self.values) - - @property - def nbytes(self): - n = len(self) - if n: - return n * sys.getsizeof(self[0]) - return 0 - - def isna(self): - return np.array([x.is_nan() for x in self.values]) - - def take(self, indexer, allow_fill=True, fill_value=None): - mask = indexer == -1 - - out = self.values.take(indexer) - out[mask] = self._na_value - - return type(self)(out) - - @property - def _na_value(self): - return decimal.Decimal('NaN') - - @classmethod - def _concat_same_type(cls, to_concat): - return cls(np.concatenate([x.values for x in to_concat])) - - -def make_data(): - return [decimal.Decimal(random.random()) for _ in range(100)] +from .decimal_array import DecimalDtype, DecimalArray, make_data @pytest.fixture diff --git a/pandas/tests/extension/json/__init__.py b/pandas/tests/extension/json/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/test_json.py b/pandas/tests/extension/json/json_array.py similarity index 72% rename from pandas/tests/extension/test_json.py rename to pandas/tests/extension/json/json_array.py index ecaa36b6db9c9..90aac93c68f64 100644 --- a/pandas/tests/extension/test_json.py +++ b/pandas/tests/extension/json/json_array.py @@ -1,23 +1,15 @@ import collections import itertools import numbers -import operator import random import string import sys import numpy as np -import pytest - from pandas.core.dtypes.base import ExtensionDtype from pandas.core.arrays import ExtensionArray -from . 
import base - -pytestmark = pytest.mark.skipif(sys.version_info[0] == 2, - reason="Py2 doesn't have a UserDict") - class JSONDtype(ExtensionDtype): type = collections.Mapping @@ -105,64 +97,3 @@ def make_data(): return [collections.UserDict([ (random.choice(string.ascii_letters), random.randint(0, 100)) for _ in range(random.randint(0, 10))]) for _ in range(100)] - - -@pytest.fixture -def dtype(): - return JSONDtype() - - -@pytest.fixture -def data(): - """Length-100 PeriodArray for semantics test.""" - return JSONArray(make_data()) - - -@pytest.fixture -def data_missing(): - """Length 2 array with [NA, Valid]""" - return JSONArray([{}, {'a': 10}]) - - -@pytest.fixture -def na_value(): - return {} - - -@pytest.fixture -def na_cmp(): - return operator.eq - - -class TestDtype(base.BaseDtypeTests): - pass - - -class TestInterface(base.BaseInterfaceTests): - pass - - -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestReshaping(base.BaseReshapingTests): - pass - - -class TestGetitem(base.BaseGetitemTests): - pass - - -class TestMissing(base.BaseMissingTests): - pass - - -class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="Unhashable") - def test_value_counts(self, all_data, dropna): - pass - - -class TestCasting(base.BaseCastingTests): - pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py new file mode 100644 index 0000000000000..4790dcd973542 --- /dev/null +++ b/pandas/tests/extension/json/test_json.py @@ -0,0 +1,73 @@ +import operator +import sys + +import pytest + + +from pandas.tests.extension import base + +from .json_array import JSONArray, JSONDtype, make_data + +pytestmark = pytest.mark.skipif(sys.version_info[0] == 2, + reason="Py2 doesn't have a UserDict") + + +@pytest.fixture +def dtype(): + return JSONDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + return JSONArray(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {'a': 10}]) + + +@pytest.fixture +def na_value(): + return {} + + +@pytest.fixture +def na_cmp(): + return operator.eq + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="Unhashable") + def test_value_counts(self, all_data, dropna): + pass + + +class TestCasting(base.BaseCastingTests): + pass From 1ea74dac9eb850a256d0023d5b55b7388681102f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Feb 2018 09:00:14 -0600 Subject: [PATCH 116/119] Revert index change --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ec877ec31e49c..1b781be8fa2b3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2829,7 +2829,7 @@ def reindexer(value): # turn me into an ndarray value = _sanitize_index(value, self.index, copy=False) - if not isinstance(value, (np.ndarray, Index, ExtensionArray)): + if not isinstance(value, (np.ndarray, Index)): if isinstance(value, list) and len(value) > 0: value = maybe_convert_platform(value) else: From 0c41a341aaff31248a5b34511f1dfbc81b7de65f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 
22 Feb 2018 09:00:31 -0600 Subject: [PATCH 117/119] Copy changes --- pandas/core/series.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index c9df1d60895d2..87132c5d2f313 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -180,10 +180,11 @@ def __init__(self, data=None, index=None, dtype=None, name=None, name = data.name if dtype is not None: + # astype copies data = data.astype(dtype) - - # need to copy to avoid aliasing issues - data = data._values.copy() + else: + # need to copy to avoid aliasing issues + data = data._values.copy() copy = False elif isinstance(data, np.ndarray): From 009beceb3977652e2bcee443c5216cb54451cabb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Feb 2018 09:17:57 -0600 Subject: [PATCH 118/119] Simplify EA implementation names comments for object vs. str missing values --- pandas/core/dtypes/missing.py | 2 ++ pandas/tests/extension/decimal/{decimal_array.py => array.py} | 0 pandas/tests/extension/decimal/test_decimal.py | 2 +- pandas/tests/extension/json/{json_array.py => array.py} | 0 pandas/tests/extension/json/test_json.py | 2 +- 5 files changed, 4 insertions(+), 2 deletions(-) rename pandas/tests/extension/decimal/{decimal_array.py => array.py} (100%) rename pandas/tests/extension/json/{json_array.py => array.py} (100%) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 36dbb0ee4b98f..01c88c269e7e0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -145,8 +145,10 @@ def _isna_ndarraylike(obj): shape = values.shape if is_string_like_dtype(dtype): + # object array of strings result = np.zeros(values.shape, dtype=bool) else: + # object array of non-strings result = np.empty(shape, dtype=bool) vec = libmissing.isnaobj(values.ravel()) result[...] 
= vec.reshape(shape) diff --git a/pandas/tests/extension/decimal/decimal_array.py b/pandas/tests/extension/decimal/array.py similarity index 100% rename from pandas/tests/extension/decimal/decimal_array.py rename to pandas/tests/extension/decimal/array.py diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 9f65ae9a35b6c..7b4d079ecad87 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -7,7 +7,7 @@ from pandas.tests.extension import base -from .decimal_array import DecimalDtype, DecimalArray, make_data +from .array import DecimalDtype, DecimalArray, make_data @pytest.fixture diff --git a/pandas/tests/extension/json/json_array.py b/pandas/tests/extension/json/array.py similarity index 100% rename from pandas/tests/extension/json/json_array.py rename to pandas/tests/extension/json/array.py diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 4790dcd973542..e0721bb1d8d1a 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -6,7 +6,7 @@ from pandas.tests.extension import base -from .json_array import JSONArray, JSONDtype, make_data +from .array import JSONArray, JSONDtype, make_data pytestmark = pytest.mark.skipif(sys.version_info[0] == 2, reason="Py2 doesn't have a UserDict") From ea5562b99d55062b2bb0a5b359bd5375adb3a46e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Feb 2018 10:07:34 -0600 Subject: [PATCH 119/119] Linting --- pandas/core/series.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 87132c5d2f313..12865bfe44a3b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,7 +29,6 @@ is_iterator, is_dict_like, is_scalar, - is_sparse, _is_unorderable_exception, _ensure_platform_int, pandas_dtype)
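For reference, a minimal sketch of the semantics of the reworked ``ExtensionDtype.is_dtype`` (including the unboxing of a ``dtype`` attribute exercised by ``test_is_dtype_unboxes_dtype``), run against the patched ``pandas.core.dtypes.base`` above. ``DemoDtype`` and ``Boxed`` are hypothetical names used only for illustration and are not part of the patch series.

.. code-block:: python

    import numpy as np

    from pandas.core.dtypes.base import ExtensionDtype


    class DemoDtype(ExtensionDtype):
        # hypothetical dtype, defined only to exercise ``is_dtype``
        type = object
        name = 'demo'

        @classmethod
        def construct_from_string(cls, string):
            if string == cls.name:
                return cls()
            raise TypeError("Cannot construct a 'DemoDtype' from "
                            "'{}'".format(string))


    class Boxed(object):
        # stand-in for an array- or Series-like object exposing ``.dtype``
        dtype = DemoDtype()


    assert DemoDtype.is_dtype(DemoDtype())            # instance of cls
    assert DemoDtype.is_dtype('demo')                 # via construct_from_string
    assert DemoDtype.is_dtype(Boxed())                # unboxes ``.dtype``
    assert not DemoDtype.is_dtype(np.dtype('int64'))  # never matches a np.dtype
    assert not DemoDtype.is_dtype(None)

Because ``is_dtype`` now unboxes a ``dtype`` attribute, callers can pass an extension array (or a Series backed by one) directly and the check is applied to its ``.dtype``.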