From c8d2a80edf8613440f9017ae087f3ff37de840ee Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Thu, 27 Oct 2016 10:33:10 -0700 Subject: [PATCH 01/20] Allow index.map() to accept series and dictionary inputs in addition to functional inputs --- pandas/core/indexes/base.py | 13 ++++++-- pandas/tests/indexes/test_base.py | 46 +++++++++++++++++++++++++++ pandas/tests/indexes/test_category.py | 6 ++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 57d2d07294a53..cb0f6c09176c6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -13,7 +13,6 @@ from pandas.compat.numpy import function as nv from pandas import compat - from pandas.core.dtypes.generic import ( ABCSeries, ABCMultiIndex, @@ -2864,7 +2863,7 @@ def map(self, mapper): Parameters ---------- - mapper : callable + mapper : function, dict, or Series Function to be applied. Returns @@ -2876,7 +2875,15 @@ def map(self, mapper): """ from .multi import MultiIndex - mapped_values = self._arrmap(self.values, mapper) + + if isinstance(mapper, ABCSeries): + indexer = mapper.index.get_indexer(self._values) + mapped_values = algos.take_1d(mapper.values, indexer) + else: + if isinstance(mapper, dict): + mapper = mapper.get + mapped_values = self._arrmap(self._values, mapper) + attributes = self._get_attributes_dict() if mapped_values.size and isinstance(mapped_values[0], tuple): return MultiIndex.from_tuples(mapped_values, diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 307cda7f2d1cb..79701bc60ab0a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -829,6 +829,52 @@ def test_map_tseries_indices_return_index(self): exp = Index(range(24), name='hourly') tm.assert_index_equal(exp, date_index.map(lambda x: x.hour)) + def test_map_with_series_all_indices(self): + expected = Index(['foo', 'bar', 'baz']) + mapper = Series(expected.values, index=[0, 1, 2]) + self.assert_index_equal(tm.makeIntIndex(3).map(mapper), expected) + + # GH 12766 + # special = [] + special = ['catIndex'] + + for name in special: + orig_values = ['a', 'B', 1, 'a'] + new_values = ['one', 2, 3.0, 'one'] + cur_index = CategoricalIndex(orig_values, name='XXX') + mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) + expected = CategoricalIndex(new_values, name='XXX') + output = cur_index.map(mapper) + self.assert_numpy_array_equal(expected.values.get_values(), output.values.get_values()) + self.assert_equal(expected.name, output.name) + + + for name in list(set(self.indices.keys()) - set(special)): + cur_index = self.indices[name] + expected = Index(np.arange(len(cur_index), 0, -1)) + mapper = pd.Series(expected.values, index=cur_index) + print(name) + output = cur_index.map(mapper) + self.assert_index_equal(expected, cur_index.map(mapper)) + + def test_map_with_categorical_series(self): + # GH 12756 + a = Index([1, 2, 3, 4]) + b = Series(["even", "odd", "even", "odd"], dtype="category") + c = Series(["even", "odd", "even", "odd"]) + + exp = CategoricalIndex(["odd", "even", "odd", np.nan]) + self.assert_index_equal(a.map(b), exp) + exp = Index(["odd", "even", "odd", np.nan]) + self.assert_index_equal(a.map(c), exp) + + def test_map_with_series_missing_values(self): + # GH 12756 + expected = Index([2., np.nan, 'foo']) + mapper = Series(['foo', 2., 'baz'], index=[0, 2, -1]) + output = Index([2, 1, 0]).map(mapper) + self.assert_index_equal(output, expected) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d8ec23b9c7e0e..9a32d23ed95e6 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -244,6 +244,12 @@ def f(x): ordered=False) tm.assert_index_equal(result, exp) + result = ci.map(pd.Series([10, 20, 30], index=['A', 'B', 'C'])) + tm.assert_index_equal(result, exp) + + result = ci.map({'A': 10, 'B': 20, 'C': 30}) + tm.assert_index_equal(result, exp) + def test_where(self): i = self.create_index() result = i.where(notna(i)) From 9a67ffdfc42d35b0f5c6975b87e9c5c1eb4b625d Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Thu, 27 Oct 2016 12:45:11 -0700 Subject: [PATCH 02/20] add test to make sure dictionaries with missing keys work --- pandas/tests/indexes/test_base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 79701bc60ab0a..37ae00b54c0fd 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -848,7 +848,6 @@ def test_map_with_series_all_indices(self): self.assert_numpy_array_equal(expected.values.get_values(), output.values.get_values()) self.assert_equal(expected.name, output.name) - for name in list(set(self.indices.keys()) - set(special)): cur_index = self.indices[name] expected = Index(np.arange(len(cur_index), 0, -1)) @@ -868,12 +867,16 @@ def test_map_with_categorical_series(self): exp = Index(["odd", "even", "odd", np.nan]) self.assert_index_equal(a.map(c), exp) - def test_map_with_series_missing_values(self): + def test_map_with_non_function_missing_values(self): # GH 12756 expected = Index([2., np.nan, 'foo']) + input = Index([2, 1, 0]) + mapper = Series(['foo', 2., 'baz'], index=[0, 2, -1]) - output = Index([2, 1, 0]).map(mapper) - self.assert_index_equal(output, expected) + self.assert_index_equal(expected, input.map(mapper)) + + mapper = {0: 'foo', 2: 2.0, -1: 'baz'} + self.assert_index_equal(expected, input.map(mapper)) def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) From 2b70597f92c2588c4cf62029829a5816cc70e7b1 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sat, 7 Jan 2017 11:55:20 -0800 Subject: [PATCH 03/20] Refactor the code to work with period time time delta indices --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/indexes/base.py | 34 ++++++++++++++++++---- pandas/core/indexes/datetimes.py | 24 +++++++++++++++ pandas/core/indexes/period.py | 19 ++++++++++++ pandas/core/indexes/timedeltas.py | 20 +++++++++++++ pandas/core/series.py | 18 ++---------- pandas/tests/indexes/test_base.py | 45 ++++++++++++++++++++--------- pandas/tseries/tests/test_period.py | 0 8 files changed, 125 insertions(+), 36 deletions(-) create mode 100644 pandas/tseries/tests/test_period.py diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 1a7b75266bfdf..22cd32dd7de73 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -519,6 +519,7 @@ Other Enhancements - ``parallel_coordinates()`` has gained a ``sort_labels`` keyword argument that sorts class labels and the colors assigned to them (:issue:`15908`) - Options added to allow one to turn on/off using ``bottleneck`` and ``numexpr``, see :ref:`here ` (:issue:`16157`) - ``DataFrame.style.bar()`` now accepts two more options to further customize the bar chart. Bar alignment is set with ``align='left'|'mid'|'zero'``, the default is "left", which is backward compatible; You can now pass a list of ``color=[color_negative, color_positive]``. (:issue:`14757`) +- ``Index.map`` can now accept series and dictionary input object (:issue:`12756`). .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cb0f6c09176c6..e722f93027c0a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2820,6 +2820,25 @@ def get_indexer_for(self, target, **kwargs): indexer, _ = self.get_indexer_non_unique(target, **kwargs) return indexer + def get_values_from_dict(self, input_dict): + """Return the values of the input dictionary in the order the keys are + in the index. np.nan is returned for index values not in the + dictionary. + + Parameters + ---------- + input_dict : dict + The dictionary from which to extract the values + + Returns + ------- + Union[np.array, list] + + """ + + return lib.fast_multiget(input_dict, self.values, + default=np.nan) + def _maybe_promote(self, other): # A hack, but it works from pandas.core.indexes.datetimes import DatetimeIndex @@ -2863,8 +2882,8 @@ def map(self, mapper): Parameters ---------- - mapper : function, dict, or Series - Function to be applied. + mapper : Union[function, dict, Series] + Function to be applied or input correspondence object. Returns ------- @@ -2877,12 +2896,15 @@ def map(self, mapper): from .multi import MultiIndex if isinstance(mapper, ABCSeries): - indexer = mapper.index.get_indexer(self._values) + indexer = mapper.index.get_indexer(self.values) mapped_values = algos.take_1d(mapper.values, indexer) + elif isinstance(mapper, dict): + idx = Index(mapper.keys()) + data = idx.get_values_from_dict(mapper) + indexer = idx.get_indexer(self.values) + mapped_values = algos.take_1d(data, indexer) else: - if isinstance(mapper, dict): - mapper = mapper.get - mapped_values = self._arrmap(self._values, mapper) + mapped_values = self._arrmap(self.values, mapper) attributes = self._get_attributes_dict() if mapped_values.size and isinstance(mapped_values[0], tuple): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 26e8d12c98660..0637acc03b09c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1409,6 +1409,30 @@ def get_value_maybe_box(self, series, key): key, tz=self.tz) return _maybe_box(self, values, series, key) + def get_values_from_dict(self, input_dict): + """Return the values of the input dictionary in the order the keys are + in the index. np.nan is returned for index values not in the + dictionary. + + Parameters + ---------- + input_dict : dict + The dictionary from which to extract the values + + Returns + ------- + Union[np.array, list] + + """ + if len(input_dict): + # coerce back to datetime objects for lookup + input_dict = com._dict_compat(input_dict) + return lib.fast_multiget(input_dict, + self.asobject.values, + default=np.nan) + else: + return np.nan + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 148ca2725fbdc..c5784f68e51b1 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -800,6 +800,25 @@ def _get_unique_index(self, dropna=False): res = res.dropna() return res + def get_values_from_dict(self, input_dict): + """Return the values of the input dictionary in the order the keys are + in the index. np.nan is returned for index values not in the + dictionary. + + Parameters + ---------- + input_dict : dict + The dictionary from which to extract the values + + Returns + ------- + Union[np.array, list] + + """ + + return np.array([input_dict.get(i, np.nan) for i in self.values] + if input_dict else [np.nan]) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 9647cef608d4e..e23ab5c9fa156 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -680,6 +680,26 @@ def get_value_maybe_box(self, series, key): values = self._engine.get_value(_values_from_object(series), key) return _maybe_box(self, values, series, key) + def get_values_from_dict(self, input_dict): + """Return the values of the input dictionary in the order the keys are + in the index. np.nan is returned for index values not in the + dictionary. + + Parameters + ---------- + input_dict : dict + The dictionary from which to extract the values + + Returns + ------- + Union[np.array, list] + + """ + + return np.array([input_dict.get(i, np.nan) + for i in self.asobject.values] + if input_dict else [np.nan]) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/core/series.py b/pandas/core/series.py index 1c92c4b8850ee..593081e5a594a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -202,23 +202,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None, index = Index(data) else: index = Index(_try_sort(data)) + try: - if isinstance(index, DatetimeIndex): - if len(data): - # coerce back to datetime objects for lookup - data = _dict_compat(data) - data = lib.fast_multiget(data, - index.asobject.values, - default=np.nan) - else: - data = np.nan - # GH #12169 - elif isinstance(index, (PeriodIndex, TimedeltaIndex)): - data = ([data.get(i, np.nan) for i in index] - if data else np.nan) - else: - data = lib.fast_multiget(data, index.values, - default=np.nan) + data = index.get_values_from_dict(data) except TypeError: data = ([data.get(i, np.nan) for i in index] if data else np.nan) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 37ae00b54c0fd..48d1cdcac4d6b 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -829,10 +829,10 @@ def test_map_tseries_indices_return_index(self): exp = Index(range(24), name='hourly') tm.assert_index_equal(exp, date_index.map(lambda x: x.hour)) - def test_map_with_series_all_indices(self): + def test_map_with_dict_and_series(self): expected = Index(['foo', 'bar', 'baz']) mapper = Series(expected.values, index=[0, 1, 2]) - self.assert_index_equal(tm.makeIntIndex(3).map(mapper), expected) + tm.assert_index_equal(tm.makeIntIndex(3).map(mapper), expected) # GH 12766 # special = [] @@ -842,30 +842,47 @@ def test_map_with_series_all_indices(self): orig_values = ['a', 'B', 1, 'a'] new_values = ['one', 2, 3.0, 'one'] cur_index = CategoricalIndex(orig_values, name='XXX') + expected = CategoricalIndex(new_values, + name='XXX', categories=[3.0, 2, 'one']) + mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) - expected = CategoricalIndex(new_values, name='XXX') output = cur_index.map(mapper) - self.assert_numpy_array_equal(expected.values.get_values(), output.values.get_values()) - self.assert_equal(expected.name, output.name) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) + + mapper = {o: n for o, n in + zip(orig_values[:-1], new_values[:-1])} + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) for name in list(set(self.indices.keys()) - set(special)): cur_index = self.indices[name] expected = Index(np.arange(len(cur_index), 0, -1)) - mapper = pd.Series(expected.values, index=cur_index) - print(name) - output = cur_index.map(mapper) - self.assert_index_equal(expected, cur_index.map(mapper)) + mapper = pd.Series(expected, index=cur_index) + tm.assert_index_equal(expected, cur_index.map(mapper)) + + mapper = {o: n for o, n in + zip(cur_index, expected)} + if mapper: + tm.assert_index_equal(expected, cur_index.map(mapper)) + else: + # The expected index type is Int64Index + # but the output defaults to Float64 + tm.assert_index_equal(Float64Index([]), + cur_index.map(mapper)) def test_map_with_categorical_series(self): # GH 12756 a = Index([1, 2, 3, 4]) - b = Series(["even", "odd", "even", "odd"], dtype="category") + b = Series(["even", "odd", "even", "odd"], + dtype="category") c = Series(["even", "odd", "even", "odd"]) exp = CategoricalIndex(["odd", "even", "odd", np.nan]) - self.assert_index_equal(a.map(b), exp) + tm.assert_index_equal(a.map(b), exp) exp = Index(["odd", "even", "odd", np.nan]) - self.assert_index_equal(a.map(c), exp) + tm.assert_index_equal(a.map(c), exp) def test_map_with_non_function_missing_values(self): # GH 12756 @@ -873,10 +890,10 @@ def test_map_with_non_function_missing_values(self): input = Index([2, 1, 0]) mapper = Series(['foo', 2., 'baz'], index=[0, 2, -1]) - self.assert_index_equal(expected, input.map(mapper)) + tm.assert_index_equal(expected, input.map(mapper)) mapper = {0: 'foo', 2: 2.0, -1: 'baz'} - self.assert_index_equal(expected, input.map(mapper)) + tm.assert_index_equal(expected, input.map(mapper)) def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 80ca2e291ff9a20a06713e793a5da07dfba133a6 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sat, 21 Jan 2017 18:55:21 -0800 Subject: [PATCH 04/20] Makes changes based on feedback from @jreback --- pandas/core/indexes/base.py | 17 +++++---- pandas/core/indexes/datetimelike.py | 8 ++++ pandas/core/indexes/datetimes.py | 27 ++++--------- pandas/core/indexes/period.py | 19 ---------- pandas/core/indexes/timedeltas.py | 20 ---------- pandas/core/series.py | 2 +- pandas/tests/indexes/test_base.py | 46 +++++------------------ pandas/tests/indexes/test_category.py | 12 ++++++ pandas/tests/indexing/test_categorical.py | 18 +++++++++ 9 files changed, 65 insertions(+), 104 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e722f93027c0a..214f857b8b31a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2820,23 +2820,25 @@ def get_indexer_for(self, target, **kwargs): indexer, _ = self.get_indexer_non_unique(target, **kwargs) return indexer - def get_values_from_dict(self, input_dict): - """Return the values of the input dictionary in the order the keys are + _index_shared_docs['_get_values_from_dict'] = """ + Return the values of the input dictionary in the order the keys are in the index. np.nan is returned for index values not in the dictionary. Parameters ---------- - input_dict : dict + data : dict The dictionary from which to extract the values Returns ------- - Union[np.array, list] + np.array """ - return lib.fast_multiget(input_dict, self.values, + @Appender(_index_shared_docs['_get_values_from_dict']) + def _get_values_from_dict(self, data): + return lib.fast_multiget(data, self.values, default=np.nan) def _maybe_promote(self, other): @@ -2882,8 +2884,9 @@ def map(self, mapper): Parameters ---------- - mapper : Union[function, dict, Series] + mapper : {callable, dict, Series} Function to be applied or input correspondence object. + dict and Series support new in 0.20.0. Returns ------- @@ -2900,7 +2903,7 @@ def map(self, mapper): mapped_values = algos.take_1d(mapper.values, indexer) elif isinstance(mapper, dict): idx = Index(mapper.keys()) - data = idx.get_values_from_dict(mapper) + data = idx._get_values_from_dict(mapper) indexer = idx.get_indexer(self.values) mapped_values = algos.take_1d(data, indexer) else: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index cc9361b550c5b..af2358d09d670 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -697,6 +697,14 @@ def __rsub__(self, other): def _add_delta(self, other): return NotImplemented + @Appender(_index_shared_docs['_get_values_from_dict']) + def _get_values_from_dict(self, data): + if len(data): + return np.array([data.get(i, np.nan) + for i in self.asobject.values]) + + return np.array([np.nan]) + def _add_delta_td(self, other): # add a delta of a timedeltalike # return the i8 result view diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 0637acc03b09c..88a31cd0789a3 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1409,29 +1409,16 @@ def get_value_maybe_box(self, series, key): key, tz=self.tz) return _maybe_box(self, values, series, key) - def get_values_from_dict(self, input_dict): - """Return the values of the input dictionary in the order the keys are - in the index. np.nan is returned for index values not in the - dictionary. - - Parameters - ---------- - input_dict : dict - The dictionary from which to extract the values - - Returns - ------- - Union[np.array, list] - - """ - if len(input_dict): + @Appender(_index_shared_docs['_get_values_from_dict']) + def _get_values_from_dict(self, data): + if len(data): # coerce back to datetime objects for lookup - input_dict = com._dict_compat(input_dict) - return lib.fast_multiget(input_dict, + data = com._dict_compat(data) + return lib.fast_multiget(data, self.asobject.values, default=np.nan) - else: - return np.nan + + return np.array([np.nan]) def get_loc(self, key, method=None, tolerance=None): """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index c5784f68e51b1..148ca2725fbdc 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -800,25 +800,6 @@ def _get_unique_index(self, dropna=False): res = res.dropna() return res - def get_values_from_dict(self, input_dict): - """Return the values of the input dictionary in the order the keys are - in the index. np.nan is returned for index values not in the - dictionary. - - Parameters - ---------- - input_dict : dict - The dictionary from which to extract the values - - Returns - ------- - Union[np.array, list] - - """ - - return np.array([input_dict.get(i, np.nan) for i in self.values] - if input_dict else [np.nan]) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index e23ab5c9fa156..9647cef608d4e 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -680,26 +680,6 @@ def get_value_maybe_box(self, series, key): values = self._engine.get_value(_values_from_object(series), key) return _maybe_box(self, values, series, key) - def get_values_from_dict(self, input_dict): - """Return the values of the input dictionary in the order the keys are - in the index. np.nan is returned for index values not in the - dictionary. - - Parameters - ---------- - input_dict : dict - The dictionary from which to extract the values - - Returns - ------- - Union[np.array, list] - - """ - - return np.array([input_dict.get(i, np.nan) - for i in self.asobject.values] - if input_dict else [np.nan]) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/core/series.py b/pandas/core/series.py index 593081e5a594a..daee012eda874 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -204,7 +204,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None, index = Index(_try_sort(data)) try: - data = index.get_values_from_dict(data) + data = index._get_values_from_dict(data) except TypeError: data = ([data.get(i, np.nan) for i in index] if data else np.nan) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 48d1cdcac4d6b..e0ab5d3a8ad29 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -830,33 +830,17 @@ def test_map_tseries_indices_return_index(self): tm.assert_index_equal(exp, date_index.map(lambda x: x.hour)) def test_map_with_dict_and_series(self): + # GH 12756 expected = Index(['foo', 'bar', 'baz']) mapper = Series(expected.values, index=[0, 1, 2]) - tm.assert_index_equal(tm.makeIntIndex(3).map(mapper), expected) - - # GH 12766 - # special = [] - special = ['catIndex'] - - for name in special: - orig_values = ['a', 'B', 1, 'a'] - new_values = ['one', 2, 3.0, 'one'] - cur_index = CategoricalIndex(orig_values, name='XXX') - expected = CategoricalIndex(new_values, - name='XXX', categories=[3.0, 2, 'one']) - - mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) - output = cur_index.map(mapper) - # Order of categories in output can be different - tm.assert_index_equal(expected, output) + result = tm.makeIntIndex(3).map(mapper) + tm.assert_index_equal(result, expected) - mapper = {o: n for o, n in - zip(orig_values[:-1], new_values[:-1])} - output = cur_index.map(mapper) - # Order of categories in output can be different - tm.assert_index_equal(expected, output) + for name in self.indices.keys(): + if name == 'catIndex': + # Tested in test_categorical + continue - for name in list(set(self.indices.keys()) - set(special)): cur_index = self.indices[name] expected = Index(np.arange(len(cur_index), 0, -1)) mapper = pd.Series(expected, index=cur_index) @@ -864,26 +848,14 @@ def test_map_with_dict_and_series(self): mapper = {o: n for o, n in zip(cur_index, expected)} + # If the mapper is empty the expected index type is Int64Index + # but the output defaults to Float64 so I treat it independently if mapper: tm.assert_index_equal(expected, cur_index.map(mapper)) else: - # The expected index type is Int64Index - # but the output defaults to Float64 tm.assert_index_equal(Float64Index([]), cur_index.map(mapper)) - def test_map_with_categorical_series(self): - # GH 12756 - a = Index([1, 2, 3, 4]) - b = Series(["even", "odd", "even", "odd"], - dtype="category") - c = Series(["even", "odd", "even", "odd"]) - - exp = CategoricalIndex(["odd", "even", "odd", np.nan]) - tm.assert_index_equal(a.map(b), exp) - exp = Index(["odd", "even", "odd", np.nan]) - tm.assert_index_equal(a.map(c), exp) - def test_map_with_non_function_missing_values(self): # GH 12756 expected = Index([2., np.nan, 'foo']) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 9a32d23ed95e6..533d81737c2c8 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -250,6 +250,18 @@ def f(x): result = ci.map({'A': 10, 'B': 20, 'C': 30}) tm.assert_index_equal(result, exp) + def test_map_with_categorical_series(self): + # GH 12756 + a = pd.Index([1, 2, 3, 4]) + b = pd.Series(["even", "odd", "even", "odd"], + dtype="category") + c = pd.Series(["even", "odd", "even", "odd"]) + + exp = CategoricalIndex(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(b), exp) + exp = pd.Index(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(c), exp) + def test_where(self): i = self.create_index() result = i.where(notna(i)) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 2c93d2afd1760..22b3fd9073bab 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -439,3 +439,21 @@ def test_indexing_with_category(self): res = (cat[['A']] == 'foo') tm.assert_frame_equal(res, exp) + + def test_map_with_dict_or_series(self): + orig_values = ['a', 'B', 1, 'a'] + new_values = ['one', 2, 3.0, 'one'] + cur_index = pd.CategoricalIndex(orig_values, name='XXX') + expected = pd.CategoricalIndex(new_values, + name='XXX', categories=[3.0, 2, 'one']) + + mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) + + mapper = {o: n for o, n in + zip(orig_values[:-1], new_values[:-1])} + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) From 00165c4fb8c158a2289b88c21700ce0e79a69728 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sat, 22 Apr 2017 11:56:06 -0700 Subject: [PATCH 05/20] Using numpy array from return if possible --- pandas/core/indexes/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index af2358d09d670..cb34944e93fb3 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -351,7 +351,7 @@ def map(self, f): # Try to use this result if we can if isinstance(result, np.ndarray): - self._shallow_copy(result) + result = Index(result) if not isinstance(result, Index): raise TypeError('The map function must return an Index object') From 2f019c5dee92539c080da079fb022c3ac1ad38a0 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sat, 22 Apr 2017 12:48:19 -0700 Subject: [PATCH 06/20] Update tests to include empty maps and NaT values --- .../indexes/datetimes/test_datetimelike.py | 26 +++++++++++++++++++ pandas/tests/indexes/period/test_period.py | 6 +---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 538e10e6011ec..f98cfb34aa04e 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -76,3 +76,29 @@ def test_union(self): for case in cases: result = first.union(case) assert tm.equalContents(result, everything) + + def test_map(self): + expected = self.index + 1 + tm.assert_index_equal(self.index.map(lambda x: x + 1), expected) + + series_map = pd.Series(expected, self.index) + tm.assert_index_equal(self.index.map(series_map), expected) + + dict_map = {i: e for e, i in zip(expected, self.index)} + tm.assert_index_equal(self.index.map(dict_map), expected) + + # empty mappable + nan_index = Index([pd.np.nan] * len(self.index)) + series_map = pd.Series() + tm.assert_index_equal(self.index.map(series_map), nan_index) + dict_map = {} + tm.assert_index_equal(self.index.map(dict_map), nan_index) + + # map to NaT + result = self.index.map(lambda x: pd.NaT if x == self.index[0] else x) + expected = Index([pd.NaT] + self.index[1:].tolist()) + tm.assert_index_equal(result, expected) + series_map = pd.Series(expected, self.index) + tm.assert_index_equal(self.index.map(series_map), expected) + dict_map = {i: e for e, i in zip(expected, self.index)} + tm.assert_index_equal(self.index.map(dict_map), expected) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index ae500e66359b4..3a6613b612b17 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -788,12 +788,8 @@ def test_pickle_freq(self): assert new_prng.freq == offsets.MonthEnd() assert new_prng.freqstr == 'M' - def test_map(self): + def test_map_with_ordinal(self): index = PeriodIndex([2005, 2007, 2009], freq='A') - result = index.map(lambda x: x + 1) - expected = index + 1 - tm.assert_index_equal(result, expected) - result = index.map(lambda x: x.ordinal) exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) From f6a24046ce11870478c6a214d2b4e086cdef9f58 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sat, 22 Apr 2017 15:14:38 -0700 Subject: [PATCH 07/20] Refactor map to use common code for series and index when possible and add dict performance test --- asv_bench/benchmarks/series_methods.py | 24 +++++++++++++ pandas/core/base.py | 47 +++++++++++++++++++++++++- pandas/core/indexes/base.py | 37 ++++++++------------ pandas/core/series.py | 40 +++------------------- pandas/tests/indexes/test_base.py | 17 ++++++++++ 5 files changed, 107 insertions(+), 58 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 3c0e2869357ae..5e8cf3a0350bb 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -123,6 +123,30 @@ def time_series_dropna_datetime(self): self.s.dropna() +class series_map_dict(object): + goal_time = 0.2 + + def setup(self): + map_size = 1000 + self.s = Series(np.random.randint(0, map_size, 10000)) + self.map_dict = {i: map_size - i for i in range(map_size)} + + def time_series_map_dict(self): + self.s.map(self.map_dict) + + +class series_map_series(object): + goal_time = 0.2 + + def setup(self): + map_size = 1000 + self.s = Series(np.random.randint(0, map_size, 10000)) + self.map_series = Series(map_size - np.arange(map_size)) + + def time_series_map_series(self): + self.s.map(self.map_series) + + class series_clip(object): goal_time = 0.2 diff --git a/pandas/core/base.py b/pandas/core/base.py index 19f6728642645..b30c5ec56774a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -16,7 +16,7 @@ from pandas.util._validators import validate_bool_kwarg -from pandas.core import common as com +from pandas.core import common as com, algorithms import pandas.core.nanops as nanops import pandas._libs.lib as lib from pandas.compat.numpy import function as nv @@ -838,6 +838,51 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, klass=self.__class__.__name__, op=name)) return func(**kwds) + def _map_values(self, values, arg, na_action=None): + if is_extension_type(self.dtype): + if na_action is not None: + raise NotImplementedError + map_f = lambda values, f: values.map(f) + else: + if na_action == 'ignore': + def map_f(values, f): + return lib.map_infer_mask(values, f, + isnull(values).view(np.uint8)) + else: + map_f = lib.map_infer + + map_values = None + if isinstance(arg, dict): + if hasattr(arg, '__missing__'): + # If a dictionary subclass defines a default value method, + # convert arg to a lookup function (GH #15999). + dict_with_default = arg + arg = lambda x: dict_with_default[x] + else: + # Dictionary does not have a default. Thus it's safe to + # convert to an Index for efficiency. + from pandas import Index + idx = Index(arg.keys()) + # Cast to dict so we can get values using lib.fast_multiget + # if this is a dict subclass (GH #15999) + map_values = idx._get_values_from_dict(dict(arg)) + arg = idx + elif isinstance(arg, ABCSeries): + map_values = arg.values + arg = arg.index + + if map_values is not None: + # Since values were input this means we came from either + # a dict or a series and arg should be an index + indexer = arg.get_indexer(values) + new_values = algorithms.take_1d(map_values, indexer) + else: + # arg is a function + new_values = map_f(values, arg) + + return new_values + + def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 214f857b8b31a..14bc5323eaad7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2827,7 +2827,7 @@ def get_indexer_for(self, target, **kwargs): Parameters ---------- - data : dict + data : {dict, DictWithoutMissing} The dictionary from which to extract the values Returns @@ -2879,43 +2879,36 @@ def groupby(self, values): return result - def map(self, mapper): - """Apply mapper function to an index. + def map(self, arg, na_action=None): + """Map values of Series using input correspondence (which can be a + dict, Series, or function) Parameters ---------- - mapper : {callable, dict, Series} - Function to be applied or input correspondence object. - dict and Series support new in 0.20.0. + arg : function, dict, or Series + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping function Returns ------- - applied : Union[Index, MultiIndex], inferred + applied : {Index, MultiIndex}, inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. """ - from .multi import MultiIndex - - if isinstance(mapper, ABCSeries): - indexer = mapper.index.get_indexer(self.values) - mapped_values = algos.take_1d(mapper.values, indexer) - elif isinstance(mapper, dict): - idx = Index(mapper.keys()) - data = idx._get_values_from_dict(mapper) - indexer = idx.get_indexer(self.values) - mapped_values = algos.take_1d(data, indexer) - else: - mapped_values = self._arrmap(self.values, mapper) + from .multi import MultiIndex + new_values = super(Index, self)._map_values( + self.values, arg, na_action=na_action) attributes = self._get_attributes_dict() - if mapped_values.size and isinstance(mapped_values[0], tuple): - return MultiIndex.from_tuples(mapped_values, + if new_values.size and isinstance(new_values[0], tuple): + return MultiIndex.from_tuples(new_values, names=attributes.get('name')) attributes['copy'] = False - return Index(mapped_values, **attributes) + return Index(new_values, **attributes) def isin(self, values, level=None): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index daee012eda874..926293fdf29f5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2263,43 +2263,13 @@ def map(self, arg, na_action=None): 3 0 dtype: int64 """ - if is_extension_type(self.dtype): - values = self._values - if na_action is not None: - raise NotImplementedError - map_f = lambda values, f: values.map(f) + input_values = self._values else: - values = self.asobject - - if na_action == 'ignore': - def map_f(values, f): - return lib.map_infer_mask(values, f, - isna(values).view(np.uint8)) - else: - map_f = lib.map_infer - - if isinstance(arg, dict): - if hasattr(arg, '__missing__'): - # If a dictionary subclass defines a default value method, - # convert arg to a lookup function (GH #15999). - dict_with_default = arg - arg = lambda x: dict_with_default[x] - else: - # Dictionary does not have a default. Thus it's safe to - # convert to an indexed series for efficiency. - arg = self._constructor(arg, index=arg.keys()) - - if isinstance(arg, Series): - # arg is a Series - indexer = arg.index.get_indexer(values) - new_values = algorithms.take_1d(arg._values, indexer) - else: - # arg is a function - new_values = map_f(values, arg) - - return self._constructor(new_values, - index=self.index).__finalize__(self) + input_values = self.asobject + new_values = super(Series, self)._map_values( + input_values, arg, na_action=na_action) + return self._constructor(new_values, index=self.index).__finalize__(self) def _gotitem(self, key, ndim, subset=None): """ diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e0ab5d3a8ad29..720a48af65676 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -4,6 +4,8 @@ from datetime import datetime, timedelta +from collections import defaultdict + import pandas.util.testing as tm from pandas.core.indexes.api import Index, MultiIndex from pandas.tests.indexes.common import Base @@ -867,6 +869,21 @@ def test_map_with_non_function_missing_values(self): mapper = {0: 'foo', 2: 2.0, -1: 'baz'} tm.assert_index_equal(expected, input.map(mapper)) + def test_map_na_exclusion(self): + idx = Index([1.5, np.nan, 3, np.nan, 5]) + + result = idx.map(lambda x: x * 2, na_action='ignore') + exp = idx * 2 + tm.assert_index_equal(result, exp) + + def test_map_defaultdict(self): + idx = Index([1, 2, 3]) + default_dict = defaultdict(lambda: 'blank') + default_dict[1] = 'stuff' + result = idx.map(default_dict) + expected = Index(['stuff', 'blank', 'blank']) + tm.assert_index_equal(result, expected) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) From 0c72a38bb8bdc89c15b79afb506396ba13af654e Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sat, 22 Apr 2017 22:15:06 -0700 Subject: [PATCH 08/20] Fix bug related to converting a dictionary to a MultiIndex instead of an Index --- pandas/core/base.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index b30c5ec56774a..1b987cac595fb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -860,22 +860,15 @@ def map_f(values, f): arg = lambda x: dict_with_default[x] else: # Dictionary does not have a default. Thus it's safe to - # convert to an Index for efficiency. - from pandas import Index - idx = Index(arg.keys()) - # Cast to dict so we can get values using lib.fast_multiget - # if this is a dict subclass (GH #15999) - map_values = idx._get_values_from_dict(dict(arg)) - arg = idx - elif isinstance(arg, ABCSeries): - map_values = arg.values - arg = arg.index - - if map_values is not None: + # convert to an Series for efficiency. + from pandas import Series + arg = Series(arg, index=arg.keys()) + + if isinstance(arg, ABCSeries): # Since values were input this means we came from either # a dict or a series and arg should be an index - indexer = arg.get_indexer(values) - new_values = algorithms.take_1d(map_values, indexer) + indexer = arg.index.get_indexer(values) + new_values = algorithms.take_1d(arg._values, indexer) else: # arg is a function new_values = map_f(values, arg) From 73c276b5f7f2f4aada74774ce5cc7c6bd37778f2 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Mon, 24 Apr 2017 10:42:35 -0700 Subject: [PATCH 09/20] pep8 fixes --- pandas/core/base.py | 2 -- pandas/core/series.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 1b987cac595fb..6c405eb9e6daf 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -851,7 +851,6 @@ def map_f(values, f): else: map_f = lib.map_infer - map_values = None if isinstance(arg, dict): if hasattr(arg, '__missing__'): # If a dictionary subclass defines a default value method, @@ -875,7 +874,6 @@ def map_f(values, f): return new_values - def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 926293fdf29f5..20ce812173372 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2269,7 +2269,8 @@ def map(self, arg, na_action=None): input_values = self.asobject new_values = super(Series, self)._map_values( input_values, arg, na_action=na_action) - return self._constructor(new_values, index=self.index).__finalize__(self) + return self._constructor(new_values, + index=self.index).__finalize__(self) def _gotitem(self, key, ndim, subset=None): """ From a858467ac9a545953c45ce5e359c7a7eedf9fd85 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sun, 21 May 2017 15:34:07 -0700 Subject: [PATCH 10/20] Address comments from @jreback add new tests; skip tests on IntervalIndex --- pandas/core/base.py | 21 +++++++++++++++ pandas/core/indexes/base.py | 10 +++++-- pandas/tests/indexes/common.py | 26 +++++++++++++++++++ pandas/tests/indexes/datetimelike.py | 23 +++++++++++++++- .../indexes/datetimes/test_datetimelike.py | 25 +----------------- pandas/tests/indexes/period/test_period.py | 4 ++- pandas/tests/indexes/test_interval.py | 4 +++ .../indexes/timedeltas/test_timedelta.py | 1 + 8 files changed, 86 insertions(+), 28 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 6c405eb9e6daf..a305b1ae16107 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -839,6 +839,27 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, return func(**kwds) def _map_values(self, values, arg, na_action=None): + """An internal function that maps values using the input + correspondence (which can be a dict, Series, or function). + + Parameters + ---------- + values : np.ndarray + The values to be mapped + arg : function, dict, or Series + The input correspondence object + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping function + + Returns + ------- + applied : {Index, MultiIndex}, inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + + """ if is_extension_type(self.dtype): if na_action is not None: raise NotImplementedError diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 14bc5323eaad7..b0650b43e2552 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2827,7 +2827,7 @@ def get_indexer_for(self, target, **kwargs): Parameters ---------- - data : {dict, DictWithoutMissing} + data : dict The dictionary from which to extract the values Returns @@ -2904,8 +2904,14 @@ def map(self, arg, na_action=None): self.values, arg, na_action=na_action) attributes = self._get_attributes_dict() if new_values.size and isinstance(new_values[0], tuple): + if isinstance(self, MultiIndex): + names = self.names + elif attributes.get('name'): + names = [attributes.get('name')] * len(new_values[0]) + else: + names = None return MultiIndex.from_tuples(new_values, - names=attributes.get('name')) + names=names) attributes['copy'] = False return Index(new_values, **attributes) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 456e5a9bd6439..e40147ced3c3a 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -996,3 +996,29 @@ def test_searchsorted_monotonic(self, indices): # non-monotonic should raise. with pytest.raises(ValueError): indices._searchsorted_monotonic(value, side='left') + + def test_map(self): + index = self.create_index() + # From output of UInt64Index mapping can't infer that we + # shouldn't default to Int64 + if isinstance(index, UInt64Index): + expected = Index(index.values.tolist()) + else: + expected = index + + tm.assert_index_equal(index.map(lambda x: x), expected) + + identity_dict = {x: x for x in index} + tm.assert_index_equal(index.map(identity_dict), expected) + + # Use values to work around MultiIndex instantiation of series + identity_series = Series(expected.values, index=index) + tm.assert_index_equal(index.map(identity_series), expected) + + # empty mappable + nan_index = pd.Index([np.nan] * len(index)) + series_map = pd.Series() + tm.assert_index_equal(index.map(series_map), nan_index) + + dict_map = {} + tm.assert_index_equal(index.map(dict_map), nan_index) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 12b509d4aef3f..3b58181a8bbf9 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -1,5 +1,5 @@ """ generic datetimelike tests """ - +import pandas as pd from .common import Base import pandas.util.testing as tm @@ -38,3 +38,24 @@ def test_view(self, indices): i_view = i.view(self._holder) result = self._holder(i) tm.assert_index_equal(result, i_view) + + def test_map(self): + expected = self.index + 1 + tm.assert_index_equal(self.index.map(lambda x: x + 1), expected) + + series_map = pd.Series(expected, self.index) + tm.assert_index_equal(self.index.map(series_map), expected) + + dict_map = {i: e for e, i in zip(expected, self.index)} + tm.assert_index_equal(self.index.map(dict_map), expected) + + # map to NaT + result = self.index.map(lambda x: pd.NaT if x == self.index[0] else x) + expected = pd.Index([pd.NaT] + self.index[1:].tolist()) + tm.assert_index_equal(result, expected) + + series_map = pd.Series(expected, self.index) + tm.assert_index_equal(self.index.map(series_map), expected) + + dict_map = {i: e for e, i in zip(expected, self.index)} + tm.assert_index_equal(self.index.map(dict_map), expected) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index f98cfb34aa04e..a9e015eddf98c 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -78,27 +78,4 @@ def test_union(self): assert tm.equalContents(result, everything) def test_map(self): - expected = self.index + 1 - tm.assert_index_equal(self.index.map(lambda x: x + 1), expected) - - series_map = pd.Series(expected, self.index) - tm.assert_index_equal(self.index.map(series_map), expected) - - dict_map = {i: e for e, i in zip(expected, self.index)} - tm.assert_index_equal(self.index.map(dict_map), expected) - - # empty mappable - nan_index = Index([pd.np.nan] * len(self.index)) - series_map = pd.Series() - tm.assert_index_equal(self.index.map(series_map), nan_index) - dict_map = {} - tm.assert_index_equal(self.index.map(dict_map), nan_index) - - # map to NaT - result = self.index.map(lambda x: pd.NaT if x == self.index[0] else x) - expected = Index([pd.NaT] + self.index[1:].tolist()) - tm.assert_index_equal(result, expected) - series_map = pd.Series(expected, self.index) - tm.assert_index_equal(self.index.map(series_map), expected) - dict_map = {i: e for e, i in zip(expected, self.index)} - tm.assert_index_equal(self.index.map(dict_map), expected) + super(TestDatetimeIndex, self).test_map() diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 3a6613b612b17..e953029884243 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -788,7 +788,9 @@ def test_pickle_freq(self): assert new_prng.freq == offsets.MonthEnd() assert new_prng.freqstr == 'M' - def test_map_with_ordinal(self): + def test_map(self): + super(TestPeriodIndex, self).test_map() + index = PeriodIndex([2005, 2007, 2009], freq='A') result = index.map(lambda x: x.ordinal) exp = Index([x.ordinal for x in index]) diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index b55bab3a210cc..dee1d2bd1d005 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -395,6 +395,10 @@ def test_repr_max_seq_item_setting(self): def test_repr_roundtrip(self): super(TestIntervalIndex, self).test_repr_roundtrip() + @pytest.mark.xfail(reason='get_indexer behavior does not currently work') + def test_map(self): + super(TestIntervalIndex, self).test_map() + def test_get_item(self): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed='right') diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 2683110f2f02e..c63dbb091445a 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -300,6 +300,7 @@ def test_misc_coverage(self): assert not idx.equals(list(non_td)) def test_map(self): + super(TestTimedeltaIndex, self).test_map() rng = timedelta_range('1 day', periods=10) From 68479a37202d273eb513067d17c83554bf9a9397 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sun, 21 May 2017 16:08:12 -0700 Subject: [PATCH 11/20] Fix issues from merge --- pandas/core/series.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 20ce812173372..51e738a0fa759 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -44,7 +44,6 @@ _maybe_match_name, SettingWithCopyError, _maybe_box_datetimelike, - _dict_compat, standardize_mapping, _any_none) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, From 80a4c9cd11814cb0ca2fc09a6197bd17ed07b2a1 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Mon, 30 Oct 2017 21:21:59 -0700 Subject: [PATCH 12/20] Fix issues from merge --- pandas/core/base.py | 7 ++++--- pandas/core/indexes/base.py | 4 ++-- pandas/tests/indexes/test_base.py | 3 +++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index a305b1ae16107..e4fa3c900c4be 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -12,7 +12,8 @@ is_object_dtype, is_list_like, is_scalar, - is_datetimelike) + is_datetimelike, + is_extension_type) from pandas.util._validators import validate_bool_kwarg @@ -854,7 +855,7 @@ def _map_values(self, values, arg, na_action=None): Returns ------- - applied : {Index, MultiIndex}, inferred + applied : Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. @@ -868,7 +869,7 @@ def _map_values(self, values, arg, na_action=None): if na_action == 'ignore': def map_f(values, f): return lib.map_infer_mask(values, f, - isnull(values).view(np.uint8)) + isna(values).view(np.uint8)) else: map_f = lib.map_infer diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b0650b43e2552..42f23c5e5c0c2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2892,7 +2892,7 @@ def map(self, arg, na_action=None): Returns ------- - applied : {Index, MultiIndex}, inferred + applied : Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. @@ -2914,7 +2914,7 @@ def map(self, arg, na_action=None): names=names) attributes['copy'] = False - return Index(new_values, **attributes) + return self._constructor(new_values, **attributes) def isin(self, values, level=None): """ diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 720a48af65676..2275b195a0d9a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -842,6 +842,9 @@ def test_map_with_dict_and_series(self): if name == 'catIndex': # Tested in test_categorical continue + elif name == 'repeats': + # Cannot map duplicated index + continue cur_index = self.indices[name] expected = Index(np.arange(len(cur_index), 0, -1)) From 30e7e7a457c7059a2f02332f0906f678eeaadd1f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 1 Nov 2017 08:20:48 -0400 Subject: [PATCH 13/20] replace _constructor --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 42f23c5e5c0c2..276f2737966fd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2914,7 +2914,7 @@ def map(self, arg, na_action=None): names=names) attributes['copy'] = False - return self._constructor(new_values, **attributes) + return Index(new_values, **attributes) def isin(self, values, level=None): """ From 51c5b2b4eab2bc2f770379605c9d342eae9333c7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 19 Nov 2017 14:16:42 -0500 Subject: [PATCH 14/20] move whatsnew --- doc/source/whatsnew/v0.20.0.txt | 1 - doc/source/whatsnew/v0.22.0.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 60d050c8bb99b..fc869956c820e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -519,7 +519,6 @@ Other Enhancements - ``parallel_coordinates()`` has gained a ``sort_labels`` keyword argument that sorts class labels and the colors assigned to them (:issue:`15908`) - Options added to allow one to turn on/off using ``bottleneck`` and ``numexpr``, see :ref:`here ` (:issue:`16157`) - ``DataFrame.style.bar()`` now accepts two more options to further customize the bar chart. Bar alignment is set with ``align='left'|'mid'|'zero'``, the default is "left", which is backward compatible; You can now pass a list of ``color=[color_negative, color_positive]``. (:issue:`14757`) -- ``Index.map`` can now accept series and dictionary input object (:issue:`12756`). .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 6dc730cae37f7..81bc90e5dd9e8 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -48,6 +48,7 @@ Other API Changes - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) - `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) +- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`). .. _whatsnew_0220.deprecations: From 8f0198ec23197559163ba53ad5b81fdc8a3033b1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 19 Nov 2017 14:37:57 -0500 Subject: [PATCH 15/20] unify _map_values a bit more --- pandas/core/base.py | 7 ++++--- pandas/core/indexes/base.py | 2 +- pandas/core/series.py | 6 +----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9589a63977287..efc07c2adf237 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -839,14 +839,12 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, klass=self.__class__.__name__, op=name)) return func(**kwds) - def _map_values(self, values, arg, na_action=None): + def _map_values(self, arg, na_action=None): """An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- - values : np.ndarray - The values to be mapped arg : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} @@ -862,10 +860,13 @@ def _map_values(self, values, arg, na_action=None): """ if is_extension_type(self.dtype): + values = self._values if na_action is not None: raise NotImplementedError map_f = lambda values, f: values.map(f) else: + values = self.astype(object) + values = getattr(values, 'values', values) if na_action == 'ignore': def map_f(values, f): return lib.map_infer_mask(values, f, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a93d2742a5e42..00c17285f61ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2901,7 +2901,7 @@ def map(self, arg, na_action=None): from .multi import MultiIndex new_values = super(Index, self)._map_values( - self.values, arg, na_action=na_action) + arg, na_action=na_action) attributes = self._get_attributes_dict() if new_values.size and isinstance(new_values[0], tuple): if isinstance(self, MultiIndex): diff --git a/pandas/core/series.py b/pandas/core/series.py index a8521ab36ce60..431bfa404ff08 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2322,12 +2322,8 @@ def map(self, arg, na_action=None): 3 0 dtype: int64 """ - if is_extension_type(self.dtype): - input_values = self._values - else: - input_values = self.asobject new_values = super(Series, self)._map_values( - input_values, arg, na_action=na_action) + arg, na_action=na_action) return self._constructor(new_values, index=self.index).__finalize__(self) From 80fad281df45030596eab9ae6ca6a352e8695780 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 19 Nov 2017 14:46:27 -0500 Subject: [PATCH 16/20] use an efficient path for mapping --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/base.py | 43 +++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 81bc90e5dd9e8..0c33202539e98 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -78,7 +78,7 @@ Performance Improvements - Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) - :class`DateOffset` arithmetic performance is improved (:issue:`18218`) - Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) -- +- Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`) .. _whatsnew_0220.docs: diff --git a/pandas/core/base.py b/pandas/core/base.py index efc07c2adf237..8776b599386f8 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -859,21 +859,10 @@ def _map_values(self, arg, na_action=None): a MultiIndex will be returned. """ - if is_extension_type(self.dtype): - values = self._values - if na_action is not None: - raise NotImplementedError - map_f = lambda values, f: values.map(f) - else: - values = self.astype(object) - values = getattr(values, 'values', values) - if na_action == 'ignore': - def map_f(values, f): - return lib.map_infer_mask(values, f, - isna(values).view(np.uint8)) - else: - map_f = lib.map_infer + # we can fastpath dict/Series to an efficient map + # as we know that we are not going to have to yield + # python types if isinstance(arg, dict): if hasattr(arg, '__missing__'): # If a dictionary subclass defines a default value method, @@ -889,11 +878,33 @@ def map_f(values, f): if isinstance(arg, ABCSeries): # Since values were input this means we came from either # a dict or a series and arg should be an index + if is_extension_type(self.dtype): + values = self._values + else: + values = self.values + indexer = arg.index.get_indexer(values) new_values = algorithms.take_1d(arg._values, indexer) + return new_values + + # we must convert to python types + if is_extension_type(self.dtype): + values = self._values + if na_action is not None: + raise NotImplementedError + map_f = lambda values, f: values.map(f) else: - # arg is a function - new_values = map_f(values, arg) + values = self.astype(object) + values = getattr(values, 'values', values) + if na_action == 'ignore': + def map_f(values, f): + return lib.map_infer_mask(values, f, + isna(values).view(np.uint8)) + else: + map_f = lib.map_infer + + # arg is a function + new_values = map_f(values, arg) return new_values From 5006b244462b217dacefb37d7653b4ce94d5528f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 19 Nov 2017 18:05:06 -0500 Subject: [PATCH 17/20] restore name of mapper in .map --- pandas/core/indexes/base.py | 6 +++--- pandas/core/series.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 00c17285f61ff..6ffa9b3510ae9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2879,13 +2879,13 @@ def groupby(self, values): return result - def map(self, arg, na_action=None): + def map(self, mapper, na_action=None): """Map values of Series using input correspondence (which can be a dict, Series, or function) Parameters ---------- - arg : function, dict, or Series + mapper : function, dict, or Series na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function @@ -2901,7 +2901,7 @@ def map(self, arg, na_action=None): from .multi import MultiIndex new_values = super(Index, self)._map_values( - arg, na_action=na_action) + mapper, na_action=na_action) attributes = self._get_attributes_dict() if new_values.size and isinstance(new_values[0], tuple): if isinstance(self, MultiIndex): diff --git a/pandas/core/series.py b/pandas/core/series.py index 431bfa404ff08..b99b981d1f9be 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2230,14 +2230,14 @@ def unstack(self, level=-1, fill_value=None): # ---------------------------------------------------------------------- # function application - def map(self, arg, na_action=None): + def map(self, mapper, na_action=None): """ Map values of Series using input correspondence (which can be a dict, Series, or function) Parameters ---------- - arg : function, dict, or Series + mapper : function, dict, or Series na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function @@ -2270,7 +2270,7 @@ def map(self, arg, na_action=None): two bar three baz - If `arg` is a dictionary, return a new Series with values converted + If `mapper` is a dictionary, return a new Series with values converted according to the dictionary's mapping: >>> z = {1: 'A', 2: 'B', 3: 'C'} @@ -2307,7 +2307,7 @@ def map(self, arg, na_action=None): Notes ----- - When `arg` is a dictionary, values in Series that are not in the + When `mapper` is a dictionary, values in Series that are not in the dictionary (as keys) are converted to ``NaN``. However, if the dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. provides a method for default values), then this default is used @@ -2323,7 +2323,7 @@ def map(self, arg, na_action=None): dtype: int64 """ new_values = super(Series, self)._map_values( - arg, na_action=na_action) + mapper, na_action=na_action) return self._constructor(new_values, index=self.index).__finalize__(self) From 41346417b49901eb0a5cca38924ad880c13bd663 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 19 Nov 2017 18:11:26 -0500 Subject: [PATCH 18/20] more renaming --- pandas/core/base.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 8776b599386f8..c2018900645ec 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -839,13 +839,13 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, klass=self.__class__.__name__, op=name)) return func(**kwds) - def _map_values(self, arg, na_action=None): + def _map_values(self, mapper, na_action=None): """An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- - arg : function, dict, or Series + mapper : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the @@ -863,28 +863,28 @@ def _map_values(self, arg, na_action=None): # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types - if isinstance(arg, dict): - if hasattr(arg, '__missing__'): + if isinstance(mapper, dict): + if hasattr(mapper, '__missing__'): # If a dictionary subclass defines a default value method, - # convert arg to a lookup function (GH #15999). - dict_with_default = arg - arg = lambda x: dict_with_default[x] + # convert mapper to a lookup function (GH #15999). + dict_with_default = mapper + mapper = lambda x: dict_with_default[x] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. from pandas import Series - arg = Series(arg, index=arg.keys()) + mapper = Series(mapper, index=mapper.keys()) - if isinstance(arg, ABCSeries): + if isinstance(mapper, ABCSeries): # Since values were input this means we came from either - # a dict or a series and arg should be an index + # a dict or a series and mapper should be an index if is_extension_type(self.dtype): values = self._values else: values = self.values - indexer = arg.index.get_indexer(values) - new_values = algorithms.take_1d(arg._values, indexer) + indexer = mapper.index.get_indexer(values) + new_values = algorithms.take_1d(mapper._values, indexer) return new_values # we must convert to python types @@ -903,8 +903,8 @@ def map_f(values, f): else: map_f = lib.map_infer - # arg is a function - new_values = map_f(values, arg) + # mapper is a function + new_values = map_f(values, mapper) return new_values From c0f3b767cab71505c779905c298533b6c5089a5e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 24 Nov 2017 18:06:53 -0500 Subject: [PATCH 19/20] review comments --- doc/source/whatsnew/v0.22.0.txt | 3 -- pandas/core/base.py | 2 ++ pandas/core/indexes/base.py | 6 ++-- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/series.py | 10 +++---- pandas/tests/indexes/datetimelike.py | 41 +++++++++++++++++++-------- pandas/tests/indexes/test_category.py | 1 - pandas/tests/series/test_apply.py | 1 + pandas/tseries/tests/test_period.py | 0 9 files changed, 42 insertions(+), 24 deletions(-) delete mode 100644 pandas/tseries/tests/test_period.py diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 3aa6a03e5ef20..f97b958d553e0 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -109,14 +109,11 @@ Performance Improvements - Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) - :class`DateOffset` arithmetic performance is improved (:issue:`18218`) - Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) -<<<<<<< HEAD - Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`) -======= - The overriden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) - Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) - ->>>>>>> master .. _whatsnew_0220.docs: diff --git a/pandas/core/base.py b/pandas/core/base.py index c2018900645ec..2820e7287ad6c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -872,6 +872,8 @@ def _map_values(self, mapper, na_action=None): else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. + # we specify the keys here to handle the + # possibility that they are tuples from pandas import Series mapper = Series(mapper, index=mapper.keys()) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bdf959910a51c..8a751f0204b60 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2886,8 +2886,7 @@ def groupby(self, values): return result def map(self, mapper, na_action=None): - """Map values of Series using input correspondence (which can be a - dict, Series, or function) + """Map values of Series using input correspondence Parameters ---------- @@ -2920,6 +2919,9 @@ def map(self, mapper, na_action=None): names=names) attributes['copy'] = False + + # we infer the result types based on the + # returned values return Index(new_values, **attributes) def isin(self, values, level=None): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5b3e7f0e7fbfd..5643d886a4fec 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -136,7 +136,7 @@ def equals(self, other): elif not isinstance(other, type(self)): try: other = type(self)(other) - except: + except Exception: return False if not is_dtype_equal(self.dtype, other.dtype): diff --git a/pandas/core/series.py b/pandas/core/series.py index 932367692090f..ff0c8ac34eea5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2231,14 +2231,14 @@ def unstack(self, level=-1, fill_value=None): # ---------------------------------------------------------------------- # function application - def map(self, mapper, na_action=None): + def map(self, arg, na_action=None): """ Map values of Series using input correspondence (which can be a dict, Series, or function) Parameters ---------- - mapper : function, dict, or Series + arg : function, dict, or Series na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function @@ -2271,7 +2271,7 @@ def map(self, mapper, na_action=None): two bar three baz - If `mapper` is a dictionary, return a new Series with values converted + If `arg` is a dictionary, return a new Series with values converted according to the dictionary's mapping: >>> z = {1: 'A', 2: 'B', 3: 'C'} @@ -2308,7 +2308,7 @@ def map(self, mapper, na_action=None): Notes ----- - When `mapper` is a dictionary, values in Series that are not in the + When `arg` is a dictionary, values in Series that are not in the dictionary (as keys) are converted to ``NaN``. However, if the dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. provides a method for default values), then this default is used @@ -2324,7 +2324,7 @@ def map(self, mapper, na_action=None): dtype: int64 """ new_values = super(Series, self)._map_values( - mapper, na_action=na_action) + arg, na_action=na_action) return self._constructor(new_values, index=self.index).__finalize__(self) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 3b58181a8bbf9..839fccc1441e5 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -1,5 +1,7 @@ """ generic datetimelike tests """ +import pytest import pandas as pd +import numpy as np from .common import Base import pandas.util.testing as tm @@ -39,23 +41,38 @@ def test_view(self, indices): result = self._holder(i) tm.assert_index_equal(result, i_view) - def test_map(self): - expected = self.index + 1 - tm.assert_index_equal(self.index.map(lambda x: x + 1), expected) - - series_map = pd.Series(expected, self.index) - tm.assert_index_equal(self.index.map(series_map), expected) + def test_map_callable(self): - dict_map = {i: e for e, i in zip(expected, self.index)} - tm.assert_index_equal(self.index.map(dict_map), expected) + expected = self.index + 1 + result = self.index.map(lambda x: x + 1) + tm.assert_index_equal(result, expected) # map to NaT result = self.index.map(lambda x: pd.NaT if x == self.index[0] else x) expected = pd.Index([pd.NaT] + self.index[1:].tolist()) tm.assert_index_equal(result, expected) - series_map = pd.Series(expected, self.index) - tm.assert_index_equal(self.index.map(series_map), expected) + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index)]) + def test_map_dictlike(self, mapper): + expected = self.index + 1 + + # don't compare the freqs + if isinstance(expected, pd.DatetimeIndex): + expected.freq = None + + result = self.index.map(mapper(expected, self.index)) + tm.assert_index_equal(result, expected) + + expected = pd.Index([pd.NaT] + self.index[1:].tolist()) + result = self.index.map(mapper(expected, self.index)) + tm.assert_index_equal(result, expected) - dict_map = {i: e for e, i in zip(expected, self.index)} - tm.assert_index_equal(self.index.map(dict_map), expected) + # empty map; these map to np.nan because we cannot know + # to re-infer things + expected = pd.Index([np.nan] * len(self.index)) + result = self.index.map(mapper([], [])) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index ae4e86faaa6ac..92d5a53f6570b 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -269,7 +269,6 @@ def f(x): ordered=False) tm.assert_index_equal(result, exp) -<<<<<<< HEAD result = ci.map(pd.Series([10, 20, 30], index=['A', 'B', 'C'])) tm.assert_index_equal(result, exp) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index d0693984689a6..fe21ba569ae99 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -424,6 +424,7 @@ def test_map_dict_with_tuple_keys(self): """ df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} + df['labels'] = df['a'].map(label_mappings) df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index) # All labels should be filled now diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From dd0b7e9f07e38abe5a3a21ba81f825256a1c209e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 24 Nov 2017 18:40:56 -0500 Subject: [PATCH 20/20] handle empty maps --- pandas/core/base.py | 1 + pandas/core/dtypes/cast.py | 35 +++++++++++++++++++ pandas/core/dtypes/missing.py | 7 ++-- pandas/core/series.py | 27 ++++---------- pandas/tests/indexes/common.py | 1 + .../indexes/datetimes/test_datetimelike.py | 3 -- pandas/tests/indexes/period/test_period.py | 2 +- pandas/tests/indexes/test_base.py | 18 +++++----- .../indexes/timedeltas/test_timedelta.py | 2 +- 9 files changed, 61 insertions(+), 35 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 2820e7287ad6c..cce0f384cb983 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -887,6 +887,7 @@ def _map_values(self, mapper, na_action=None): indexer = mapper.index.get_indexer(values) new_values = algorithms.take_1d(mapper._values, indexer) + return new_values # we must convert to python types diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index bc8aacfe90170..a97b84ab9cc5b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1127,3 +1127,38 @@ def cast_scalar_to_array(shape, value, dtype=None): values.fill(fill_value) return values + + +def construct_1d_arraylike_from_scalar(value, length, dtype): + """ + create a np.ndarray / pandas type of specified shape and dtype + filled with values + + Parameters + ---------- + value : scalar value + length : int + dtype : pandas_dtype / np.dtype + + Returns + ------- + np.ndarray / pandas type of length, filled with value + + """ + if is_datetimetz(dtype): + from pandas import DatetimeIndex + subarr = DatetimeIndex([value] * length, dtype=dtype) + elif is_categorical_dtype(dtype): + from pandas import Categorical + subarr = Categorical([value] * length) + else: + if not isinstance(dtype, (np.dtype, type(np.dtype))): + dtype = dtype.dtype + + # coerce if we have nan for an integer dtype + if is_integer_dtype(dtype) and isna(value): + dtype = np.float64 + subarr = np.empty(length, dtype=dtype) + subarr.fill(value) + + return subarr diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 7cae536c5edd9..ce57b544d9d66 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -369,13 +369,14 @@ def _maybe_fill(arr, fill_value=np.nan): return arr -def na_value_for_dtype(dtype): +def na_value_for_dtype(dtype, compat=True): """ Return a dtype compat na value Parameters ---------- dtype : string / dtype + compat : boolean, default True Returns ------- @@ -389,7 +390,9 @@ def na_value_for_dtype(dtype): elif is_float_dtype(dtype): return np.nan elif is_integer_dtype(dtype): - return 0 + if compat: + return 0 + return np.nan elif is_bool_dtype(dtype): return False return np.nan diff --git a/pandas/core/series.py b/pandas/core/series.py index ff0c8ac34eea5..bff7c21ad69b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -18,7 +18,7 @@ is_bool, is_integer, is_integer_dtype, is_float_dtype, - is_extension_type, is_datetimetz, + is_extension_type, is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, @@ -34,7 +34,8 @@ from pandas.core.dtypes.cast import ( maybe_upcast, infer_dtype_from_scalar, maybe_convert_platform, - maybe_cast_to_datetime, maybe_castable) + maybe_cast_to_datetime, maybe_castable, + construct_1d_arraylike_from_scalar) from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike from pandas.core.common import (is_bool_indexer, @@ -3200,21 +3201,6 @@ def _try_cast(arr, take_fast_path): else: subarr = _try_cast(data, False) - def create_from_value(value, index, dtype): - # return a new empty value suitable for the dtype - - if is_datetimetz(dtype): - subarr = DatetimeIndex([value] * len(index), dtype=dtype) - elif is_categorical_dtype(dtype): - subarr = Categorical([value] * len(index)) - else: - if not isinstance(dtype, (np.dtype, type(np.dtype))): - dtype = dtype.dtype - subarr = np.empty(len(index), dtype=dtype) - subarr.fill(value) - - return subarr - # scalar like, GH if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover @@ -3229,7 +3215,8 @@ def create_from_value(value, index, dtype): # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) - subarr = create_from_value(value, index, dtype) + subarr = construct_1d_arraylike_from_scalar( + value, len(index), dtype) else: return subarr.item() @@ -3240,8 +3227,8 @@ def create_from_value(value, index, dtype): # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: - subarr = create_from_value(subarr[0], index, - subarr.dtype) + subarr = construct_1d_arraylike_from_scalar( + subarr[0], len(index), subarr.dtype) elif subarr.ndim > 1: if isinstance(data, np.ndarray): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 2fe1ee3ef97e5..ee6434431bcfc 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1008,6 +1008,7 @@ def test_searchsorted_monotonic(self, indices): def test_map(self): index = self.create_index() + # From output of UInt64Index mapping can't infer that we # shouldn't default to Int64 if isinstance(index, UInt64Index): diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index a9e015eddf98c..538e10e6011ec 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -76,6 +76,3 @@ def test_union(self): for case in cases: result = first.union(case) assert tm.equalContents(result, everything) - - def test_map(self): - super(TestDatetimeIndex, self).test_map() diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 978e6612caaad..9d5746e07814e 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -683,7 +683,7 @@ def test_pickle_freq(self): assert new_prng.freqstr == 'M' def test_map(self): - super(TestPeriodIndex, self).test_map() + # test_map_dictlike generally tests index = PeriodIndex([2005, 2007, 2009], freq='A') result = index.map(lambda x: x.ordinal) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 86fdb89a099d7..f5016e6d19a57 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -864,17 +864,19 @@ def test_map_with_dict_and_series(self): cur_index = self.indices[name] expected = Index(np.arange(len(cur_index), 0, -1)) mapper = pd.Series(expected, index=cur_index) - tm.assert_index_equal(expected, cur_index.map(mapper)) + result = cur_index.map(mapper) + + tm.assert_index_equal(result, expected) - mapper = {o: n for o, n in - zip(cur_index, expected)} # If the mapper is empty the expected index type is Int64Index # but the output defaults to Float64 so I treat it independently - if mapper: - tm.assert_index_equal(expected, cur_index.map(mapper)) - else: - tm.assert_index_equal(Float64Index([]), - cur_index.map(mapper)) + mapper = {o: n for o, n in + zip(cur_index, expected)} + + result = cur_index.map(mapper) + if not mapper: + expected = Float64Index([]) + tm.assert_index_equal(result, expected) def test_map_with_non_function_missing_values(self): # GH 12756 diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index a5fdc3819c5b0..e25384ebf7d62 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -187,7 +187,7 @@ def test_misc_coverage(self): assert not idx.equals(list(non_td)) def test_map(self): - super(TestTimedeltaIndex, self).test_map() + # test_map_dictlike generally tests rng = timedelta_range('1 day', periods=10)