diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 742077d39fb18..02ef2bbed19b6 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -73,6 +73,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + - Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`) @@ -100,3 +101,6 @@ Bug Fixes - Bug that caused segfault when resampling an empty Series (:issue:`10228`) +- Bug in ``DatetimeIndex.value_counts`` and ``PeriodIndex.value_counts`` where the result lost its name while the result's ``Index`` retained it. (:issue:`10150`) + + diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c97d459fb96df..c958a70b43089 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -202,6 +202,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, from pandas.tools.tile import cut from pandas.tseries.period import PeriodIndex + name = getattr(values, 'name', None) values = Series(values).values if bins is not None: @@ -222,7 +223,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if com.is_datetime_or_timedelta_dtype(dtype) or is_period: if is_period: - values = PeriodIndex(values) + values = PeriodIndex(values, name=name) values = values.view(np.int64) keys, counts = htable.value_count_int64(values) @@ -247,7 +248,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) - result = Series(counts, index=com._values_from_object(keys)) + result = Series(counts, index=com._values_from_object(keys), name=name) if bins is not None: # TODO: This next line should be more efficient diff --git a/pandas/core/base.py b/pandas/core/base.py index 540b900844a9e..c3004aec60cc5 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -431,10 +431,10 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if isinstance(self, PeriodIndex): # 
preserve freq - result.index = self._simple_new(result.index.values, self.name, + result.index = self._simple_new(result.index.values, freq=self.freq) elif isinstance(self, DatetimeIndex): - result.index = self._simple_new(result.index.values, self.name, + result.index = self._simple_new(result.index.values, tz=getattr(self, 'tz', None)) return result diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index e9526f9fad1ac..cd60bafdd30cf 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -181,23 +181,24 @@ def f(): class Ops(tm.TestCase): def setUp(self): - self.bool_index = tm.makeBoolIndex(10) - self.int_index = tm.makeIntIndex(10) - self.float_index = tm.makeFloatIndex(10) - self.dt_index = tm.makeDateIndex(10) - self.dt_tz_index = tm.makeDateIndex(10).tz_localize(tz='US/Eastern') - self.period_index = tm.makePeriodIndex(10) - self.string_index = tm.makeStringIndex(10) + self.bool_index = tm.makeBoolIndex(10, name='a') + self.int_index = tm.makeIntIndex(10, name='a') + self.float_index = tm.makeFloatIndex(10, name='a') + self.dt_index = tm.makeDateIndex(10, name='a') + self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize(tz='US/Eastern') + self.period_index = tm.makePeriodIndex(10, name='a') + self.string_index = tm.makeStringIndex(10, name='a') + self.unicode_index = tm.makeUnicodeIndex(10, name='a') arr = np.random.randn(10) - self.int_series = Series(arr, index=self.int_index) - self.float_series = Series(arr, index=self.float_index) - self.dt_series = Series(arr, index=self.dt_index) + self.int_series = Series(arr, index=self.int_index, name='a') + self.float_series = Series(arr, index=self.float_index, name='a') + self.dt_series = Series(arr, index=self.dt_index, name='a') self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True) - self.period_series = Series(arr, index=self.period_index) - self.string_series = Series(arr, index=self.string_index) + self.period_series = Series(arr, index=self.period_index, 
name='a') + self.string_series = Series(arr, index=self.string_index, name='a') - types = ['bool','int','float','dt', 'dt_tz', 'period','string'] + types = ['bool','int','float','dt', 'dt_tz', 'period','string', 'unicode'] fmts = [ "{0}_{1}".format(t,f) for t in types for f in ['index','series'] ] self.objs = [ getattr(self,f) for f in fmts if getattr(self,f,None) is not None ] @@ -213,9 +214,9 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): try: if isinstance(o, Series): - expected = Series(getattr(o.index,op),index=o.index) + expected = Series(getattr(o.index,op), index=o.index, name='a') else: - expected = getattr(o,op) + expected = getattr(o, op) except (AttributeError): if ignore_failures: continue @@ -361,21 +362,28 @@ def test_value_counts_unique_nunique(self): # create repeated values, 'n'th element is repeated by n+1 times if isinstance(o, PeriodIndex): # freq must be specified because repeat makes freq ambiguous - expected_index = o[::-1] - o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq) + + # resets name from Index + expected_index = pd.Index(o[::-1], name=None) + + # attach name to klass + o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq, name='a') # don't test boolean elif isinstance(o,Index) and o.is_boolean(): continue elif isinstance(o, Index): - expected_index = values[::-1] - o = klass(np.repeat(values, range(1, len(o) + 1))) + expected_index = pd.Index(values[::-1], name=None) + o = klass(np.repeat(values, range(1, len(o) + 1)), name='a') else: - expected_index = values[::-1] + expected_index = pd.Index(values[::-1], name=None) idx = np.repeat(o.index.values, range(1, len(o) + 1)) - o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx) + o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx, name='a') - expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64') - tm.assert_series_equal(o.value_counts(), expected_s) + expected_s = Series(range(10, 0, -1), 
index=expected_index, dtype='int64', name='a') + result = o.value_counts() + tm.assert_series_equal(result, expected_s) + self.assertTrue(result.index.name is None) + self.assertEqual(result.name, 'a') result = o.unique() if isinstance(o, (DatetimeIndex, PeriodIndex)): @@ -410,21 +418,34 @@ def test_value_counts_unique_nunique(self): # create repeated values, 'n'th element is repeated by n+1 times if isinstance(o, PeriodIndex): # freq must be specified because repeat makes freq ambiguous - expected_index = o - o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq) + + # resets name from Index + expected_index = pd.Index(o, name=None) + # attach name to klass + o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq, name='a') elif isinstance(o, Index): - expected_index = values - o = klass(np.repeat(values, range(1, len(o) + 1))) + expected_index = pd.Index(values, name=None) + o = klass(np.repeat(values, range(1, len(o) + 1)), name='a') else: - expected_index = values + expected_index = pd.Index(values, name=None) idx = np.repeat(o.index.values, range(1, len(o) + 1)) - o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx) - - expected_s_na = Series(list(range(10, 2, -1)) +[3], index=expected_index[9:0:-1], dtype='int64') - expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64') - - tm.assert_series_equal(o.value_counts(dropna=False), expected_s_na) + o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx, name='a') + + expected_s_na = Series(list(range(10, 2, -1)) +[3], + index=expected_index[9:0:-1], + dtype='int64', name='a') + expected_s = Series(list(range(10, 2, -1)), + index=expected_index[9:1:-1], + dtype='int64', name='a') + + result_s_na = o.value_counts(dropna=False) + tm.assert_series_equal(result_s_na, expected_s_na) + self.assertTrue(result_s_na.index.name is None) + self.assertEqual(result_s_na.name, 'a') + result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) + 
self.assertTrue(result_s.index.name is None) + self.assertEqual(result_s.name, 'a') # numpy_array_equal cannot compare arrays includes nan result = o.unique() @@ -508,14 +529,15 @@ def test_value_counts_inferred(self): df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) - s = klass(df['dt'].copy()) + s = klass(df['dt'].copy(), name='dt') - idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X']) - expected_s = Series([3, 2, 1], index=idx) + idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', + '2009-01-01 00:00:00X']) + expected_s = Series([3, 2, 1], index=idx, name='dt') tm.assert_series_equal(s.value_counts(), expected_s) - expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'], - dtype='datetime64[ns]') + expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', + '2008-09-09 00:00:00Z'], dtype='datetime64[ns]') if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assertTrue(s.unique().equals(expected)) @@ -526,7 +548,7 @@ def test_value_counts_inferred(self): # with NaT s = df['dt'].copy() - s = klass([v for v in s.values] + [pd.NaT]) + s = klass([v for v in s.values] + [pd.NaT], name='dt') result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') @@ -547,10 +569,10 @@ def test_value_counts_inferred(self): # timedelta64[ns] td = df.dt - df.dt + timedelta(1) - td = klass(td) + td = klass(td, name='dt') result = td.value_counts() - expected_s = Series([6], index=[Timedelta('1day')]) + expected_s = Series([6], index=[Timedelta('1day')], name='dt') tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days']) @@ -560,9 +582,8 @@ def test_value_counts_inferred(self): self.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) - td2 = klass(td2) + td2 = klass(td2, name='dt') result2 = td2.value_counts() - 
tm.assert_series_equal(result2, expected_s) def test_factorize(self): @@ -629,7 +650,7 @@ def test_duplicated_drop_duplicates(self): # special case if original.is_boolean(): result = original.drop_duplicates() - expected = Index([False,True]) + expected = Index([False,True], name='a') tm.assert_index_equal(result, expected) continue @@ -668,7 +689,8 @@ def test_duplicated_drop_duplicates(self): idx.drop_duplicates(inplace=True) else: - expected = Series([False] * len(original), index=original.index) + expected = Series([False] * len(original), + index=original.index, name='a') tm.assert_series_equal(original.duplicated(), expected) result = original.drop_duplicates() tm.assert_series_equal(result, original) @@ -676,17 +698,17 @@ def test_duplicated_drop_duplicates(self): idx = original.index[list(range(len(original))) + [5, 3]] values = original.values[list(range(len(original))) + [5, 3]] - s = Series(values, index=idx) + s = Series(values, index=idx, name='a') - expected = Series([False] * len(original) + [True, True], index=idx) + expected = Series([False] * len(original) + [True, True], + index=idx, name='a') tm.assert_series_equal(s.duplicated(), expected) tm.assert_series_equal(s.drop_duplicates(), original) last_base = [False] * len(idx) last_base[3] = True last_base[5] = True - expected = Series(last_base, index=idx) - expected + expected = Series(last_base, index=idx, name='a') tm.assert_series_equal(s.duplicated(take_last=True), expected) tm.assert_series_equal(s.drop_duplicates(take_last=True), s[~np.array(last_base)]) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 83d6b97788e91..04e868a4a0819 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -817,43 +817,43 @@ def getArangeMat(): # make index -def makeStringIndex(k=10): - return Index(rands_array(nchars=10, size=k)) +def makeStringIndex(k=10, name=None): + return Index(rands_array(nchars=10, size=k), name=name) -def makeUnicodeIndex(k=10): +def makeUnicodeIndex(k=10, 
name=None): return Index(randu_array(nchars=10, size=k)) -def makeCategoricalIndex(k=10, n=3): +def makeCategoricalIndex(k=10, n=3, name=None): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) - return CategoricalIndex(np.random.choice(x,k)) + return CategoricalIndex(np.random.choice(x,k), name=name) -def makeBoolIndex(k=10): +def makeBoolIndex(k=10, name=None): if k == 1: - return Index([True]) + return Index([True], name=name) elif k == 2: - return Index([False,True]) - return Index([False,True] + [False]*(k-2)) + return Index([False,True], name=name) + return Index([False,True] + [False]*(k-2), name=name) -def makeIntIndex(k=10): - return Index(lrange(k)) +def makeIntIndex(k=10, name=None): + return Index(lrange(k), name=name) -def makeFloatIndex(k=10): +def makeFloatIndex(k=10, name=None): values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) - return Index(values * (10 ** np.random.randint(0, 9))) + return Index(values * (10 ** np.random.randint(0, 9)), name=name) -def makeDateIndex(k=10, freq='B'): +def makeDateIndex(k=10, freq='B', name=None): dt = datetime(2000, 1, 1) - dr = bdate_range(dt, periods=k, freq=freq) - return DatetimeIndex(dr) + dr = bdate_range(dt, periods=k, freq=freq, name=name) + return DatetimeIndex(dr, name=name) -def makeTimedeltaIndex(k=10, freq='D'): - return TimedeltaIndex(start='1 day',periods=k,freq=freq) +def makeTimedeltaIndex(k=10, freq='D', name=None): + return TimedeltaIndex(start='1 day', periods=k, freq=freq, name=name) -def makePeriodIndex(k=10): +def makePeriodIndex(k=10, name=None): dt = datetime(2000, 1, 1) - dr = PeriodIndex(start=dt, periods=k, freq='B') + dr = PeriodIndex(start=dt, periods=k, freq='B', name=name) return dr def all_index_generator(k=10): @@ -885,21 +885,21 @@ def all_timeseries_index_generator(k=10): # make series -def makeFloatSeries(): +def makeFloatSeries(name=None): index = makeStringIndex(N) - return Series(randn(N), index=index) + return 
Series(randn(N), index=index, name=name) -def makeStringSeries(): +def makeStringSeries(name=None): index = makeStringIndex(N) - return Series(randn(N), index=index) + return Series(randn(N), index=index, name=name) -def makeObjectSeries(): +def makeObjectSeries(name=None): dateIndex = makeDateIndex(N) dateIndex = Index(dateIndex, dtype=object) index = makeStringIndex(N) - return Series(dateIndex, index=index) + return Series(dateIndex, index=index, name=name) def getSeriesData(): @@ -907,16 +907,16 @@ def getSeriesData(): return dict((c, Series(randn(N), index=index)) for c in getCols(K)) -def makeTimeSeries(nper=None, freq='B'): +def makeTimeSeries(nper=None, freq='B', name=None): if nper is None: nper = N - return Series(randn(nper), index=makeDateIndex(nper, freq=freq)) + return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name) -def makePeriodSeries(nper=None): +def makePeriodSeries(nper=None, name=None): if nper is None: nper = N - return Series(randn(nper), index=makePeriodIndex(nper)) + return Series(randn(nper), index=makePeriodIndex(nper), name=name) def getTimeSeriesData(nper=None, freq='B'):