From 99c95b4a6373f30d453199feebd2dce92a60733f Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Wed, 17 Dec 2014 19:02:19 -0500 Subject: [PATCH] overflow bug in multi-index when checking for duplicates --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.16.0.txt | 2 +- pandas/core/index.py | 59 ++++++++++++++++++++++-------- pandas/tests/test_base.py | 3 ++ pandas/tests/test_index.py | 63 +++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 16 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index d2f94c22f0335..b6fd14f425bd0 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1176,6 +1176,7 @@ Attributes Index.is_monotonic_increasing Index.is_monotonic_decreasing Index.is_unique + Index.has_duplicates Index.dtype Index.inferred_type Index.is_all_dates diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 74fcf2e1fbf06..8720774b821a2 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -50,4 +50,4 @@ Bug Fixes .. 
_whatsnew_0160.bug_fixes: - Fixed compatibility issue in ``DatetimeIndex`` affecting architectures where ``numpy.int_`` defaults to ``numpy.int32`` (:issue:`8943`) - +- Bug in ``MultiIndex.has_duplicates`` causing an indexer overflow when there are many levels (:issue:`9075`) diff --git a/pandas/core/index.py b/pandas/core/index.py index 0f682893490dd..d0253efb180f6 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3,7 +3,7 @@ import warnings import operator from functools import partial -from pandas.compat import range, zip, lrange, lzip, u, reduce +from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map from pandas import compat import numpy as np @@ -600,6 +600,10 @@ def is_unique(self): """ return if the index has unique values """ return self._engine.is_unique + @property + def has_duplicates(self): + return not self.is_unique + def is_boolean(self): return self.inferred_type in ['boolean'] @@ -3218,22 +3222,47 @@ def _has_complex_internals(self): # to disable groupby tricks return True - @property - def has_duplicates(self): - """ - Return True if there are no unique groups - """ - # has duplicates - shape = [len(lev) for lev in self.levels] - group_index = np.zeros(len(self), dtype='i8') - for i in range(len(shape)): - stride = np.prod([x for x in shape[i + 1:]], dtype='i8') - group_index += self.labels[i] * stride + @cache_readonly + def is_unique(self): + from pandas.hashtable import Int64HashTable - if len(np.unique(group_index)) < len(group_index): - return True + def _get_group_index(labels, shape): + from pandas.core.groupby import _int64_overflow_possible, \ + _compress_group_index - return False + # how many levels can be done without overflow + pred = lambda i: not _int64_overflow_possible(shape[:i]) + nlev = next(filter(pred, range(len(shape), 0, -1))) + + # compute group indices for the first `nlev` levels + group_index = labels[0].astype('i8', subok=False, copy=True) + stride = shape[0] + + for i in range(1, nlev): 
+ group_index += labels[i] * stride + stride *= shape[i] + + if nlev == len(shape): + return group_index + + comp_ids, obs_ids = _compress_group_index(group_index, sort=False) + + labels = [comp_ids] + labels[nlev:] + shape = [len(obs_ids)] + shape[nlev:] + + return _get_group_index(labels, shape) + + def _maybe_lift(lab, size): # promote nan values + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + shape = map(len, self.levels) + labels = map(_ensure_int64, self.labels) + + labels, shape = map(list, zip(*map(_maybe_lift, labels, shape))) + group_index = _get_group_index(labels, shape) + + table = Int64HashTable(min(1 << 20, len(group_index))) + return len(table.unique(group_index)) == len(self) def get_value(self, series, key): # somewhat broken encapsulation diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 615346f34b5bf..be5e102691fa0 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -620,6 +620,9 @@ def test_duplicated_drop_duplicates(self): tm.assert_index_equal(result, original) self.assertFalse(result is original) + # has_duplicates + self.assertFalse(original.has_duplicates) + # create repeated values, 3rd and 5th values are duplicated idx = original[list(range(len(original))) + [5, 3]] expected = Index([False] * len(original) + [True, True]) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 575c4e8cb4140..fd2e83e9609c5 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -3451,6 +3451,69 @@ def test_has_duplicates(self): [0, 1, 2, 0, 0, 1, 2]]) self.assertTrue(index.has_duplicates) + # GH 9075 + t = [(u'x', u'out', u'z', 5, u'y', u'in', u'z', 169), + (u'x', u'out', u'z', 7, u'y', u'in', u'z', 119), + (u'x', u'out', u'z', 9, u'y', u'in', u'z', 135), + (u'x', u'out', u'z', 13, u'y', u'in', u'z', 145), + (u'x', u'out', u'z', 14, u'y', u'in', u'z', 158), + (u'x', u'out', u'z', 16, u'y', u'in', u'z', 122), + (u'x', u'out', u'z', 17, u'y', u'in', 
u'z', 160), + (u'x', u'out', u'z', 18, u'y', u'in', u'z', 180), + (u'x', u'out', u'z', 20, u'y', u'in', u'z', 143), + (u'x', u'out', u'z', 21, u'y', u'in', u'z', 128), + (u'x', u'out', u'z', 22, u'y', u'in', u'z', 129), + (u'x', u'out', u'z', 25, u'y', u'in', u'z', 111), + (u'x', u'out', u'z', 28, u'y', u'in', u'z', 114), + (u'x', u'out', u'z', 29, u'y', u'in', u'z', 121), + (u'x', u'out', u'z', 31, u'y', u'in', u'z', 126), + (u'x', u'out', u'z', 32, u'y', u'in', u'z', 155), + (u'x', u'out', u'z', 33, u'y', u'in', u'z', 123), + (u'x', u'out', u'z', 12, u'y', u'in', u'z', 144)] + + index = pd.MultiIndex.from_tuples(t) + self.assertFalse(index.has_duplicates) + + # handle int64 overflow if possible + def check(nlevels, with_nulls): + labels = np.tile(np.arange(500), 2) + level = np.arange(500) + + if with_nulls: # inject some null values + labels[500] = -1 # common nan value + labels = list(labels.copy() for i in range(nlevels)) + for i in range(nlevels): + labels[i][500 + i - nlevels // 2 ] = -1 + + labels += [np.array([-1, 1]).repeat(500)] + else: + labels = [labels] * nlevels + [np.arange(2).repeat(500)] + + levels = [level] * nlevels + [[0, 1]] + + # no dups + index = MultiIndex(levels=levels, labels=labels) + self.assertFalse(index.has_duplicates) + + # with a dup + if with_nulls: + f = lambda a: np.insert(a, 1000, a[0]) + labels = list(map(f, labels)) + index = MultiIndex(levels=levels, labels=labels) + else: + values = index.values.tolist() + index = MultiIndex.from_tuples(values + [values[0]]) + + self.assertTrue(index.has_duplicates) + + # no overflow + check(4, False) + check(4, True) + + # overflow possible + check(8, False) + check(8, True) + def test_tolist(self): result = self.index.tolist() exp = list(self.index.values)