diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py
new file mode 100644
index 0000000000000..17bf434acf38a
--- /dev/null
+++ b/asv_bench/benchmarks/hash_functions.py
@@ -0,0 +1,164 @@
+import numpy as np
+
+import pandas as pd
+
+
+class IsinAlmostFullWithRandomInt:
+    params = [
+        [np.float64, np.int64, np.uint64, np.object_],
+        range(10, 21),
+    ]
+    param_names = ["dtype", "exponent"]
+
+    def setup(self, dtype, exponent):
+        M = 3 * 2 ** (exponent - 2)
+        # 0.77 is the maximal share of occupied buckets
+        np.random.seed(42)
+        self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype)
+        self.values = np.random.randint(0, M, M).astype(dtype)
+        self.values_outside = self.values + M
+
+    def time_isin(self, dtype, exponent):
+        self.s.isin(self.values)
+
+    def time_isin_outside(self, dtype, exponent):
+        self.s.isin(self.values_outside)
+
+
+class IsinWithRandomFloat:
+    params = [
+        [np.float64, np.object_],
+        [
+            1_300,
+            2_000,
+            7_000,
+            8_000,
+            70_000,
+            80_000,
+            750_000,
+            900_000,
+        ],
+    ]
+    param_names = ["dtype", "M"]
+
+    def setup(self, dtype, M):
+        np.random.seed(42)
+        self.values = np.random.rand(M)
+        self.s = pd.Series(self.values).astype(dtype)
+        np.random.shuffle(self.values)
+        self.values_outside = self.values + 0.1
+
+    def time_isin(self, dtype, M):
+        self.s.isin(self.values)
+
+    def time_isin_outside(self, dtype, M):
+        self.s.isin(self.values_outside)
+
+
+class IsinWithArangeSorted:
+    params = [
+        [np.float64, np.int64, np.uint64, np.object_],
+        [
+            1_000,
+            2_000,
+            8_000,
+            100_000,
+            1_000_000,
+        ],
+    ]
+    param_names = ["dtype", "M"]
+
+    def setup(self, dtype, M):
+        self.s = pd.Series(np.arange(M)).astype(dtype)
+        self.values = np.arange(M).astype(dtype)
+
+    def time_isin(self, dtype, M):
+        self.s.isin(self.values)
+
+
+class IsinWithArange:
+    params = [
+        [np.float64, np.int64, np.uint64, np.object_],
+        [
+            1_000,
+            2_000,
+            8_000,
+        ],
+        [-2, 0, 2],
+    ]
+    param_names = ["dtype", "M", "offset_factor"]
+
+    def setup(self, dtype, M, offset_factor):
+        offset = int(M * offset_factor)
+        np.random.seed(42)
+        tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6))
+        self.s = tmp.astype(dtype)
+        self.values = np.arange(M).astype(dtype)
+
+    def time_isin(self, dtype, M, offset_factor):
+        self.s.isin(self.values)
+
+
+class Float64GroupIndex:
+    # GH28303
+    def setup(self):
+        self.df = pd.date_range(
+            start="1/1/2018", end="1/2/2018", periods=10 ** 6
+        ).to_frame()
+        self.group_index = np.round(self.df.index.astype(int) / 1e9)
+
+    def time_groupby(self):
+        self.df.groupby(self.group_index).last()
+
+
+class UniqueAndFactorizeArange:
+    params = range(4, 16)
+    param_names = ["exponent"]
+
+    def setup(self, exponent):
+        a = np.arange(10 ** 4, dtype="float64")
+        self.a2 = (a + 10 ** exponent).repeat(100)
+
+    def time_factorize(self, exponent):
+        pd.factorize(self.a2)
+
+    def time_unique(self, exponent):
+        pd.unique(self.a2)
+
+
+class NumericSeriesIndexing:
+
+    params = [
+        (pd.Int64Index, pd.UInt64Index, pd.Float64Index),
+        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
+    ]
+    param_names = ["index_dtype", "N"]
+
+    def setup(self, index, N):
+        vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
+        indices = index(vals)
+        self.data = pd.Series(np.arange(N), index=indices)
+
+    def time_loc_slice(self, index, N):
+        # trigger building of mapping
+        self.data.loc[:800]
+
+
+class NumericSeriesIndexingShuffled:
+
+    params = [
+        (pd.Int64Index, pd.UInt64Index, pd.Float64Index),
+        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
+    ]
+    param_names = ["index_dtype", "N"]
+
+    def setup(self, index, N):
+        vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
+        np.random.seed(42)
+        np.random.shuffle(vals)
+        indices = index(vals)
+        self.data = pd.Series(np.arange(N), index=indices)
+
+    def time_loc_slice(self, index, N):
+        # trigger building of mapping
+        self.data.loc[:800]
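The pathological case these benchmarks target can also be reproduced outside of asv. A minimal sketch, assuming only numpy and pandas (the size is illustrative):

```python
import numpy as np
import pandas as pd

# Fill a float64 hash table to roughly its maximal load factor (~0.77),
# then probe with values that are guaranteed to miss, which maximizes
# probe-chain length -- the hot path exercised by time_isin_outside.
M = 3 * 2 ** 18
np.random.seed(42)
s = pd.Series(np.random.randint(0, M, M)).astype(np.float64)
misses = (np.random.randint(0, M, M) + M).astype(np.float64)

mask = s.isin(misses)  # time this line on 1.1.x vs. 1.2.0
assert not mask.any()  # every lookup misses by construction
```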
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 09cb024cbd95c..54ca602ff3c86 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -376,6 +376,7 @@ Performance improvements
 - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
 - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
+- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`); changes to the underlying hash function can change the sort order of ties in float-based indexes (e.g. in :meth:`pd.Index.value_counts`)

 .. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h
index 916838d1e9584..61a4e80ea8cbc 100644
--- a/pandas/_libs/src/klib/khash.h
+++ b/pandas/_libs/src/klib/khash.h
@@ -143,10 +143,86 @@ typedef khint_t khiter_t;
 #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
 #define __ac_set_isdel_true(flag, i) ((void)0)

+
+// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
+khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){
+    const khint32_t SEED = 0xc70f6907UL;
+    // 'm' and 'r' are mixing constants generated offline.
+    // They're not really 'magic', they just happen to work well.
+    const khint32_t M_32 = 0x5bd1e995;
+    const int R_32 = 24;
+
+    // Initialize the hash to a 'random' value
+    khint32_t h = SEED ^ 4;
+
+    // handle 4 bytes:
+    k *= M_32;
+    k ^= k >> R_32;
+    k *= M_32;
+
+    h *= M_32;
+    h ^= k;
+
+    // Do a few final mixes of the hash to ensure the "last few
+    // bytes" are well-incorporated. (Really needed here?)
+    h ^= h >> 13;
+    h *= M_32;
+    h ^= h >> 15;
+    return h;
+}
+
+// It is possible to have a special x64 version, which would need fewer
+// operations, but always using the 32bit version has some benefits:
+//    - one code path for 32bit and 64bit builds
+//    - the same hash values on 32bit and 64bit builds
+//    - no measurable performance difference compared to a possible x64 version
+
+khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){
+    const khint32_t SEED = 0xc70f6907UL;
+    // 'm' and 'r' are mixing constants generated offline.
+    // They're not really 'magic', they just happen to work well.
+    const khint32_t M_32 = 0x5bd1e995;
+    const int R_32 = 24;
+
+    // Initialize the hash to a 'random' value
+    khint32_t h = SEED ^ 4;
+
+    // handle first 4 bytes:
+    k1 *= M_32;
+    k1 ^= k1 >> R_32;
+    k1 *= M_32;
+
+    h *= M_32;
+    h ^= k1;
+
+    // handle second 4 bytes:
+    k2 *= M_32;
+    k2 ^= k2 >> R_32;
+    k2 *= M_32;
+
+    h *= M_32;
+    h ^= k2;
+
+    // Do a few final mixes of the hash to ensure the "last few
+    // bytes" are well-incorporated.
+    h ^= h >> 13;
+    h *= M_32;
+    h ^= h >> 15;
+    return h;
+}
+
+khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
+    khint32_t k1 = (khint32_t)k;
+    khint32_t k2 = (khint32_t)(k >> 32);
+
+    return murmur2_32_32to32(k1, k2);
+}
+
+
 #ifdef KHASH_LINEAR
 #define __ac_inc(k, m) 1
 #else
-#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
+#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m)
 #endif

 #define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)
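For readers following the mixing steps, here is an illustrative Python transcription of the new hash (not part of the patch; the `& 0xFFFFFFFF` masks emulate C's unsigned 32-bit wraparound):

```python
M_32 = 0x5BD1E995  # murmur2 multiplication constant
R_32 = 24          # murmur2 shift constant
SEED = 0xC70F6907
MASK = 0xFFFFFFFF  # emulate khint32_t overflow


def _mix(h: int, k: int) -> int:
    # one murmur2 round: scramble the 4-byte block, then fold it into h
    k = (k * M_32) & MASK
    k ^= k >> R_32
    k = (k * M_32) & MASK
    h = (h * M_32) & MASK
    return h ^ k


def murmur2_64to32(key: int) -> int:
    """Hash an unsigned 64-bit integer to 32 bits, as khash.h now does."""
    h = SEED ^ 4
    h = _mix(h, key & MASK)          # low 4 bytes
    h = _mix(h, (key >> 32) & MASK)  # high 4 bytes
    # final avalanche
    h ^= h >> 13
    h = (h * M_32) & MASK
    h ^= h >> 15
    return h


# neighbouring integers now get well-scattered hashes, unlike the old
# (k >> 3 ^ k << 3) | 1 probe step:
print([murmur2_64to32(k) for k in range(3)])
```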
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index 2b46d30c3adb6..aebc229abddd2 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -13,25 +13,31 @@
 // is 64 bits the truncation causes collision issues. Given all that, we use our own
 // simple hash, viewing the double bytes as an int64 and using khash's default
 // hash for 64 bit integers.
-// GH 13436
+// GH 13436 showed that _Py_HashDouble doesn't work well with khash
+// GH 28303 showed that the simple xoring version isn't good enough
+// See GH 36729 for the evaluation of the currently used murmur2 hash

 khint64_t PANDAS_INLINE asint64(double key) {
-  khint64_t val;
-  memcpy(&val, &key, sizeof(double));
-  return val;
+    khint64_t val;
+    memcpy(&val, &key, sizeof(double));
+    return val;
 }

-// correct for all inputs but not -0.0 and NaNs
-#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
-
-// correct for all inputs but not NaNs
-#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ?                    \
-                                       kh_float64_hash_func_0_NAN(0.0) : \
-                                       kh_float64_hash_func_0_NAN(key))
+#define ZERO_HASH 0
+#define NAN_HASH  0

-// correct for all
-#define kh_float64_hash_func(key) ((key) != (key) ?                       \
-                                   kh_float64_hash_func_NAN(Py_NAN) :     \
-                                   kh_float64_hash_func_NAN(key))
+khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
+    // 0.0 and -0.0 should have the same hash:
+    if (val == 0.0){
+        return ZERO_HASH;
+    }
+    // all nans should have the same hash:
+    if (val != val){
+        return NAN_HASH;
+    }
+    khint64_t as_int = asint64(val);
+    return murmur2_64to32(as_int);
+}

 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 4760b92ad5fec..b3366cca37617 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -982,9 +982,9 @@ def value_counts(
         >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
         >>> index.value_counts()
         3.0    2
-        1.0    1
         2.0    1
         4.0    1
+        1.0    1
         dtype: int64

         With `normalize` set to `True`, returns the relative frequency by
@@ -993,9 +993,9 @@ def value_counts(
         >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
         >>> s.value_counts(normalize=True)
         3.0    0.4
-        1.0    0.2
         2.0    0.2
         4.0    0.2
+        1.0    0.2
         dtype: float64

         **bins**
@@ -1017,10 +1017,10 @@ def value_counts(
         >>> s.value_counts(dropna=False)
         3.0    2
-        1.0    1
         2.0    1
-        4.0    1
         NaN    1
+        4.0    1
+        1.0    1
         dtype: int64
         """
         result = value_counts(
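The two special cases in `kh_float64_hash_func` are observable from Python: 0.0 and -0.0 land in one bucket because they share `ZERO_HASH` and compare equal, and every NaN bit pattern lands in one bucket via `NAN_HASH` plus the NaN clause of `kh_float64_hash_equal`. A small self-contained check:

```python
import numpy as np
import pandas as pd

s = pd.Series([0.0, -0.0, np.nan, float("nan")])
counts = s.value_counts(dropna=False)

assert len(counts) == 2  # one bucket for the zeros, one for the NaNs
assert counts[0.0] == 2  # -0.0 folded into 0.0's bucket
```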
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 1a6cba1ace35f..e9713e38f9874 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -232,18 +232,14 @@ def test_value_counts_datetime64(index_or_series):

     # with NaT
     s = df["dt"].copy()
-    s = klass(list(s.values) + [pd.NaT])
+    s = klass(list(s.values) + [pd.NaT] * 4)

     result = s.value_counts()
     assert result.index.dtype == "datetime64[ns]"
     tm.assert_series_equal(result, expected_s)

     result = s.value_counts(dropna=False)
-    # GH 35922. NaN-like now sorts to the beginning of duplicate counts
-    idx = pd.to_datetime(
-        ["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
-    )
-    expected_s = Series([3, 2, 1, 1], index=idx)
+    expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s])
     tm.assert_series_equal(result, expected_s)

     unique = s.unique()
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 88286448de900..34b7d0e73e914 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1173,12 +1173,12 @@ def test_dropna(self):
        )

         tm.assert_series_equal(
-            Series([True, True, False, None]).value_counts(dropna=True),
-            Series([2, 1], index=[True, False]),
+            Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True),
+            Series([3, 2], index=[True, False]),
         )
         tm.assert_series_equal(
-            Series([True, True, False, None]).value_counts(dropna=False),
-            Series([2, 1, 1], index=[True, np.nan, False]),
+            Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False),
+            Series([5, 3, 2], index=[True, False, np.nan]),
         )
         tm.assert_series_equal(
             Series([10.3, 5.0, 5.0]).value_counts(dropna=True),
@@ -1194,26 +1194,24 @@ def test_dropna(self):
             Series([2, 1], index=[5.0, 10.3]),
         )

-        # 32-bit linux has a different ordering
-        if IS64:
-            result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False)
-            expected = Series([2, 1, 1], index=[5.0, np.nan, 10.3])
-            tm.assert_series_equal(result, expected)
+        result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False)
+        expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan])
+        tm.assert_series_equal(result, expected)

     def test_value_counts_normalized(self):
         # GH12558
-        s = Series([1, 2, np.nan, np.nan, np.nan])
+        s = Series([1] * 2 + [2] * 3 + [np.nan] * 5)
         dtypes = (np.float64, object, "M8[ns]")
         for t in dtypes:
             s_typed = s.astype(t)
             result = s_typed.value_counts(normalize=True, dropna=False)
             expected = Series(
-                [0.6, 0.2, 0.2], index=Series([np.nan, 1.0, 2.0], dtype=t)
+                [0.5, 0.3, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t)
             )
             tm.assert_series_equal(result, expected)

             result = s_typed.value_counts(normalize=True, dropna=True)
-            expected = Series([0.5, 0.5], index=Series([1.0, 2.0], dtype=t))
+            expected = Series([0.6, 0.4], index=Series([2.0, 1.0], dtype=t))
             tm.assert_series_equal(result, expected)

     def test_value_counts_uint64(self):
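The test rewrites above follow one pattern: tied counts are replaced with distinct counts, because the order of ties reflects hash-table iteration order, which the murmur2 change reshuffles. A sketch of the hash-order-independent style (mirroring the data in `test_value_counts_normalized`, without normalization):

```python
import numpy as np
import pandas as pd

s = pd.Series([1] * 2 + [2] * 3 + [np.nan] * 5)
result = s.value_counts(dropna=False)

assert list(result) == [5, 3, 2]             # distinct counts sort descending
assert result.index.isna()[0]                # NaN holds the top count
assert list(result.index[1:]) == [2.0, 1.0]  # no ties, so the order is stable
```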