#ifndef Py_PYINTRINSICS_H
#define Py_PYINTRINSICS_H

#include <limits.h>             /* CHAR_BIT */

/* _Py_bit_length(d): number of bits needed to represent d.
 *
 * Returns the smallest integer k such that d < 2**k; in particular the
 * result is 0 when d == 0.  For d != 0 this is floor(lg(d)) + 1, which is
 * also bitwidth_of(unsigned long) - count_leading_zero_bits(d).
 *
 * HAVE_BIT_LENGTH is defined when an inline, intrinsic-based implementation
 * is supplied by this header.  Otherwise only a declaration is provided and
 * the out-of-line definition in Python/pyintrinsics.c is used.
 */

#if defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ >= 4))
#define HAVE_BIT_LENGTH
static inline unsigned int
_Py_bit_length(unsigned long d)
{
    /* __builtin_clzl() has an undefined result for 0, so handle it here. */
    if (d == 0) {
        return 0;
    }
    return (unsigned int)(CHAR_BIT * sizeof(unsigned long))
           - (unsigned int)__builtin_clzl(d);
}
#elif defined(_MSC_VER)
#define HAVE_BIT_LENGTH
/* The intrinsic must be declared (via <intrin.h>) before it is named in
   the pragma below. */
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
static inline unsigned int
_Py_bit_length(unsigned long d)
{
    unsigned long idx;

    /* _BitScanReverse() returns zero when no bit is set; otherwise idx
       receives the zero-based position of the most significant set bit. */
    if (_BitScanReverse(&idx, d)) {
        return (unsigned int)idx + 1;
    }
    return 0;
}
#else
extern unsigned int _Py_bit_length(unsigned long);
#endif

#endif /* Py_PYINTRINSICS_H */
- */ - -static unsigned long -bit_length(unsigned long n) -{ - unsigned long len = 0; - while (n != 0) { - ++len; - n >>= 1; - } - return len; -} - static unsigned long count_set_bits(unsigned long n) { @@ -1415,7 +1393,7 @@ count_set_bits(unsigned long n) /* factorial_partial_product: Compute product(range(start, stop, 2)) using * divide and conquer. Assumes start and stop are odd and stop > start. - * max_bits must be >= bit_length(stop - 2). */ + * max_bits must be >= _Py_bit_length(stop - 2). */ static PyObject * factorial_partial_product(unsigned long start, unsigned long stop, @@ -1430,14 +1408,14 @@ factorial_partial_product(unsigned long start, unsigned long stop, * the answer. * * Storing some integer z requires floor(lg(z))+1 bits, which is - * conveniently the value returned by bit_length(z). The + * conveniently the value returned by _Py_bit_length(z). The * product x*y will require at most - * bit_length(x) + bit_length(y) bits to store, based + * _Py_bit_length(x) + _Py_bit_length(y) bits to store, based * on the idea that lg product = lg x + lg y. * * We know that stop - 2 is the largest number to be multiplied. From - * there, we have: bit_length(answer) <= num_operands * - * bit_length(stop - 2) + * there, we have: _Py_bit_length(answer) <= num_operands * + * _Py_bit_length(stop - 2) */ num_operands = (stop - start) / 2; @@ -1454,7 +1432,7 @@ factorial_partial_product(unsigned long start, unsigned long stop, /* find midpoint of range(start, stop), rounded up to next odd number. 
*/ midpoint = (start + num_operands) | 1; left = factorial_partial_product(start, midpoint, - bit_length(midpoint - 2)); + _Py_bit_length(midpoint - 2)); if (left == NULL) goto error; right = factorial_partial_product(midpoint, stop, max_bits); @@ -1484,7 +1462,7 @@ factorial_odd_part(unsigned long n) Py_INCREF(outer); upper = 3; - for (i = bit_length(n) - 2; i >= 0; i--) { + for (i = _Py_bit_length(n) - 2; i >= 0; i--) { v = n >> i; if (v <= 2) continue; @@ -1494,7 +1472,7 @@ factorial_odd_part(unsigned long n) /* Here inner is the product of all odd integers j in the range (0, n/2**(i+1)]. The factorial_partial_product call below gives the product of all odd integers j in the range (n/2**(i+1), n/2**i]. */ - partial = factorial_partial_product(lower, upper, bit_length(upper-2)); + partial = factorial_partial_product(lower, upper, _Py_bit_length(upper-2)); /* inner *= partial */ if (partial == NULL) goto error; diff --git a/Objects/longobject.c b/Objects/longobject.c index 0bf6ae6accd77d..e4a607aefc07ba 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -717,26 +717,6 @@ _PyLong_Sign(PyObject *vv) return Py_SIZE(v) == 0 ? 0 : (Py_SIZE(v) < 0 ? -1 : 1); } -/* bits_in_digit(d) returns the unique integer k such that 2**(k-1) <= d < - 2**k if d is nonzero, else 0. 
*/ - -static const unsigned char BitLengthTable[32] = { - 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 -}; - -static int -bits_in_digit(digit d) -{ - int d_bits = 0; - while (d >= 32) { - d_bits += 6; - d >>= 6; - } - d_bits += (int)BitLengthTable[d]; - return d_bits; -} - size_t _PyLong_NumBits(PyObject *vv) { @@ -754,7 +734,7 @@ _PyLong_NumBits(PyObject *vv) if ((size_t)(ndigits - 1) > SIZE_MAX / (size_t)PyLong_SHIFT) goto Overflow; result = (size_t)(ndigits - 1) * (size_t)PyLong_SHIFT; - msd_bits = bits_in_digit(msd); + msd_bits = _Py_bit_length(msd); if (SIZE_MAX - msd_bits < result) goto Overflow; result += msd_bits; @@ -1820,7 +1800,7 @@ long_format_binary(PyObject *aa, int base, int alternate, return -1; } size_a_in_bits = (size_a - 1) * PyLong_SHIFT + - bits_in_digit(a->ob_digit[size_a - 1]); + _Py_bit_length(a->ob_digit[size_a - 1]); /* Allow 1 character for a '-' sign. */ sz = negative + (size_a_in_bits + (bits - 1)) / bits; } @@ -2638,7 +2618,7 @@ x_divrem(PyLongObject *v1, PyLongObject *w1, PyLongObject **prem) /* normalize: shift w1 left so that its top digit is >= PyLong_BASE/2. shift v1 left by the same amount. Results go into w and v. */ - d = PyLong_SHIFT - bits_in_digit(w1->ob_digit[size_w-1]); + d = PyLong_SHIFT - _Py_bit_length(w1->ob_digit[size_w-1]); carry = v_lshift(w->ob_digit, w1->ob_digit, size_w, d); assert(carry == 0); carry = v_lshift(v->ob_digit, v1->ob_digit, size_v, d); @@ -2759,7 +2739,7 @@ _PyLong_Frexp(PyLongObject *a, Py_ssize_t *e) *e = 0; return 0.0; } - a_bits = bits_in_digit(a->ob_digit[a_size-1]); + a_bits = _Py_bit_length(a->ob_digit[a_size-1]); /* The following is an overflow-free version of the check "if ((a_size - 1) * PyLong_SHIFT + a_bits > PY_SSIZE_T_MAX) ..." 
#ifndef HAVE_BIT_LENGTH
/* Portable fallback for _Py_bit_length().  This is compiled only when
   HAVE_BIT_LENGTH is not defined, i.e. when no inline intrinsic-based
   version is available. */

/* BitLengthTable[i] is the bit length of i, for 0 <= i < 32. */
static const unsigned char BitLengthTable[32] = {
    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
};

/* Return the smallest integer k such that d < 2**k, or 0 if d == 0. */
unsigned int
_Py_bit_length(unsigned long d)
{
    unsigned int d_bits = 0;

    /* A value >= 32 has at least 6 significant bits, so 6 low-order bits
       can be dropped (and counted) per iteration without overshooting. */
    while (d >= 32) {
        d_bits += 6;
        d >>= 6;
    }
    /* Here d < 32, so it is a valid index into the table. */
    d_bits += (unsigned int)BitLengthTable[d];
    return d_bits;
}
#endif /* HAVE_BIT_LENGTH */