From f06f6120a72f4dcd36c0a7377f5bf1c2c9b459d3 Mon Sep 17 00:00:00 2001
From: Niklas Fiekas <niklas.fiekas@backscattering.de>
Date: Fri, 10 Mar 2017 11:02:24 +0100
Subject: [PATCH] bpo-29782: Use __builtin_clzl for bits_in_digit if available

---
 Include/Python.h       |  1 +
 Include/pyintrinsics.h | 29 +++++++++++++++++++++++++++++
 Makefile.pre.in        |  2 ++
 Modules/mathmodule.c   | 38 ++++++++------------------------------
 Objects/longobject.c   | 38 +++++++++-----------------------------
 Python/pyintrinsics.c  | 18 ++++++++++++++++++
 6 files changed, 67 insertions(+), 59 deletions(-)
 create mode 100644 Include/pyintrinsics.h
 create mode 100644 Python/pyintrinsics.c

diff --git a/Include/Python.h b/Include/Python.h
index 4c7c9a48c81c2a..461a62d3d8fcfa 100644
--- a/Include/Python.h
+++ b/Include/Python.h
@@ -64,6 +64,7 @@
 #include "pymath.h"
 #include "pytime.h"
 #include "pymem.h"
+#include "pyintrinsics.h"
 
 #include "object.h"
 #include "objimpl.h"
diff --git a/Include/pyintrinsics.h b/Include/pyintrinsics.h
new file mode 100644
index 00000000000000..54718998cb5772
--- /dev/null
+++ b/Include/pyintrinsics.h
@@ -0,0 +1,29 @@
+#ifndef Py_PYINTRINSICS_H
+#define Py_PYINTRINSICS_H
+
+/* _Py_bit_length(d): return the smallest integer k such that d < 2**k, i.e.
+ * floor(lg(d)) + 1 for d > 0 and 0 for d == 0.  Also equivalent to:
+ * bitwidth_of_type - count_leading_zero_bits(d).  The result is unsigned, so
+ * convert it to a signed type before subtracting anything from it. */
+#if defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))
+#define HAVE_BIT_LENGTH
+static inline unsigned int _Py_bit_length(unsigned long d) {
+    /* __builtin_clzl(0) is undefined, hence the explicit zero check. */
+    return d ? (unsigned int)(8 * sizeof(d) - __builtin_clzl(d)) : 0;
+}
+#elif defined(_MSC_VER)
+#define HAVE_BIT_LENGTH
+#include <intrin.h>  /* declare _BitScanReverse before the pragma names it */
+#pragma intrinsic(_BitScanReverse)
+static inline unsigned int _Py_bit_length(unsigned long d) {
+    unsigned long idx;
+    if (!_BitScanReverse(&idx, d)) {
+        return 0;  /* no bit set (d == 0); idx is not meaningful */
+    }
+    return (unsigned int)idx + 1;
+}
+#else
+extern unsigned int _Py_bit_length(unsigned long);
+#endif
+
+#endif /* Py_PYINTRINSICS_H */
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 4145634c032d54..0ae6f898ca37c1 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -377,6 +377,7 @@ PYTHON_OBJS=	\
 		Python/pyfpe.o \
 		Python/pyhash.o \
 		Python/pylifecycle.o \
+		Python/pyintrinsics.o \
 		Python/pymath.o \
 		Python/pystate.o \
 		Python/pythonrun.o \
@@ -959,6 +960,7 @@ PYTHON_HEADERS= \
 		$(srcdir)/Include/pyfpe.h \
 		$(srcdir)/Include/pyhash.h \
 		$(srcdir)/Include/pylifecycle.h \
+		$(srcdir)/Include/pyintrinsics.h \
 		$(srcdir)/Include/pymath.h \
 		$(srcdir)/Include/pygetopt.h \
 		$(srcdir)/Include/pymacro.h \
diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index d5a8ca1ebefdc1..85f8aef0ff2cc9 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -1315,28 +1315,6 @@ math_fsum(PyObject *module, PyObject *seq)
 #undef NUM_PARTIALS
 
 
-/* Return the smallest integer k such that n < 2**k, or 0 if n == 0.
- * Equivalent to floor(lg(x))+1.  Also equivalent to: bitwidth_of_type -
- * count_leading_zero_bits(x)
- */
-
-/* XXX: This routine does more or less the same thing as
- * bits_in_digit() in Objects/longobject.c.  Someday it would be nice to
- * consolidate them.  On BSD, there's a library function called fls()
- * that we could use, and GCC provides __builtin_clz().
- */
-
-static unsigned long
-bit_length(unsigned long n)
-{
-    unsigned long len = 0;
-    while (n != 0) {
-        ++len;
-        n >>= 1;
-    }
-    return len;
-}
-
 static unsigned long
 count_set_bits(unsigned long n)
 {
@@ -1415,7 +1393,7 @@ count_set_bits(unsigned long n)
 
 /* factorial_partial_product: Compute product(range(start, stop, 2)) using
  * divide and conquer.  Assumes start and stop are odd and stop > start.
- * max_bits must be >= bit_length(stop - 2). */
+ * max_bits must be >= _Py_bit_length(stop - 2). */
 
 static PyObject *
 factorial_partial_product(unsigned long start, unsigned long stop,
@@ -1430,14 +1408,14 @@ factorial_partial_product(unsigned long start, unsigned long stop,
      * the answer.
      *
      * Storing some integer z requires floor(lg(z))+1 bits, which is
-     * conveniently the value returned by bit_length(z).  The
+     * conveniently the value returned by _Py_bit_length(z).  The
      * product x*y will require at most
-     * bit_length(x) + bit_length(y) bits to store, based
+     * _Py_bit_length(x) + _Py_bit_length(y) bits to store, based
      * on the idea that lg product = lg x + lg y.
      *
      * We know that stop - 2 is the largest number to be multiplied.  From
-     * there, we have: bit_length(answer) <= num_operands *
-     * bit_length(stop - 2)
+     * there, we have: _Py_bit_length(answer) <= num_operands *
+     * _Py_bit_length(stop - 2)
      */
 
     num_operands = (stop - start) / 2;
@@ -1454,7 +1432,7 @@ factorial_partial_product(unsigned long start, unsigned long stop,
     /* find midpoint of range(start, stop), rounded up to next odd number. */
     midpoint = (start + num_operands) | 1;
     left = factorial_partial_product(start, midpoint,
-                                     bit_length(midpoint - 2));
+                                     _Py_bit_length(midpoint - 2));
     if (left == NULL)
         goto error;
     right = factorial_partial_product(midpoint, stop, max_bits);
@@ -1484,7 +1462,7 @@ factorial_odd_part(unsigned long n)
     Py_INCREF(outer);
 
     upper = 3;
-    for (i = bit_length(n) - 2; i >= 0; i--) {
+    for (i = _Py_bit_length(n) - 2; i >= 0; i--) {
         v = n >> i;
         if (v <= 2)
             continue;
@@ -1494,7 +1472,7 @@ factorial_odd_part(unsigned long n)
         /* Here inner is the product of all odd integers j in the range (0,
            n/2**(i+1)].  The factorial_partial_product call below gives the
            product of all odd integers j in the range (n/2**(i+1), n/2**i]. */
-        partial = factorial_partial_product(lower, upper, bit_length(upper-2));
+        partial = factorial_partial_product(lower, upper, _Py_bit_length(upper-2));
         /* inner *= partial */
         if (partial == NULL)
             goto error;
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 0bf6ae6accd77d..e4a607aefc07ba 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -717,26 +717,6 @@ _PyLong_Sign(PyObject *vv)
     return Py_SIZE(v) == 0 ? 0 : (Py_SIZE(v) < 0 ? -1 : 1);
 }
 
-/* bits_in_digit(d) returns the unique integer k such that 2**(k-1) <= d <
-   2**k if d is nonzero, else 0. */
-
-static const unsigned char BitLengthTable[32] = {
-    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
-};
-
-static int
-bits_in_digit(digit d)
-{
-    int d_bits = 0;
-    while (d >= 32) {
-        d_bits += 6;
-        d >>= 6;
-    }
-    d_bits += (int)BitLengthTable[d];
-    return d_bits;
-}
-
 size_t
 _PyLong_NumBits(PyObject *vv)
 {
@@ -754,7 +734,7 @@ _PyLong_NumBits(PyObject *vv)
         if ((size_t)(ndigits - 1) > SIZE_MAX / (size_t)PyLong_SHIFT)
             goto Overflow;
         result = (size_t)(ndigits - 1) * (size_t)PyLong_SHIFT;
-        msd_bits = bits_in_digit(msd);
+        msd_bits = _Py_bit_length(msd);
         if (SIZE_MAX - msd_bits < result)
             goto Overflow;
         result += msd_bits;
@@ -1820,7 +1800,7 @@ long_format_binary(PyObject *aa, int base, int alternate,
             return -1;
         }
         size_a_in_bits = (size_a - 1) * PyLong_SHIFT +
-                         bits_in_digit(a->ob_digit[size_a - 1]);
+                         _Py_bit_length(a->ob_digit[size_a - 1]);
         /* Allow 1 character for a '-' sign. */
         sz = negative + (size_a_in_bits + (bits - 1)) / bits;
     }
@@ -2638,7 +2618,7 @@ x_divrem(PyLongObject *v1, PyLongObject *w1, PyLongObject **prem)
 
     /* normalize: shift w1 left so that its top digit is >= PyLong_BASE/2.
        shift v1 left by the same amount.  Results go into w and v. */
-    d = PyLong_SHIFT - bits_in_digit(w1->ob_digit[size_w-1]);
+    d = PyLong_SHIFT - _Py_bit_length(w1->ob_digit[size_w-1]);
     carry = v_lshift(w->ob_digit, w1->ob_digit, size_w, d);
     assert(carry == 0);
     carry = v_lshift(v->ob_digit, v1->ob_digit, size_v, d);
@@ -2759,7 +2739,7 @@ _PyLong_Frexp(PyLongObject *a, Py_ssize_t *e)
         *e = 0;
         return 0.0;
     }
-    a_bits = bits_in_digit(a->ob_digit[a_size-1]);
+    a_bits = _Py_bit_length(a->ob_digit[a_size-1]);
     /* The following is an overflow-free version of the check
        "if ((a_size - 1) * PyLong_SHIFT + a_bits > PY_SSIZE_T_MAX) ..." */
     if (a_size >= (PY_SSIZE_T_MAX - 1) / PyLong_SHIFT + 1 &&
@@ -3892,8 +3872,8 @@ long_true_divide(PyObject *v, PyObject *w)
         /* Extreme underflow */
         goto underflow_or_zero;
     /* Next line is now safe from overflowing a Py_ssize_t */
-    diff = diff * PyLong_SHIFT + bits_in_digit(a->ob_digit[a_size - 1]) -
-        bits_in_digit(b->ob_digit[b_size - 1]);
+    diff = diff * PyLong_SHIFT + _Py_bit_length(a->ob_digit[a_size - 1]) -
+        _Py_bit_length(b->ob_digit[b_size - 1]);
     /* Now diff = a_bits - b_bits. */
     if (diff > DBL_MAX_EXP)
         goto overflow;
@@ -3969,7 +3949,7 @@ long_true_divide(PyObject *v, PyObject *w)
     }
     x_size = Py_ABS(Py_SIZE(x));
     assert(x_size > 0); /* result of division is never zero */
-    x_bits = (x_size-1)*PyLong_SHIFT+bits_in_digit(x->ob_digit[x_size-1]);
+    x_bits = (x_size-1)*PyLong_SHIFT+_Py_bit_length(x->ob_digit[x_size-1]);
 
     /* The number of extra bits that have to be rounded away. */
     extra_bits = Py_MAX(x_bits, DBL_MIN_EXP - shift) - DBL_MANT_DIG;
@@ -4611,7 +4591,7 @@ _PyLong_GCD(PyObject *aarg, PyObject *barg)
     alloc_b = Py_SIZE(b);
     /* reduce until a fits into 2 digits */
     while ((size_a = Py_SIZE(a)) > 2) {
-        nbits = bits_in_digit(a->ob_digit[size_a-1]);
+        nbits = _Py_bit_length(a->ob_digit[size_a-1]);
         /* extract top 2*PyLong_SHIFT bits of a into x, along with
            corresponding bits of b into y */
         size_b = Py_SIZE(b);
@@ -5132,7 +5112,7 @@ int_bit_length_impl(PyObject *self)
         return PyLong_FromLong(0);
 
     msd = ((PyLongObject *)self)->ob_digit[ndigits-1];
-    msd_bits = bits_in_digit(msd);
+    msd_bits = _Py_bit_length(msd);
 
     if (ndigits <= PY_SSIZE_T_MAX/PyLong_SHIFT)
         return PyLong_FromSsize_t((ndigits-1)*PyLong_SHIFT + msd_bits);
diff --git a/Python/pyintrinsics.c b/Python/pyintrinsics.c
new file mode 100644
index 00000000000000..74d58e1ab9d73f
--- /dev/null
+++ b/Python/pyintrinsics.c
@@ -0,0 +1,18 @@
+#include "Python.h"
+
+#ifndef HAVE_BIT_LENGTH
+/* Bit lengths of the values 0..31, indexed by value. */
+static const unsigned char BitLengthTable[32] = {
+    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+};
+
+/* Portable fallback: drop six bits per pass, then look up what is left. */
+unsigned int _Py_bit_length(unsigned long d) {
+    unsigned int nbits;
+    for (nbits = 0; d >= 32; d >>= 6) {
+        nbits += 6;
+    }
+    return nbits + (unsigned int)BitLengthTable[d];
+}
+#endif /* HAVE_BIT_LENGTH */