From 814ec4dc4268d270cbb59d007158ffc77a6f9ed7 Mon Sep 17 00:00:00 2001 From: Max Horn Date: Wed, 11 Jan 2017 19:48:32 +0100 Subject: [PATCH] kernel: rewrite (C)Log2Int using compiler builtins Compared to the old CLog2Int code, on my machine the new code using compiler builtins is between 5 and 10 times faster, and the new generic code is still 1.2 to 4 times faster. The new FuncLog2Int calls CLog2Int (or rather, the new static inline function CLog2UInt which expects an unsigned argument), and thus benefits from the optimizations as well (though not by as much). Some micro benchmarks, using the old code: gap> x:=2^0;; for a in [0..2^25] do Log2Int(x); od; time; 998 gap> x:=2^50;; for a in [0..2^25] do Log2Int(x); od; time; 1916 gap> x:=2^60;; for a in [0..2^25] do Log2Int(x); od; time; 1312 gap> x:=2^80;; for a in [0..2^25] do Log2Int(x); od; time; 2773 gap> x:=2^180;; for a in [0..2^25] do Log2Int(x); od; time; 1504 New code: gap> x:=2^0;; for a in [0..2^25] do Log2Int(x); od; time; 955 gap> x:=2^50;; for a in [0..2^25] do Log2Int(x); od; time; 970 gap> x:=2^60;; for a in [0..2^25] do Log2Int(x); od; time; 1045 gap> x:=2^80;; for a in [0..2^25] do Log2Int(x); od; time; 1020 gap> x:=2^180;; for a in [0..2^25] do Log2Int(x); od; time; 1007 --- cnf/config.hin | 9 +++++ cnf/configure.in | 3 ++ cnf/configure.out | 96 +++++++++++++++++++++++++++++++++++++++++++++++ src/gmpints.c | 85 +++++++++++++++++++++++++++-------------- 4 files changed, 164 insertions(+), 29 deletions(-) diff --git a/cnf/config.hin b/cnf/config.hin index a27a32c473..dade4ee603 100644 --- a/cnf/config.hin +++ b/cnf/config.hin @@ -376,6 +376,15 @@ /* Define to 1 if you have the `_setjmp' function. 
*/ #undef HAVE__SETJMP +/* Define to 1 if the system has the `__builtin_clz' built-in function */ +#undef HAVE___BUILTIN_CLZ + +/* Define to 1 if the system has the `__builtin_clzl' built-in function */ +#undef HAVE___BUILTIN_CLZL + +/* Define to 1 if the system has the `__builtin_clzll' built-in function */ +#undef HAVE___BUILTIN_CLZLL + /* Define to 1 if the system has the `__builtin_smulll_overflow' built-in function */ #undef HAVE___BUILTIN_SMULLL_OVERFLOW diff --git a/cnf/configure.in b/cnf/configure.in index 259ec108d3..d99be9c429 100644 --- a/cnf/configure.in +++ b/cnf/configure.in @@ -68,6 +68,9 @@ AC_DEFUN([CHECK_COMPILER_BUILTIN], CHECK_COMPILER_BUILTIN([__builtin_smul_overflow],[0,0,0]); CHECK_COMPILER_BUILTIN([__builtin_smull_overflow],[0,0,0]); CHECK_COMPILER_BUILTIN([__builtin_smulll_overflow],[0,0,0]); +CHECK_COMPILER_BUILTIN([__builtin_clz],[0]); +CHECK_COMPILER_BUILTIN([__builtin_clzl],[0]); +CHECK_COMPILER_BUILTIN([__builtin_clzll],[0]); # diff --git a/cnf/configure.out b/cnf/configure.out index 02d029b608..474e140778 100755 --- a/cnf/configure.out +++ b/cnf/configure.out @@ -4521,6 +4521,102 @@ cat >>confdefs.h <<_ACEOF #define HAVE___BUILTIN_SMULLL_OVERFLOW 1 _ACEOF +fi; +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_clz" >&5 +$as_echo_n "checking for __builtin_clz... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ +__builtin_clz(0); + + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + have___builtin_clz=yes +else + have___builtin_clz=no + +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $have___builtin_clz" >&5 +$as_echo "$have___builtin_clz" >&6; } + if test yes = $have___builtin_clz; then : + +cat >>confdefs.h <<_ACEOF +#define HAVE___BUILTIN_CLZ 1 +_ACEOF + +fi; +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_clzl" >&5 +$as_echo_n "checking for __builtin_clzl... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +__builtin_clzl(0); + + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + have___builtin_clzl=yes +else + have___builtin_clzl=no + +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $have___builtin_clzl" >&5 +$as_echo "$have___builtin_clzl" >&6; } + if test yes = $have___builtin_clzl; then : + +cat >>confdefs.h <<_ACEOF +#define HAVE___BUILTIN_CLZL 1 +_ACEOF + +fi; +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_clzll" >&5 +$as_echo_n "checking for __builtin_clzll... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ +__builtin_clzll(0); + + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + have___builtin_clzll=yes +else + have___builtin_clzll=no + +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $have___builtin_clzll" >&5 +$as_echo "$have___builtin_clzll" >&6; } + if test yes = $have___builtin_clzll; then : + +cat >>confdefs.h <<_ACEOF +#define HAVE___BUILTIN_CLZLL 1 +_ACEOF + fi; diff --git a/src/gmpints.c b/src/gmpints.c index aa7cee95c6..90d03812ba 100644 --- a/src/gmpints.c +++ b/src/gmpints.c @@ -638,21 +638,55 @@ Obj FuncIntHexString( Obj self, Obj str ) /**************************************************************************** ** ** Implementation of Log2Int for C integers. +** +** When available, we try to use GCC builtins. Otherwise, fall back to code +** based on https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogLookup. +** On a test machine with x86 64bit, the builtins are about 4 times faster +** than the generic code. 
+** */ +static Int CLog2UInt(UInt a) +{ +#if SIZEOF_VOID_P == SIZEOF_INT && HAVE___BUILTIN_CLZ + return GMP_LIMB_BITS - 1 - __builtin_clz(a); +#elif SIZEOF_VOID_P == SIZEOF_LONG && HAVE___BUILTIN_CLZL + return GMP_LIMB_BITS - 1 - __builtin_clzl(a); +#elif SIZEOF_VOID_P == SIZEOF_LONG_LONG && HAVE___BUILTIN_CLZLL + return GMP_LIMB_BITS - 1 - __builtin_clzll(a); +#else + static const char LogTable256[256] = { + -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 + }; + + Int res = 0; + UInt b; + b = a >> 32; if (b) { res+=32; a=b; } + b = a >> 16; if (b) { res+=16; a=b; } + b = a >> 8; if (b) { res+= 8; a=b; } + return res + LogTable256[a]; +#endif +} + Int CLog2Int(Int a) { - Int res, mask; if (a < 0) a = -a; - if (a < 1) return -1; - if (a < 65536) { - for(mask = 2, res = 0; ;mask *= 2, res += 1) { - if(a < mask) return res; - } - } - for(mask = 65536, res = 15; ;mask *= 2, res += 1) { - if(a < mask) return res; - } + return CLog2UInt(a); } /**************************************************************************** @@ -663,29 +697,22 @@ Int CLog2Int(Int a) */ Obj FuncLog2Int( Obj self, Obj integer) { - Int d; - Int a, len; - TypLimb dmask; - - /* case of small ints */ - if (IS_INTOBJ(integer)) { + if ( IS_INTOBJ(integer) ) { return 
INTOBJ_INT(CLog2Int(INT_INTOBJ(integer))); } - /* case of long ints */ if ( IS_LARGEINT(integer) ) { - for (len = SIZE_INT(integer); ADDR_INT(integer)[len-1] == 0; len--); - /* Instead of computing - res = len * GMP_LIMB_BITS - d; - we keep len and d separate, because on 32 bit systems res may - not fit into an Int (and not into an immediate integer). */ - d = 1; - a = (TypLimb)(ADDR_INT(integer)[len-1]); - for(dmask = (TypLimb)1 << (GMP_LIMB_BITS - 1); - (dmask & a) == 0 && dmask != (TypLimb)0; - dmask = dmask >> 1, d++); - return DiffInt(ProdInt(INTOBJ_INT(len), INTOBJ_INT(GMP_LIMB_BITS)), - INTOBJ_INT(d)); + UInt len = SIZE_INT(integer) - 1; + UInt a = CLog2UInt( ADDR_INT(integer)[len] ); + +#ifdef SYS_IS_64_BIT + return INTOBJ_INT(len * GMP_LIMB_BITS + a); +#else + /* The final result is len * GMP_LIMB_BITS + a, which may not + fit into an immediate integer (at least on a 32bit system) */ + return SumInt(ProdInt(INTOBJ_INT(len), INTOBJ_INT(GMP_LIMB_BITS)), + INTOBJ_INT(a)); +#endif } else { ErrorReturnObj("Log2Int: argument must be a int, (not a %s)",