From 31faa43406b9b2a4edaacc55ad637a695f0094f7 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Jun 2024 10:55:22 +0100 Subject: [PATCH] [mypyc] Inline fast paths of integer unboxing operations (#17266) This applies to `int` and native integer types. This speeds up this micro-benchmark by up to 80% (it spends most of the time unboxing integers): ``` # a is list[int]/list[i64]/... for i in a: if i == 789: n += 1 ``` The impact to compile time when self-compiling is below the noise floor. The generated binary is about 0.1% larger. Since integer unboxing can be performance-critical, this seems like a decent win. Closes mypyc/mypyc#987. Work on mypyc/mypyc#757. --- mypyc/common.py | 1 + mypyc/lib-rt/CPy.h | 147 +++++++++++++++++++++++++++++++++-- mypyc/lib-rt/int_ops.c | 133 ++----------------------------- mypyc/lib-rt/pythonsupport.c | 106 +++++++++++++++++++++++++ mypyc/lib-rt/pythonsupport.h | 73 ++++------------- mypyc/lib-rt/setup.py | 1 + 6 files changed, 269 insertions(+), 192 deletions(-) create mode 100644 mypyc/lib-rt/pythonsupport.c diff --git a/mypyc/common.py b/mypyc/common.py index d7610fe15c41..31567c689c34 100644 --- a/mypyc/common.py +++ b/mypyc/common.py @@ -79,6 +79,7 @@ "exc_ops.c", "misc_ops.c", "generic_ops.c", + "pythonsupport.c", ] diff --git a/mypyc/lib-rt/CPy.h b/mypyc/lib-rt/CPy.h index 9e85647226fe..8aa5a77c180c 100644 --- a/mypyc/lib-rt/CPy.h +++ b/mypyc/lib-rt/CPy.h @@ -120,9 +120,6 @@ static inline size_t CPy_FindAttrOffset(PyTypeObject *trait, CPyVTableItem *vtab CPyTagged CPyTagged_FromSsize_t(Py_ssize_t value); CPyTagged CPyTagged_FromVoidPtr(void *ptr); CPyTagged CPyTagged_FromInt64(int64_t value); -CPyTagged CPyTagged_FromObject(PyObject *object); -CPyTagged CPyTagged_StealFromObject(PyObject *object); -CPyTagged CPyTagged_BorrowFromObject(PyObject *object); PyObject *CPyTagged_AsObject(CPyTagged x); PyObject *CPyTagged_StealAsObject(CPyTagged x); Py_ssize_t CPyTagged_AsSsize_t(CPyTagged x); @@ -148,18 +145,18 @@ CPyTagged CPyTagged_FromFloat(double f); PyObject *CPyLong_FromStrWithBase(PyObject *o, CPyTagged base); PyObject *CPyLong_FromStr(PyObject *o); PyObject *CPyBool_Str(bool b); -int64_t CPyLong_AsInt64(PyObject *o); +int64_t CPyLong_AsInt64_(PyObject *o); int64_t CPyInt64_Divide(int64_t x, int64_t y); int64_t CPyInt64_Remainder(int64_t x, int64_t y); -int32_t CPyLong_AsInt32(PyObject *o); +int32_t CPyLong_AsInt32_(PyObject *o); int32_t CPyInt32_Divide(int32_t x, int32_t y); int32_t CPyInt32_Remainder(int32_t x, int32_t y); void CPyInt32_Overflow(void); -int16_t CPyLong_AsInt16(PyObject *o); +int16_t CPyLong_AsInt16_(PyObject *o); int16_t CPyInt16_Divide(int16_t x, int16_t y); int16_t CPyInt16_Remainder(int16_t x, int16_t y); void CPyInt16_Overflow(void); -uint8_t CPyLong_AsUInt8(PyObject *o); +uint8_t CPyLong_AsUInt8_(PyObject *o); void CPyUInt8_Overflow(void); double CPyTagged_TrueDivide(CPyTagged x, CPyTagged y); @@ -199,6 +196,41 @@ static inline PyObject *CPyTagged_LongAsObject(CPyTagged x) { return (PyObject *)(x & ~CPY_INT_TAG); } +static inline CPyTagged CPyTagged_FromObject(PyObject *object) { + int overflow; + // The overflow check knows about CPyTagged's width + Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow); + if (unlikely(overflow != 0)) { + Py_INCREF(object); + return ((CPyTagged)object) | CPY_INT_TAG; + } else { + return value << 1; + } +} + +static inline CPyTagged CPyTagged_StealFromObject(PyObject *object) { + int overflow; + // The overflow check knows about CPyTagged's width + Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow); + if (unlikely(overflow != 0)) { + return ((CPyTagged)object) | CPY_INT_TAG; + } else { + Py_DECREF(object); + return value << 1; + } +} + +static inline CPyTagged CPyTagged_BorrowFromObject(PyObject *object) { + int overflow; + // The overflow check knows about CPyTagged's width + Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow); + if (unlikely(overflow != 0)) { + return ((CPyTagged)object) | CPY_INT_TAG; + } else { + return value << 1; + } +} + static inline bool CPyTagged_TooBig(Py_ssize_t value) { // Micro-optimized for the common case where it fits. return (size_t)value > CPY_TAGGED_MAX @@ -286,6 +318,107 @@ static inline bool CPyTagged_IsLe(CPyTagged left, CPyTagged right) { } } +static inline int64_t CPyLong_AsInt64(PyObject *o) { + if (likely(PyLong_Check(o))) { + PyLongObject *lobj = (PyLongObject *)o; + Py_ssize_t size = Py_SIZE(lobj); + if (likely(size == 1)) { + // Fast path + return CPY_LONG_DIGIT(lobj, 0); + } else if (likely(size == 0)) { + return 0; + } + } + // Slow path + return CPyLong_AsInt64_(o); +} + +static inline int32_t CPyLong_AsInt32(PyObject *o) { + if (likely(PyLong_Check(o))) { + #if CPY_3_12_FEATURES + PyLongObject *lobj = (PyLongObject *)o; + size_t tag = CPY_LONG_TAG(lobj); + if (likely(tag == (1 << CPY_NON_SIZE_BITS))) { + // Fast path + return CPY_LONG_DIGIT(lobj, 0); + } else if (likely(tag == CPY_SIGN_ZERO)) { + return 0; + } + #else + PyLongObject *lobj = (PyLongObject *)o; + Py_ssize_t size = lobj->ob_base.ob_size; + if (likely(size == 1)) { + // Fast path + return CPY_LONG_DIGIT(lobj, 0); + } else if (likely(size == 0)) { + return 0; + } + #endif + } + // Slow path + return CPyLong_AsInt32_(o); +} + +static inline int16_t CPyLong_AsInt16(PyObject *o) { + if (likely(PyLong_Check(o))) { + #if CPY_3_12_FEATURES + PyLongObject *lobj = (PyLongObject *)o; + size_t tag = CPY_LONG_TAG(lobj); + if (likely(tag == (1 << CPY_NON_SIZE_BITS))) { + // Fast path + digit x = CPY_LONG_DIGIT(lobj, 0); + if (x < 0x8000) + return x; + } else if (likely(tag == CPY_SIGN_ZERO)) { + return 0; + } + #else + PyLongObject *lobj = (PyLongObject *)o; + Py_ssize_t size = lobj->ob_base.ob_size; + if (likely(size == 1)) { + // Fast path + digit x = lobj->ob_digit[0]; + if (x < 0x8000) + return x; + } else if (likely(size == 0)) { + return 0; + } + #endif + } + // Slow path + return CPyLong_AsInt16_(o); +} + +static inline uint8_t CPyLong_AsUInt8(PyObject *o) { + if (likely(PyLong_Check(o))) { + #if CPY_3_12_FEATURES + PyLongObject *lobj = (PyLongObject *)o; + size_t tag = CPY_LONG_TAG(lobj); + if (likely(tag == (1 << CPY_NON_SIZE_BITS))) { + // Fast path + digit x = CPY_LONG_DIGIT(lobj, 0); + if (x < 256) + return x; + } else if (likely(tag == CPY_SIGN_ZERO)) { + return 0; + } + #else + PyLongObject *lobj = (PyLongObject *)o; + Py_ssize_t size = lobj->ob_base.ob_size; + if (likely(size == 1)) { + // Fast path + digit x = lobj->ob_digit[0]; + if (x < 256) + return x; + } else if (likely(size == 0)) { + return 0; + } + #endif + } + // Slow path + return CPyLong_AsUInt8_(o); +} + static inline CPyTagged CPyTagged_Negate(CPyTagged num) { if (likely(CPyTagged_CheckShort(num) && num != (CPyTagged) ((Py_ssize_t)1 << (CPY_INT_BITS - 1)))) { diff --git a/mypyc/lib-rt/int_ops.c b/mypyc/lib-rt/int_ops.c index b1b3d6e125f3..9b5d4ef65fb1 100644 --- a/mypyc/lib-rt/int_ops.c +++ b/mypyc/lib-rt/int_ops.c @@ -44,41 +44,6 @@ CPyTagged CPyTagged_FromInt64(int64_t value) { } } -CPyTagged CPyTagged_FromObject(PyObject *object) { - int overflow; - // The overflow check knows about CPyTagged's width - Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow); - if (unlikely(overflow != 0)) { - Py_INCREF(object); - return ((CPyTagged)object) | CPY_INT_TAG; - } else { - return value << 1; - } -} - -CPyTagged CPyTagged_StealFromObject(PyObject *object) { - int overflow; - // The overflow check knows about CPyTagged's width - Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow); - if (unlikely(overflow != 0)) { - return ((CPyTagged)object) | CPY_INT_TAG; - } else { - Py_DECREF(object); - return value << 1; - } -} - -CPyTagged CPyTagged_BorrowFromObject(PyObject *object) { - int overflow; - // The overflow check knows about CPyTagged's width - Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow); - if (unlikely(overflow != 0)) { - return ((CPyTagged)object) | CPY_INT_TAG; - } else { - return value << 1; - } -} - PyObject *CPyTagged_AsObject(CPyTagged x) { PyObject *value; if (unlikely(CPyTagged_CheckLong(x))) { @@ -420,18 +385,8 @@ CPyTagged CPyTagged_Lshift_(CPyTagged left, CPyTagged right) { return CPyTagged_StealFromObject(result); } -int64_t CPyLong_AsInt64(PyObject *o) { - if (likely(PyLong_Check(o))) { - PyLongObject *lobj = (PyLongObject *)o; - Py_ssize_t size = Py_SIZE(lobj); - if (likely(size == 1)) { - // Fast path - return CPY_LONG_DIGIT(lobj, 0); - } else if (likely(size == 0)) { - return 0; - } - } - // Slow path +// i64 unboxing slow path +int64_t CPyLong_AsInt64_(PyObject *o) { int overflow; int64_t result = PyLong_AsLongLongAndOverflow(o, &overflow); if (result == -1) { @@ -479,29 +434,8 @@ int64_t CPyInt64_Remainder(int64_t x, int64_t y) { return d; } -int32_t CPyLong_AsInt32(PyObject *o) { - if (likely(PyLong_Check(o))) { - #if CPY_3_12_FEATURES - PyLongObject *lobj = (PyLongObject *)o; - size_t tag = CPY_LONG_TAG(lobj); - if (likely(tag == (1 << CPY_NON_SIZE_BITS))) { - // Fast path - return CPY_LONG_DIGIT(lobj, 0); - } else if (likely(tag == CPY_SIGN_ZERO)) { - return 0; - } - #else - PyLongObject *lobj = (PyLongObject *)o; - Py_ssize_t size = lobj->ob_base.ob_size; - if (likely(size == 1)) { - // Fast path - return CPY_LONG_DIGIT(lobj, 0); - } else if (likely(size == 0)) { - return 0; - } - #endif - } - // Slow path +// i32 unboxing slow path +int32_t CPyLong_AsInt32_(PyObject *o) { int overflow; long result = PyLong_AsLongAndOverflow(o, &overflow); if (result > 0x7fffffffLL || result < -0x80000000LL) { @@ -557,33 +491,8 @@ void CPyInt32_Overflow() { PyErr_SetString(PyExc_OverflowError, "int too large to convert to i32"); } -int16_t CPyLong_AsInt16(PyObject *o) { - if (likely(PyLong_Check(o))) { - #if CPY_3_12_FEATURES - PyLongObject *lobj = (PyLongObject *)o; - size_t tag = CPY_LONG_TAG(lobj); - if (likely(tag == (1 << CPY_NON_SIZE_BITS))) { - // Fast path - digit x = CPY_LONG_DIGIT(lobj, 0); - if (x < 0x8000) - return x; - } else if (likely(tag == CPY_SIGN_ZERO)) { - return 0; - } - #else - PyLongObject *lobj = (PyLongObject *)o; - Py_ssize_t size = lobj->ob_base.ob_size; - if (likely(size == 1)) { - // Fast path - digit x = lobj->ob_digit[0]; - if (x < 0x8000) - return x; - } else if (likely(size == 0)) { - return 0; - } - #endif - } - // Slow path +// i16 unboxing slow path +int16_t CPyLong_AsInt16_(PyObject *o) { int overflow; long result = PyLong_AsLongAndOverflow(o, &overflow); if (result > 0x7fff || result < -0x8000) { @@ -639,34 +548,8 @@ void CPyInt16_Overflow() { PyErr_SetString(PyExc_OverflowError, "int too large to convert to i16"); } - -uint8_t CPyLong_AsUInt8(PyObject *o) { - if (likely(PyLong_Check(o))) { - #if CPY_3_12_FEATURES - PyLongObject *lobj = (PyLongObject *)o; - size_t tag = CPY_LONG_TAG(lobj); - if (likely(tag == (1 << CPY_NON_SIZE_BITS))) { - // Fast path - digit x = CPY_LONG_DIGIT(lobj, 0); - if (x < 256) - return x; - } else if (likely(tag == CPY_SIGN_ZERO)) { - return 0; - } - #else - PyLongObject *lobj = (PyLongObject *)o; - Py_ssize_t size = lobj->ob_base.ob_size; - if (likely(size == 1)) { - // Fast path - digit x = lobj->ob_digit[0]; - if (x < 256) - return x; - } else if (likely(size == 0)) { - return 0; - } - #endif - } - // Slow path +// u8 unboxing slow path +uint8_t CPyLong_AsUInt8_(PyObject *o) { int overflow; long result = PyLong_AsLongAndOverflow(o, &overflow); if (result < 0 || result >= 256) { diff --git a/mypyc/lib-rt/pythonsupport.c b/mypyc/lib-rt/pythonsupport.c new file mode 100644 index 000000000000..90fb69705a00 --- /dev/null +++ b/mypyc/lib-rt/pythonsupport.c @@ -0,0 +1,106 @@ +// Collects code that was copied in from cpython, for a couple of different reasons: +// * We wanted to modify it to produce a more efficient version for our uses +// * We needed to call it and it was static :( +// * We wanted to call it and needed to backport it + +#include "pythonsupport.h" + +#if CPY_3_12_FEATURES + +// Slow path of CPyLong_AsSsize_tAndOverflow (non-inlined) +Py_ssize_t +CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow) +{ + PyLongObject *v = (PyLongObject *)vv; + size_t x, prev; + Py_ssize_t res; + Py_ssize_t i; + int sign; + + *overflow = 0; + + res = -1; + i = CPY_LONG_TAG(v); + + sign = 1; + x = 0; + if (i & CPY_SIGN_NEGATIVE) { + sign = -1; + } + i >>= CPY_NON_SIZE_BITS; + while (--i >= 0) { + prev = x; + x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i); + if ((x >> PyLong_SHIFT) != prev) { + *overflow = sign; + goto exit; + } + } + /* Haven't lost any bits, but casting to long requires extra + * care. + */ + if (x <= (size_t)CPY_TAGGED_MAX) { + res = (Py_ssize_t)x * sign; + } + else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) { + res = CPY_TAGGED_MIN; + } + else { + *overflow = sign; + /* res is already set to -1 */ + } + exit: + return res; +} + +#else + +// Slow path of CPyLong_AsSsize_tAndOverflow (non-inlined, Python 3.11 and earlier) +Py_ssize_t +CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow) +{ + /* This version by Tim Peters */ + PyLongObject *v = (PyLongObject *)vv; + size_t x, prev; + Py_ssize_t res; + Py_ssize_t i; + int sign; + + *overflow = 0; + + res = -1; + i = Py_SIZE(v); + + sign = 1; + x = 0; + if (i < 0) { + sign = -1; + i = -(i); + } + while (--i >= 0) { + prev = x; + x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i); + if ((x >> PyLong_SHIFT) != prev) { + *overflow = sign; + goto exit; + } + } + /* Haven't lost any bits, but casting to long requires extra + * care. + */ + if (x <= (size_t)CPY_TAGGED_MAX) { + res = (Py_ssize_t)x * sign; + } + else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) { + res = CPY_TAGGED_MIN; + } + else { + *overflow = sign; + /* res is already set to -1 */ + } + exit: + return res; +} + + +#endif diff --git a/mypyc/lib-rt/pythonsupport.h b/mypyc/lib-rt/pythonsupport.h index f7d501f44a27..85f9ec64ac90 100644 --- a/mypyc/lib-rt/pythonsupport.h +++ b/mypyc/lib-rt/pythonsupport.h @@ -129,6 +129,9 @@ init_subclass(PyTypeObject *type, PyObject *kwds) return 0; } +Py_ssize_t +CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow); + #if CPY_3_12_FEATURES static inline Py_ssize_t @@ -136,10 +139,8 @@ CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow) { /* This version by Tim Peters */ PyLongObject *v = (PyLongObject *)vv; - size_t x, prev; Py_ssize_t res; Py_ssize_t i; - int sign; *overflow = 0; @@ -154,35 +155,12 @@ CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow) } else if (i == ((1 << CPY_NON_SIZE_BITS) | CPY_SIGN_NEGATIVE)) { res = -(sdigit)CPY_LONG_DIGIT(v, 0); } else { - sign = 1; - x = 0; - if (i & CPY_SIGN_NEGATIVE) { - sign = -1; - } - i >>= CPY_NON_SIZE_BITS; - while (--i >= 0) { - prev = x; - x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i); - if ((x >> PyLong_SHIFT) != prev) { - *overflow = sign; - goto exit; - } - } - /* Haven't lost any bits, but casting to long requires extra - * care (see comment above). - */ - if (x <= (size_t)CPY_TAGGED_MAX) { - res = (Py_ssize_t)x * sign; - } - else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) { - res = CPY_TAGGED_MIN; - } - else { - *overflow = sign; - /* res is already set to -1 */ - } + // Slow path is moved to a non-inline helper function to + // limit size of generated code + int overflow_local; + res = CPyLong_AsSsize_tAndOverflow_(vv, &overflow_local); + *overflow = overflow_local; } - exit: return res; } @@ -204,10 +182,8 @@ CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow) { /* This version by Tim Peters */ PyLongObject *v = (PyLongObject *)vv; - size_t x, prev; Py_ssize_t res; Py_ssize_t i; - int sign; *overflow = 0; @@ -221,35 +197,12 @@ CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow) } else if (i == -1) { res = -(sdigit)CPY_LONG_DIGIT(v, 0); } else { - sign = 1; - x = 0; - if (i < 0) { - sign = -1; - i = -(i); - } - while (--i >= 0) { - prev = x; - x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i); - if ((x >> PyLong_SHIFT) != prev) { - *overflow = sign; - goto exit; - } - } - /* Haven't lost any bits, but casting to long requires extra - * care (see comment above). - */ - if (x <= (size_t)CPY_TAGGED_MAX) { - res = (Py_ssize_t)x * sign; - } - else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) { - res = CPY_TAGGED_MIN; - } - else { - *overflow = sign; - /* res is already set to -1 */ - } + // Slow path is moved to a non-inline helper function to + // limit size of generated code + int overflow_local; + res = CPyLong_AsSsize_tAndOverflow_(vv, &overflow_local); + *overflow = overflow_local; } - exit: return res; } diff --git a/mypyc/lib-rt/setup.py b/mypyc/lib-rt/setup.py index ef81b794c9bd..66b130581cb3 100644 --- a/mypyc/lib-rt/setup.py +++ b/mypyc/lib-rt/setup.py @@ -58,6 +58,7 @@ def run(self): "list_ops.c", "exc_ops.c", "generic_ops.c", + "pythonsupport.c", ], depends=["CPy.h", "mypyc_util.h", "pythonsupport.h"], extra_compile_args=["-Wno-unused-function", "-Wno-sign-compare"] + compile_args,