From eaa91eda0fa1e4478ac911e03cd6b8d8244d8536 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 12 Sep 2022 15:13:28 -0700 Subject: [PATCH 1/3] Use memcpy for cache reads/writes --- Include/internal/pycore_code.h | 94 ++++++++-------------------------- 1 file changed, 20 insertions(+), 74 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 7d5fe941fcb4d0..57c9e07618ec6e 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -285,110 +285,56 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0) #endif // !Py_STATS -// Cache values are only valid in memory, so use native endianness. -#ifdef WORDS_BIGENDIAN +// NOTE: These cache reading/writing utilities use memcpy to avoid voilating C's +// strict aliasing rules, while also avoiding the need to maintain big-endian +// versions of the same code. Compilers are smart enough to understand what +// we're really trying to do here (see https://blog.regehr.org/archives/959). + +// When modifying these, great care must be taken to ensure that we don't break +// or slow down our inline caching! All of these functions should compile to +// simple "move" instructions on all supported compilers and platforms. You can +// use the Compiler Explorer at https://godbolt.org to help verify this. static inline void write_u32(uint16_t *p, uint32_t val) { - p[0] = (uint16_t)(val >> 16); - p[1] = (uint16_t)(val >> 0); + memcpy(p, &val, sizeof(val)); } static inline void write_u64(uint16_t *p, uint64_t val) { - p[0] = (uint16_t)(val >> 48); - p[1] = (uint16_t)(val >> 32); - p[2] = (uint16_t)(val >> 16); - p[3] = (uint16_t)(val >> 0); -} - -static inline uint32_t -read_u32(uint16_t *p) -{ - uint32_t val = 0; - val |= (uint32_t)p[0] << 16; - val |= (uint32_t)p[1] << 0; - return val; -} - -static inline uint64_t -read_u64(uint16_t *p) -{ - uint64_t val = 0; - val |= (uint64_t)p[0] << 48; - val |= (uint64_t)p[1] << 32; - val |= (uint64_t)p[2] << 16; - val |= (uint64_t)p[3] << 0; - return val; -} - -#else - -static inline void -write_u32(uint16_t *p, uint32_t val) -{ - p[0] = (uint16_t)(val >> 0); - p[1] = (uint16_t)(val >> 16); + memcpy(p, &val, sizeof(val)); } static inline void -write_u64(uint16_t *p, uint64_t val) +write_obj(uint16_t *p, PyObject *val) { - p[0] = (uint16_t)(val >> 0); - p[1] = (uint16_t)(val >> 16); - p[2] = (uint16_t)(val >> 32); - p[3] = (uint16_t)(val >> 48); + memcpy(p, &val, sizeof(val)); } static inline uint32_t read_u32(uint16_t *p) { - uint32_t val = 0; - val |= (uint32_t)p[0] << 0; - val |= (uint32_t)p[1] << 16; + uint32_t val; + memcpy(&val, p, sizeof(val)); return val; } static inline uint64_t read_u64(uint16_t *p) { - uint64_t val = 0; - val |= (uint64_t)p[0] << 0; - val |= (uint64_t)p[1] << 16; - val |= (uint64_t)p[2] << 32; - val |= (uint64_t)p[3] << 48; + uint64_t val; + memcpy(&val, p, sizeof(val)); return val; } -#endif - -static inline void -write_obj(uint16_t *p, PyObject *obj) -{ - uintptr_t val = (uintptr_t)obj; -#if SIZEOF_VOID_P == 8 - write_u64(p, val); -#elif SIZEOF_VOID_P == 4 - write_u32(p, val); -#else - #error "SIZEOF_VOID_P must be 4 or 8" -#endif -} - static inline PyObject * read_obj(uint16_t *p) { - uintptr_t val; -#if SIZEOF_VOID_P == 8 - val = read_u64(p); -#elif SIZEOF_VOID_P == 4 - val = read_u32(p); -#else - #error "SIZEOF_VOID_P must be 4 or 8" -#endif - return (PyObject *)val; + PyObject *val; + memcpy(&val, p, sizeof(val)); + return val; } /* See Objects/exception_handling_notes.txt for details. From ae8dfea5cccd6955502beadfd02d90e83b3150b0 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 12 Sep 2022 15:15:11 -0700 Subject: [PATCH 2/3] blurb add --- .../2022-09-12-15-15-04.gh-issue-90997.sZO8c9.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-09-12-15-15-04.gh-issue-90997.sZO8c9.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-09-12-15-15-04.gh-issue-90997.sZO8c9.rst b/Misc/NEWS.d/next/Core and Builtins/2022-09-12-15-15-04.gh-issue-90997.sZO8c9.rst new file mode 100644 index 00000000000000..4a43e2babcdef8 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-09-12-15-15-04.gh-issue-90997.sZO8c9.rst @@ -0,0 +1,2 @@ +Improve the performance of reading and writing inline bytecode caches on +some platforms. From 5985a4abaf326193d270570e74b72f3a321d806d Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 13 Sep 2022 21:29:29 -0700 Subject: [PATCH 3/3] Update comment --- Include/internal/pycore_code.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 57c9e07618ec6e..bf5945435c1774 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -285,15 +285,14 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0) #endif // !Py_STATS -// NOTE: These cache reading/writing utilities use memcpy to avoid voilating C's -// strict aliasing rules, while also avoiding the need to maintain big-endian -// versions of the same code. Compilers are smart enough to understand what -// we're really trying to do here (see https://blog.regehr.org/archives/959). - -// When modifying these, great care must be taken to ensure that we don't break -// or slow down our inline caching! All of these functions should compile to -// simple "move" instructions on all supported compilers and platforms. You can -// use the Compiler Explorer at https://godbolt.org to help verify this. +// Utility functions for reading/writing 32/64-bit values in the inline caches. +// Great care should be taken to ensure that these functions remain correct and +// performant! They should compile to just "move" instructions on all supported +// compilers and platforms. + +// We use memcpy to let the C compiler handle unaligned accesses and endianness +// issues for us. It also seems to produce better code than manual copying for +// most compilers (see https://blog.regehr.org/archives/959 for more info). static inline void write_u32(uint16_t *p, uint32_t val)