python · brandtbucher · Sep 15, 2022 · Sep 12, 2022 · Sep 12, 2022 · Sep 14, 2022
diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
@@ -285,110 +285,56 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
 #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0)
 #endif  // !Py_STATS
 
-// Cache values are only valid in memory, so use native endianness.
-#ifdef WORDS_BIGENDIAN
+// NOTE: These cache reading/writing utilities use memcpy to avoid voilating C's
+// strict aliasing rules, while also avoiding the need to maintain big-endian
+// versions of the same code. Compilers are smart enough to understand what
+// we're really trying to do here (see https://blog.regehr.org/archives/959).
+
+// When modifying these, great care must be taken to ensure that we don't break
+// or slow down our inline caching! All of these functions should compile to
+// simple "move" instructions on all supported compilers and platforms. You can
+// use the Compiler Explorer at https://godbolt.org to help verify this.
 
 static inline void
 write_u32(uint16_t *p, uint32_t val)
 {
-    p[0] = (uint16_t)(val >> 16);
-    p[1] = (uint16_t)(val >>  0);
+    memcpy(p, &val, sizeof(val));
 }
 
 static inline void
 write_u64(uint16_t *p, uint64_t val)
 {
-    p[0] = (uint16_t)(val >> 48);
-    p[1] = (uint16_t)(val >> 32);
-    p[2] = (uint16_t)(val >> 16);
-    p[3] = (uint16_t)(val >>  0);
-}
-
-static inline uint32_t
-read_u32(uint16_t *p)
-{
-    uint32_t val = 0;
-    val |= (uint32_t)p[0] << 16;
-    val |= (uint32_t)p[1] <<  0;
-    return val;
-}
-
-static inline uint64_t
-read_u64(uint16_t *p)
-{
-    uint64_t val = 0;
-    val |= (uint64_t)p[0] << 48;
-    val |= (uint64_t)p[1] << 32;
-    val |= (uint64_t)p[2] << 16;
-    val |= (uint64_t)p[3] <<  0;
-    return val;
-}
-
-#else
-
-static inline void
-write_u32(uint16_t *p, uint32_t val)
-{
-    p[0] = (uint16_t)(val >>  0);
-    p[1] = (uint16_t)(val >> 16);
+    memcpy(p, &val, sizeof(val));
 }
 
 static inline void
-write_u64(uint16_t *p, uint64_t val)
+write_obj(uint16_t *p, PyObject *val)
 {
-    p[0] = (uint16_t)(val >>  0);
-    p[1] = (uint16_t)(val >> 16);
-    p[2] = (uint16_t)(val >> 32);
-    p[3] = (uint16_t)(val >> 48);
+    memcpy(p, &val, sizeof(val));
 }
 
 static inline uint32_t
 read_u32(uint16_t *p)
 {
-    uint32_t val = 0;
-    val |= (uint32_t)p[0] <<  0;
-    val |= (uint32_t)p[1] << 16;
+    uint32_t val;
+    memcpy(&val, p, sizeof(val));
     return val;
 }
 
 static inline uint64_t
 read_u64(uint16_t *p)
 {
-    uint64_t val = 0;
-    val |= (uint64_t)p[0] <<  0;
-    val |= (uint64_t)p[1] << 16;
-    val |= (uint64_t)p[2] << 32;
-    val |= (uint64_t)p[3] << 48;
+    uint64_t val;
+    memcpy(&val, p, sizeof(val));
     return val;
 }
 
-#endif
-
-static inline void
-write_obj(uint16_t *p, PyObject *obj)
-{
-    uintptr_t val = (uintptr_t)obj;
-#if SIZEOF_VOID_P == 8
-    write_u64(p, val);
-#elif SIZEOF_VOID_P == 4
-    write_u32(p, val);
-#else
-    #error "SIZEOF_VOID_P must be 4 or 8"
-#endif
-}
-
 static inline PyObject *
 read_obj(uint16_t *p)
 {
-    uintptr_t val;
-#if SIZEOF_VOID_P == 8
-    val = read_u64(p);
-#elif SIZEOF_VOID_P == 4
-    val = read_u32(p);
-#else
-    #error "SIZEOF_VOID_P must be 4 or 8"
-#endif
-    return (PyObject *)val;
+    PyObject *val;
+    memcpy(&val, p, sizeof(val));
+    return val;
 }
 
 /* See Objects/exception_handling_notes.txt for details.

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-09-12-15-15-04.gh-issue-90997.sZO8c9.rst b/Misc/NEWS.d/next/Core and Builtins/2022-09-12-15-15-04.gh-issue-90997.sZO8c9.rst
@@ -0,0 +1,2 @@
+Improve the performance of reading and writing inline bytecode caches on
+some platforms.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Improve the performance of reading and writing inline bytecode caches on
		some platforms.