Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-90997: Improve inline cache performance for MSVC #96781

Merged
merged 3 commits into from
Sep 15, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 20 additions & 74 deletions Include/internal/pycore_code.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,110 +285,56 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0)
#endif // !Py_STATS

// Cache values are only valid in memory, so use native endianness.
#ifdef WORDS_BIGENDIAN
// NOTE: These cache reading/writing utilities use memcpy to avoid voilating C's
// strict aliasing rules, while also avoiding the need to maintain big-endian
// versions of the same code. Compilers are smart enough to understand what
// we're really trying to do here (see https://blog.regehr.org/archives/959).

// When modifying these, great care must be taken to ensure that we don't break
// or slow down our inline caching! All of these functions should compile to
// simple "move" instructions on all supported compilers and platforms. You can
// use the Compiler Explorer at https://godbolt.org to help verify this.

static inline void
write_u32(uint16_t *p, uint32_t val)
{
p[0] = (uint16_t)(val >> 16);
p[1] = (uint16_t)(val >> 0);
memcpy(p, &val, sizeof(val));
}

static inline void
write_u64(uint16_t *p, uint64_t val)
{
p[0] = (uint16_t)(val >> 48);
p[1] = (uint16_t)(val >> 32);
p[2] = (uint16_t)(val >> 16);
p[3] = (uint16_t)(val >> 0);
}

static inline uint32_t
read_u32(uint16_t *p)
{
uint32_t val = 0;
val |= (uint32_t)p[0] << 16;
val |= (uint32_t)p[1] << 0;
return val;
}

static inline uint64_t
read_u64(uint16_t *p)
{
uint64_t val = 0;
val |= (uint64_t)p[0] << 48;
val |= (uint64_t)p[1] << 32;
val |= (uint64_t)p[2] << 16;
val |= (uint64_t)p[3] << 0;
return val;
}

#else

static inline void
write_u32(uint16_t *p, uint32_t val)
{
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
memcpy(p, &val, sizeof(val));
}

static inline void
write_u64(uint16_t *p, uint64_t val)
write_obj(uint16_t *p, PyObject *val)
{
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
p[2] = (uint16_t)(val >> 32);
p[3] = (uint16_t)(val >> 48);
memcpy(p, &val, sizeof(val));
}

static inline uint32_t
read_u32(uint16_t *p)
{
uint32_t val = 0;
val |= (uint32_t)p[0] << 0;
val |= (uint32_t)p[1] << 16;
uint32_t val;
memcpy(&val, p, sizeof(val));
return val;
}

static inline uint64_t
read_u64(uint16_t *p)
{
uint64_t val = 0;
val |= (uint64_t)p[0] << 0;
val |= (uint64_t)p[1] << 16;
val |= (uint64_t)p[2] << 32;
val |= (uint64_t)p[3] << 48;
uint64_t val;
memcpy(&val, p, sizeof(val));
return val;
}

#endif

static inline void
write_obj(uint16_t *p, PyObject *obj)
{
uintptr_t val = (uintptr_t)obj;
#if SIZEOF_VOID_P == 8
write_u64(p, val);
#elif SIZEOF_VOID_P == 4
write_u32(p, val);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
}

static inline PyObject *
read_obj(uint16_t *p)
{
uintptr_t val;
#if SIZEOF_VOID_P == 8
val = read_u64(p);
#elif SIZEOF_VOID_P == 4
val = read_u32(p);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
return (PyObject *)val;
PyObject *val;
memcpy(&val, p, sizeof(val));
return val;
}

/* See Objects/exception_handling_notes.txt for details.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve the performance of reading and writing inline bytecode caches on
some platforms.