Skip to content

bpo-46841: Use inline caching for attribute accesses #31640

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 3, 2022
137 changes: 112 additions & 25 deletions Include/internal/pycore_code.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,8 @@ typedef struct {
uint32_t version;
} _PyAdaptiveEntry;


typedef struct {
uint32_t tp_version;
uint32_t dk_version;
} _PyAttrCache;

typedef struct {
/* Borrowed ref in LOAD_METHOD */
/* Borrowed ref */
PyObject *obj;
} _PyObjectCache;

Expand All @@ -51,7 +45,6 @@ typedef struct {
typedef union {
_PyEntryZero zero;
_PyAdaptiveEntry adaptive;
_PyAttrCache attr;
_PyObjectCache obj;
_PyCallCache call;
} SpecializedCacheEntry;
Expand All @@ -65,8 +58,7 @@ typedef union {
typedef struct {
_Py_CODEUNIT counter;
_Py_CODEUNIT index;
_Py_CODEUNIT module_keys_version;
_Py_CODEUNIT _m1;
_Py_CODEUNIT module_keys_version[2];
_Py_CODEUNIT builtin_keys_version;
} _PyLoadGlobalCache;

Expand Down Expand Up @@ -94,13 +86,32 @@ typedef struct {

typedef struct {
_Py_CODEUNIT counter;
_Py_CODEUNIT type_version;
_Py_CODEUNIT _t1;
_Py_CODEUNIT type_version[2];
_Py_CODEUNIT func_version;
} _PyBinarySubscrCache;

#define INLINE_CACHE_ENTRIES_BINARY_SUBSCR CACHE_ENTRIES(_PyBinarySubscrCache)

typedef struct {
_Py_CODEUNIT counter;
_Py_CODEUNIT version[2];
_Py_CODEUNIT index;
} _PyAttrCache;

#define INLINE_CACHE_ENTRIES_LOAD_ATTR CACHE_ENTRIES(_PyAttrCache)

#define INLINE_CACHE_ENTRIES_STORE_ATTR CACHE_ENTRIES(_PyAttrCache)

typedef struct {
_Py_CODEUNIT counter;
_Py_CODEUNIT type_version[2];
_Py_CODEUNIT dict_offset;
_Py_CODEUNIT keys_version[2];
_Py_CODEUNIT descr[4];
} _PyLoadMethodCache;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that LOAD_METHOD_MODULE and LOAD_ATTR_MODULE share code, maybe change this to:

typedef struct {
     _PyAttrCache attr;
    _Py_CODEUNIT keys_version[2];
    _Py_CODEUNIT descr[4];
} _PyLoadMethodCache;

Copy link
Member Author

@brandtbucher brandtbucher Mar 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather leave it as-is. Otherwise, accessing the attr methods, including the counter, is a bit awkward.

In response to your other comments: I don't think we're doing anything illegal here, since we only use one consistent cache layout for each specialized form (meaning, _PyLoadMethodCache and _AttrCache members never alias each other). The exception is the counter member for all instructions, but C guarantees that it the first member of a struct will always be at offset 0. So no issues there.


#define INLINE_CACHE_ENTRIES_LOAD_METHOD CACHE_ENTRIES(_PyLoadMethodCache)

/* Maximum size of code to quicken, in code units. */
#define MAX_SIZE_TO_QUICKEN 5000

Expand Down Expand Up @@ -328,10 +339,13 @@ cache_backoff(_PyAdaptiveEntry *entry) {

/* Specialization functions */

extern int _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr, PyObject *name, SpecializedCacheEntry *cache);
extern int _Py_Specialize_StoreAttr(PyObject *owner, _Py_CODEUNIT *instr, PyObject *name, SpecializedCacheEntry *cache);
extern int _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr,
PyObject *name);
extern int _Py_Specialize_StoreAttr(PyObject *owner, _Py_CODEUNIT *instr,
PyObject *name);
extern int _Py_Specialize_LoadGlobal(PyObject *globals, PyObject *builtins, _Py_CODEUNIT *instr, PyObject *name);
extern int _Py_Specialize_LoadMethod(PyObject *owner, _Py_CODEUNIT *instr, PyObject *name, SpecializedCacheEntry *cache);
extern int _Py_Specialize_LoadMethod(PyObject *owner, _Py_CODEUNIT *instr,
PyObject *name);
extern int _Py_Specialize_BinarySubscr(PyObject *sub, PyObject *container, _Py_CODEUNIT *instr);
extern int _Py_Specialize_StoreSubscr(PyObject *container, PyObject *sub, _Py_CODEUNIT *instr);
extern int _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr, int nargs,
Expand Down Expand Up @@ -416,34 +430,107 @@ extern PyObject* _Py_GetSpecializationStats(void);
#ifdef WORDS_BIGENDIAN

static inline void
write32(uint16_t *p, uint32_t val)
write_u32(uint16_t *p, uint32_t val)
{
p[0] = val >> 16;
p[1] = (uint16_t)val;
p[0] = (uint16_t)(val >> 16);
p[1] = (uint16_t)(val >> 0);
}

static inline void
write_u64(uint16_t *p, uint64_t val)
{
p[0] = (uint16_t)(val >> 48);
p[1] = (uint16_t)(val >> 32);
p[2] = (uint16_t)(val >> 16);
p[3] = (uint16_t)(val >> 0);
}

static inline uint32_t
read32(uint16_t *p)
read_u32(uint16_t *p)
{
uint32_t val = 0;
val |= (uint32_t)p[0] << 16;
val |= (uint32_t)p[1] << 0;
return val;
}

static inline uint64_t
read_u64(uint16_t *p)
{
return (p[0] << 16) | p[1];
uint64_t val = 0;
val |= (uint64_t)p[0] << 48;
val |= (uint64_t)p[1] << 32;
val |= (uint64_t)p[2] << 16;
val |= (uint64_t)p[3] << 0;
return val;
}

#else

static inline void
write32(uint16_t *p, uint32_t val)
write_u32(uint16_t *p, uint32_t val)
{
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
}

static inline void
write_u64(uint16_t *p, uint64_t val)
{
p[0] = (uint16_t)val;
p[1] = val >> 16;
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
p[2] = (uint16_t)(val >> 32);
p[3] = (uint16_t)(val >> 48);
}

static inline uint32_t
read32(uint16_t *p)
read_u32(uint16_t *p)
{
uint32_t val = 0;
val |= (uint32_t)p[0] << 0;
val |= (uint32_t)p[1] << 16;
return val;
}

static inline uint64_t
read_u64(uint16_t *p)
{
uint64_t val = 0;
val |= (uint64_t)p[0] << 0;
val |= (uint64_t)p[1] << 16;
val |= (uint64_t)p[2] << 32;
val |= (uint64_t)p[3] << 48;
return val;
}

#endif

static inline void
write_obj(uint16_t *p, PyObject *obj)
{
return p[0] | (p[1] << 16);
uintptr_t val = (uintptr_t)obj;
#if SIZEOF_VOID_P == 8
write_u64(p, val);
#elif SIZEOF_VOID_P == 4
write_u32(p, val);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
}

static inline PyObject *
read_obj(uint16_t *p)
{
uintptr_t val;
#if SIZEOF_VOID_P == 8
val = read_u64(p);
#elif SIZEOF_VOID_P == 4
val = read_u32(p);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
return (PyObject *)val;
}

#ifdef __cplusplus
}
Expand Down
4 changes: 3 additions & 1 deletion Include/opcode.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Lib/importlib/_bootstrap_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,8 @@ def _write_atomic(path, data, mode=0o666):
# Python 3.11a5 3481 (Use inline cache for BINARY_OP)
# Python 3.11a5 3482 (Use inline caching for UNPACK_SEQUENCE and LOAD_GLOBAL)
# Python 3.11a5 3483 (Use inline caching for COMPARE_OP and BINARY_SUBSCR)
# Python 3.11a5 3484 (Use inline caching for LOAD_ATTR, LOAD_METHOD, and
# STORE_ATTR)

# Python 3.12 will start with magic number 3500

Expand All @@ -404,7 +406,7 @@ def _write_atomic(path, data, mode=0o666):
# Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array
# in PC/launcher.c must also be updated.

MAGIC_NUMBER = (3483).to_bytes(2, 'little') + b'\r\n'
MAGIC_NUMBER = (3484).to_bytes(2, 'little') + b'\r\n'
_RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c

_PYCACHE = '__pycache__'
Expand Down
7 changes: 3 additions & 4 deletions Lib/opcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def jabs_op(name, op, entries=0):
def_op('UNPACK_SEQUENCE', 92, 1) # Number of tuple items
jrel_op('FOR_ITER', 93)
def_op('UNPACK_EX', 94)
name_op('STORE_ATTR', 95) # Index in name list
name_op('STORE_ATTR', 95, 4) # Index in name list
name_op('DELETE_ATTR', 96) # ""
name_op('STORE_GLOBAL', 97) # ""
name_op('DELETE_GLOBAL', 98) # ""
Expand All @@ -124,7 +124,7 @@ def jabs_op(name, op, entries=0):
def_op('BUILD_LIST', 103) # Number of list items
def_op('BUILD_SET', 104) # Number of set items
def_op('BUILD_MAP', 105) # Number of dict entries
name_op('LOAD_ATTR', 106) # Index in name list
name_op('LOAD_ATTR', 106, 4) # Index in name list
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The existence of the LOAD_MODULE_ATTR_OR_METHOD macro implies that the LOAD_ATTR and LOAD_METHOD caches should have the same layout. Yet, this has size 4 and LOAD_METHOD has size 10.

This may be the source of your problem.
Are you using the two version numbers consistently?

def_op('COMPARE_OP', 107, 2) # Comparison operator
hascompare.append(107)
name_op('IMPORT_NAME', 108) # Index in name list
Expand Down Expand Up @@ -186,7 +186,7 @@ def jabs_op(name, op, entries=0):
def_op('BUILD_CONST_KEY_MAP', 156)
def_op('BUILD_STRING', 157)

name_op('LOAD_METHOD', 160)
name_op('LOAD_METHOD', 160, 10)

def_op('LIST_EXTEND', 162)
def_op('SET_UPDATE', 163)
Expand Down Expand Up @@ -301,7 +301,6 @@ def jabs_op(name, op, entries=0):
"LOAD_FAST__LOAD_CONST",
"LOAD_CONST__LOAD_FAST",
"STORE_FAST__STORE_FAST",
"LOAD_FAST__LOAD_ATTR_INSTANCE_VALUE",
]
_specialization_stats = [
"success",
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_dis.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ def bug42562():
>> PUSH_EXC_INFO

%3d LOAD_GLOBAL 0 (Exception)
JUMP_IF_NOT_EXC_MATCH 31 (to 62)
JUMP_IF_NOT_EXC_MATCH 35 (to 70)
STORE_FAST 0 (e)

%3d LOAD_FAST 0 (e)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Use inline caching for :opcode:`LOAD_ATTR`, :opcode:`LOAD_METHOD`, and
:opcode:`STORE_ATTR`.
16 changes: 9 additions & 7 deletions Programs/test_frozenmain.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading