bpo-46841: Use inline caching for calls #31709

Merged Mar 7, 2022 · 9 commits
2 changes: 1 addition & 1 deletion Include/cpython/code.h
@@ -105,7 +105,7 @@ struct PyCodeObject {
/* Quickened instructions and cache, or NULL
This should be treated as opaque by all code except the specializer and
interpreter. */
union _cache_or_instruction *co_quickened;
_Py_CODEUNIT *co_quickened;

};

141 changes: 26 additions & 115 deletions Include/internal/pycore_code.h
@@ -8,50 +8,10 @@ extern "C" {
* Specialization and quickening structs and helper functions
*/

typedef struct {
int32_t cache_count;
int32_t _; /* Force 8 byte size */
} _PyEntryZero;

typedef struct {
uint8_t original_oparg;
uint8_t counter;
uint16_t index;
uint32_t version;
} _PyAdaptiveEntry;

typedef struct {
/* Borrowed ref */
PyObject *obj;
} _PyObjectCache;

typedef struct {
uint32_t func_version;
uint16_t min_args;
uint16_t defaults_len;
} _PyCallCache;


/* Add specialized versions of entries to this union.
*
* Do not break the invariant: sizeof(SpecializedCacheEntry) == 8
* Preserving this invariant is necessary because:
- If any one form uses more space, then all must, and on 64-bit machines
this is likely to double the memory consumption of caches
- The function for calculating the offset of caches assumes a 4:1
cache:instruction size ratio. Changing that would need careful
analysis to choose a new function.
*/
typedef union {
_PyEntryZero zero;
_PyAdaptiveEntry adaptive;
_PyObjectCache obj;
_PyCallCache call;
} SpecializedCacheEntry;

#define INSTRUCTIONS_PER_ENTRY (sizeof(SpecializedCacheEntry)/sizeof(_Py_CODEUNIT))

/* Inline caches */
// Inline caches. If you change the number of cache entries for an instruction,
// you must *also* update the number of cache entries in Lib/opcode.py and bump
// the magic number in Lib/importlib/_bootstrap_external.py!

#define CACHE_ENTRIES(cache) (sizeof(cache)/sizeof(_Py_CODEUNIT))
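
CACHE_ENTRIES measures a cache struct in 16-bit code units, the unit the quickened instruction stream is made of. A minimal standalone sketch of the arithmetic, using the _PyCallCache layout added later in this diff (typedef'ing _Py_CODEUNIT as uint16_t matches CPython's headers; the main() harness is illustrative only):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t _Py_CODEUNIT;

#define CACHE_ENTRIES(cache) (sizeof(cache)/sizeof(_Py_CODEUNIT))

typedef struct {
    _Py_CODEUNIT counter;         /* 1 code unit  */
    _Py_CODEUNIT func_version[2]; /* 2 code units */
    _Py_CODEUNIT min_args;        /* 1 code unit  */
} _PyCallCache;

int main(void)
{
    /* CALL reserves this many cache slots after the instruction. */
    printf("%zu\n", CACHE_ENTRIES(_PyCallCache)); /* prints 4 */
    return 0;
}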

@@ -112,73 +72,22 @@ typedef struct {

#define INLINE_CACHE_ENTRIES_LOAD_METHOD CACHE_ENTRIES(_PyLoadMethodCache)

/* Maximum size of code to quicken, in code units. */
#define MAX_SIZE_TO_QUICKEN 5000

typedef union _cache_or_instruction {
_Py_CODEUNIT code[1];
SpecializedCacheEntry entry;
} SpecializedCacheOrInstruction;
typedef struct {
_Py_CODEUNIT counter;
_Py_CODEUNIT func_version[2];
_Py_CODEUNIT min_args;
} _PyCallCache;

/* Get pointer to the nth cache entry, from the first instruction and n.
* Cache entries are indexed backwards, with [count-1] first in memory, and [0] last.
* The zeroth entry immediately precedes the instructions.
*/
static inline SpecializedCacheEntry *
_GetSpecializedCacheEntry(const _Py_CODEUNIT *first_instr, Py_ssize_t n)
{
SpecializedCacheOrInstruction *last_cache_plus_one = (SpecializedCacheOrInstruction *)first_instr;
assert(&last_cache_plus_one->code[0] == first_instr);
return &last_cache_plus_one[-1-n].entry;
}
#define INLINE_CACHE_ENTRIES_CALL CACHE_ENTRIES(_PyCallCache)
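
Note that func_version occupies two code units: a 32-bit version stamp has to be split on write and reassembled on read. A sketch of how that can be done portably (these memcpy-based helpers are an assumption for illustration, not part of the diff):

#include <stdint.h>
#include <string.h>

typedef uint16_t _Py_CODEUNIT;

/* Store a 32-bit value across two adjacent 16-bit cache units. */
static inline void
write_u32(_Py_CODEUNIT *p, uint32_t val)
{
    memcpy(p, &val, sizeof(val));
}

/* Reassemble the 32-bit value from the same two cache units. */
static inline uint32_t
read_u32(const _Py_CODEUNIT *p)
{
    uint32_t val;
    memcpy(&val, p, sizeof(val));
    return val;
}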

/* Following two functions form a pair.
*
* oparg_from_offset_and_nexti() is used to compute the oparg
* when quickening, so that offset_from_oparg_and_nexti()
* can be used at runtime to compute the offset.
*
* The relationship between the three values is currently
* offset == (index>>1) + oparg
* This relation is chosen based on the following observations:
* 1. typically 1 in 4 instructions needs a cache
* 2. instructions that need a cache typically use 2 entries
* These observations imply: offset ≈ index/2
* We use the oparg to fine-tune the relation to avoid wasting space
* and allow consecutive instructions to use caches.
*
* If the number of cache entries < number of instructions/2, we will waste
* some small amount of space.
* If the number of cache entries > (number of instructions/2) + 255, then
* some instructions will not be able to use a cache.
* In practice, we expect a small amount of wasted space in shorter functions,
* and only functions exceeding 1000 lines or more not to have enough cache space.
*
*/
static inline int
oparg_from_offset_and_nexti(int offset, int nexti)
{
return offset-(nexti>>1);
}
typedef struct {
_Py_CODEUNIT counter;
} _PyPrecallCache;

static inline int
offset_from_oparg_and_nexti(int oparg, int nexti)
{
return (nexti>>1)+oparg;
}
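
A worked example of the removed relation: with nexti == 10 and a cache at offset 7, quickening stores oparg = 7 - (10>>1) = 2, and the interpreter later recovers offset = (10>>1) + 2 = 7. A self-contained round-trip check:

#include <assert.h>

static inline int
oparg_from_offset_and_nexti(int offset, int nexti)
{
    return offset - (nexti >> 1);
}

static inline int
offset_from_oparg_and_nexti(int oparg, int nexti)
{
    return (nexti >> 1) + oparg;
}

int main(void)
{
    /* The oparg written at quickening time must let the runtime
     * recompute the original cache offset exactly. */
    for (int nexti = 0; nexti < 100; nexti++) {
        for (int offset = nexti >> 1; offset < (nexti >> 1) + 256; offset++) {
            int oparg = oparg_from_offset_and_nexti(offset, nexti);
            assert(oparg >= 0 && oparg <= 255);
            assert(offset_from_oparg_and_nexti(oparg, nexti) == offset);
        }
    }
    return 0;
}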
#define INLINE_CACHE_ENTRIES_PRECALL CACHE_ENTRIES(_PyPrecallCache)

/* Get pointer to the cache entry associated with an instruction.
* nexti is the index of the instruction plus one.
* nexti is used as it corresponds to the instruction pointer in the interpreter.
* This doesn't check that an entry has been allocated for that instruction. */
static inline SpecializedCacheEntry *
_GetSpecializedCacheEntryForInstruction(const _Py_CODEUNIT *first_instr, int nexti, int oparg)
{
return _GetSpecializedCacheEntry(
first_instr,
offset_from_oparg_and_nexti(oparg, nexti)
);
}
/* Maximum size of code to quicken, in code units. */
#define MAX_SIZE_TO_QUICKEN 10000
Member: Is this just to get the unpack sequence benchmark to work again, or something else?

Member Author: Nope, just for unpack_sequence.


#define QUICKENING_WARMUP_DELAY 8
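
As a rough model of the warm-up gate this constant implies (the counter field and helper names below are assumptions, not CPython's actual internals): each entry into a code object bumps a counter, and quickening runs once the count reaches QUICKENING_WARMUP_DELAY.

#include <stdio.h>

#define QUICKENING_WARMUP_DELAY 8

/* Toy stand-in for a code object's warm-up state (assumed name). */
typedef struct {
    int warmup;
} toy_code;

static void
enter_code(toy_code *code)
{
    if (++code->warmup == QUICKENING_WARMUP_DELAY) {
        /* Here real code would allocate the quickened copy,
         * inserting inline cache slots after instructions. */
        printf("quickening after %d entries\n", code->warmup);
    }
}

int main(void)
{
    toy_code code = { 0 };
    for (int i = 0; i < 10; i++) {
        enter_code(&code);  /* fires exactly once, on the 8th entry */
    }
    return 0;
}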

@@ -205,6 +114,13 @@ _Py_IncrementCountAndMaybeQuicken(PyCodeObject *code)

extern Py_ssize_t _Py_QuickenedCount;

// Borrowed references to common callables:
struct callable_cache {
PyObject *isinstance;
PyObject *len;
PyObject *list_append;
};
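
The design point here is that specialization checks identity, not name: a call site can be specialized to, say, isinstance only if the callable is pointer-equal to the interpreter's cached builtin. A standalone toy model of that check (names and harness are illustrative, not CPython code):

#include <stdio.h>

/* Toy model: borrowed pointers to the interpreter's builtins. */
struct callable_cache {
    const void *isinstance;
    const void *len;
    const void *list_append;
};

/* Pointer identity is cheap and exact: a shadowed or rebound
 * builtin simply fails the check and the call stays generic. */
static int
can_specialize_isinstance(const struct callable_cache *cache,
                          const void *callable)
{
    return callable == cache->isinstance;
}

int main(void)
{
    static int builtin_isinstance, user_shadow;
    struct callable_cache cache = { &builtin_isinstance, NULL, NULL };
    printf("%d\n", can_specialize_isinstance(&cache, &builtin_isinstance)); /* 1 */
    printf("%d\n", can_specialize_isinstance(&cache, &user_shadow));        /* 0 */
    return 0;
}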
Member: I think the existence of PyList_Type as part of the API means that list.append must be per-process unique. In other words, list_append could be static.

I'm happy to leave it as is for now, though. We should look to make the whole struct static, although the mutability of builtin functions makes that tricky for isinstance and len.

Member Author: I believe each interpreter has its own builtins module (check out _PyBuiltin_Init), so making this static could be tricky. As you said, though: probably worth looking into in the future.


/* "Locals plus" for a code object is the set of locals + cell vars +
* free vars. This relates to variable names as well as offsets into
* the "fast locals" storage array of execution frames. The compiler
@@ -332,11 +248,6 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);

#define ADAPTIVE_CACHE_BACKOFF 64

static inline void
cache_backoff(_PyAdaptiveEntry *entry) {
entry->counter = ADAPTIVE_CACHE_BACKOFF;
}
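
With inline caches the adaptive counter is itself a cache unit next to the instruction, so the removed helper's role passes to something like the following (the helper name is assumed; only ADAPTIVE_CACHE_BACKOFF comes from this header):

#include <stdint.h>

typedef uint16_t _Py_CODEUNIT;

#define ADAPTIVE_CACHE_BACKOFF 64

/* Sketch: on a specialization miss, write the backoff delay into
 * the counter cache unit so the adaptive instruction waits that
 * many executions before attempting to specialize again. */
static inline void
inline_cache_backoff(_Py_CODEUNIT *counter)
{
    *counter = ADAPTIVE_CACHE_BACKOFF;
}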

/* Specialization functions */

extern int _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr,
@@ -348,10 +259,10 @@ extern int _Py_Specialize_LoadMethod(PyObject *owner, _Py_CODEUNIT *instr,
PyObject *name);
extern int _Py_Specialize_BinarySubscr(PyObject *sub, PyObject *container, _Py_CODEUNIT *instr);
extern int _Py_Specialize_StoreSubscr(PyObject *container, PyObject *sub, _Py_CODEUNIT *instr);
extern int _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr, int nargs,
PyObject *kwnames, SpecializedCacheEntry *cache);
extern int _Py_Specialize_Precall(PyObject *callable, _Py_CODEUNIT *instr, int nargs,
PyObject *kwnames, SpecializedCacheEntry *cache, PyObject *builtins);
extern int _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr,
int nargs, PyObject *kwnames);
extern int _Py_Specialize_Precall(PyObject *callable, _Py_CODEUNIT *instr,
int nargs, PyObject *kwnames, int oparg);
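
The dropped SpecializedCacheEntry parameters reflect the new layout: cache entries now sit immediately after the instruction, so the specializer can recover them from instr alone. A toy sketch of that addressing (the cast mirrors the pattern the new signatures imply; the harness is illustrative):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t _Py_CODEUNIT;

typedef struct {
    _Py_CODEUNIT counter;
    _Py_CODEUNIT func_version[2];
    _Py_CODEUNIT min_args;
} _PyCallCache;

int main(void)
{
    /* Toy bytecode: a CALL instruction followed by its four
     * inline CACHE units, as laid out in quickened code. */
    _Py_CODEUNIT code[5] = {0};
    _Py_CODEUNIT *instr = &code[0];

    /* The cache is found from the instruction pointer alone. */
    _PyCallCache *cache = (_PyCallCache *)(instr + 1);
    cache->counter = 53;
    printf("%u\n", (unsigned)code[1]); /* 53 */
    return 0;
}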
extern void _Py_Specialize_BinaryOp(PyObject *lhs, PyObject *rhs, _Py_CODEUNIT *instr,
int oparg);
extern void _Py_Specialize_CompareOp(PyObject *lhs, PyObject *rhs,
2 changes: 2 additions & 0 deletions Include/internal/pycore_global_strings.h
@@ -269,6 +269,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(inf)
STRUCT_FOR_ID(intersection)
STRUCT_FOR_ID(isatty)
STRUCT_FOR_ID(isinstance)
STRUCT_FOR_ID(items)
STRUCT_FOR_ID(iter)
STRUCT_FOR_ID(join)
@@ -278,6 +279,7 @@
STRUCT_FOR_ID(last_type)
STRUCT_FOR_ID(last_value)
STRUCT_FOR_ID(latin1)
STRUCT_FOR_ID(len)
STRUCT_FOR_ID(line)
STRUCT_FOR_ID(lineno)
STRUCT_FOR_ID(listcomp)
2 changes: 2 additions & 0 deletions Include/internal/pycore_interp.h
@@ -12,6 +12,7 @@ extern "C" {

#include "pycore_atomic.h" // _Py_atomic_address
#include "pycore_ast_state.h" // struct ast_state
#include "pycore_code.h" // struct callable_cache
#include "pycore_context.h" // struct _Py_context_state
#include "pycore_dict.h" // struct _Py_dict_state
#include "pycore_exceptions.h" // struct _Py_exc_state
@@ -176,6 +177,7 @@ struct _is {

struct ast_state ast;
struct type_cache type_cache;
struct callable_cache callable_cache;

/* The following fields are here to avoid allocation during init.
The data is exposed through PyInterpreterState pointer fields.
2 changes: 2 additions & 0 deletions Include/internal/pycore_runtime_init.h
@@ -884,6 +884,7 @@ extern "C" {
INIT_ID(inf), \
INIT_ID(intersection), \
INIT_ID(isatty), \
INIT_ID(isinstance), \
INIT_ID(items), \
INIT_ID(iter), \
INIT_ID(join), \
@@ -893,6 +894,7 @@
INIT_ID(last_type), \
INIT_ID(last_value), \
INIT_ID(latin1), \
INIT_ID(len), \
INIT_ID(line), \
INIT_ID(lineno), \
INIT_ID(listcomp), \