Skip to content

Commit

Permalink
gh-104584: Baby steps towards generating and executing traces (#105924)
Browse files Browse the repository at this point in the history
Added a new, experimental, tracing optimizer and interpreter (a.k.a. "tier 2"). This currently pessimizes, so don't use yet -- this is infrastructure so we can experiment with optimizing passes. To enable it, pass ``-Xuops`` or set ``PYTHONUOPS=1``. To get debug output, set ``PYTHONUOPSDEBUG=N`` where ``N`` is a debug level (0-4, where 0 is no debug output and 4 is excessively verbose).

All of this code is likely to change dramatically before the 3.13 feature freeze. But this is a first step.
  • Loading branch information
gvanrossum authored Jun 27, 2023
1 parent d3af83b commit 51fc725
Show file tree
Hide file tree
Showing 21 changed files with 2,559 additions and 305 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ Parser/token.c generated
Programs/test_frozenmain.h generated
Python/Python-ast.c generated
Python/generated_cases.c.h generated
Python/executor_cases.c.h generated
Python/opcode_targets.h generated
Python/stdlib_module_names.h generated
Tools/peg_generator/pegen/grammar_parser.py generated
Expand Down
1 change: 1 addition & 0 deletions Include/cpython/optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ extern _PyOptimizerObject _PyOptimizer_Default;

/* For testing */
PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewCounter(void);
PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void);

#define OPTIMIZER_BITS_IN_COUNTER 4

Expand Down
31 changes: 31 additions & 0 deletions Include/internal/pycore_uops.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef Py_INTERNAL_UOPS_H
#define Py_INTERNAL_UOPS_H
#ifdef __cplusplus
extern "C" {
#endif

#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif

#define _Py_UOP_MAX_TRACE_LENGTH 16

// One micro-op ("uop") of a trace.
//
// _PyUopExecute() dispatches on `opcode` and hands `operand` to the
// instruction body; it also derives the usual `oparg` by narrowing
// `operand` to int.
typedef struct {
// Selects the case executed by the uop interpreter's switch.
int opcode;
uint64_t operand; // Sometimes oparg, sometimes a cache entry
} _PyUOpInstruction;

// Executor object that carries a uop trace.
//
// `base` is the first member, and _PyUopExecute() relies on that layout: it
// receives a _PyExecutorObject * and casts it to _PyUOpExecutorObject *.
//
// The trace buffer has a fixed capacity of _Py_UOP_MAX_TRACE_LENGTH entries.
// _PyUopExecute() walks it from index 0 with no bounds check and only stops
// on EXIT_TRACE, an error or a deopt.
// NOTE(review): presumably the code that fills `trace` must therefore always
// terminate it with EXIT_TRACE -- confirm against the optimizer.
typedef struct {
_PyExecutorObject base;
_PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH]; // TODO: variable length
} _PyUOpExecutorObject;

// Run the uop trace stored in `executor` (which is cast to
// _PyUOpExecutorObject *) on `frame`, starting with the value stack at
// `stack_pointer`.
//
// Returns `frame` when the trace finishes (EXIT_TRACE) or deoptimizes, and
// NULL on error.  On every return path the current stack pointer is stored
// back into the frame and one reference to the executor is released.
_PyInterpreterFrame *_PyUopExecute(
_PyExecutorObject *executor,
_PyInterpreterFrame *frame,
PyObject **stack_pointer);

#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_UOPS_H */
3 changes: 3 additions & 0 deletions Include/pystats.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ typedef struct _object_stats {
uint64_t type_cache_dunder_misses;
uint64_t type_cache_collisions;
uint64_t optimization_attempts;
uint64_t optimization_traces_created;
uint64_t optimization_traces_executed;
uint64_t optimization_uops_executed;
} ObjectStats;

typedef struct _stats {
Expand Down
21 changes: 9 additions & 12 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -1542,19 +1542,9 @@ regen-opcode-targets:

.PHONY: regen-cases
regen-cases:
# Regenerate Python/generated_cases.c.h
# and Python/opcode_metadata.h
# from Python/bytecodes.c
# using Tools/cases_generator/generate_cases.py
# Regenerate various files from Python/bytecodes.c
PYTHONPATH=$(srcdir)/Tools/cases_generator \
$(PYTHON_FOR_REGEN) \
$(srcdir)/Tools/cases_generator/generate_cases.py \
--emit-line-directives \
-o $(srcdir)/Python/generated_cases.c.h.new \
-m $(srcdir)/Python/opcode_metadata.h.new \
$(srcdir)/Python/bytecodes.c
$(UPDATE_FILE) $(srcdir)/Python/generated_cases.c.h $(srcdir)/Python/generated_cases.c.h.new
$(UPDATE_FILE) $(srcdir)/Python/opcode_metadata.h $(srcdir)/Python/opcode_metadata.h.new
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/cases_generator/generate_cases.py -l

Python/compile.o: $(srcdir)/Python/opcode_metadata.h

Expand All @@ -1565,6 +1555,13 @@ Python/ceval.o: \
$(srcdir)/Python/opcode_metadata.h \
$(srcdir)/Python/opcode_targets.h

Python/flowgraph.o: \
$(srcdir)/Python/opcode_metadata.h

Python/optimizer.o: \
$(srcdir)/Python/executor_cases.c.h \
$(srcdir)/Python/opcode_metadata.h

Python/frozen.o: $(FROZEN_FILES_OUT)

# Generate DTrace probe macros, then rename them (PYTHON_ -> PyDTrace_) to
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added a new, experimental, tracing optimizer and interpreter (a.k.a. "tier 2"). This currently pessimizes, so don't use yet -- this is infrastructure so we can experiment with optimizing passes. To enable it, pass ``-Xuops`` or set ``PYTHONUOPS=1``. To get debug output, set ``PYTHONUOPSDEBUG=N`` where ``N`` is a debug level (0-4, where 0 is no debug output and 4 is excessively verbose).
7 changes: 7 additions & 0 deletions Modules/_testinternalcapi.c
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,12 @@ get_counter_optimizer(PyObject *self, PyObject *arg)
return PyUnstable_Optimizer_NewCounter();
}

/* Test helper: create a uop optimizer.
 *
 * Simply forwards to PyUnstable_Optimizer_NewUOpOptimizer() and returns its
 * result unchanged; neither `self` nor `arg` is consulted.
 */
static PyObject *
get_uop_optimizer(PyObject *self, PyObject *arg)
{
    PyObject *uop_optimizer = PyUnstable_Optimizer_NewUOpOptimizer();
    return uop_optimizer;
}

static PyObject *
set_optimizer(PyObject *self, PyObject *opt)
{
Expand Down Expand Up @@ -994,6 +1000,7 @@ static PyMethodDef module_functions[] = {
{"get_optimizer", get_optimizer, METH_NOARGS, NULL},
{"set_optimizer", set_optimizer, METH_O, NULL},
{"get_counter_optimizer", get_counter_optimizer, METH_NOARGS, NULL},
{"get_uop_optimizer", get_uop_optimizer, METH_NOARGS, NULL},
{"pending_threadfunc", _PyCFunction_CAST(pending_threadfunc),
METH_VARARGS | METH_KEYWORDS},
// {"pending_fd_identify", pending_fd_identify, METH_VARARGS, NULL},
Expand Down
4 changes: 1 addition & 3 deletions Python/bytecodes.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@
#define family(name, ...) static int family_##name
#define pseudo(name) static int pseudo_##name

typedef PyObject *(*convertion_func_ptr)(PyObject *);

// Dummy variables for stack effects.
static PyObject *value, *value1, *value2, *left, *right, *res, *sum, *prod, *sub;
static PyObject *container, *start, *stop, *v, *lhs, *rhs, *res2;
Expand Down Expand Up @@ -2182,7 +2180,7 @@ dummy_func(
frame = executor->execute(executor, frame, stack_pointer);
if (frame == NULL) {
frame = cframe.current_frame;
goto error;
goto resume_with_error;
}
goto resume_frame;
}
Expand Down
137 changes: 129 additions & 8 deletions Python/ceval.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "pycore_sysmodule.h" // _PySys_Audit()
#include "pycore_tuple.h" // _PyTuple_ITEMS()
#include "pycore_typeobject.h" // _PySuper_Lookup()
#include "pycore_uops.h" // _PyUOpExecutorObject
#include "pycore_emscripten_signal.h" // _Py_CHECK_EMSCRIPTEN_SIGNALS

#include "pycore_dict.h"
Expand Down Expand Up @@ -223,14 +224,6 @@ _PyEvalFramePushAndInit_Ex(PyThreadState *tstate, PyFunctionObject *func,
static void
_PyEvalFrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame);

// Signature shared by the conversion callbacks below: takes one object and
// returns a PyObject *.
typedef PyObject *(*convertion_func_ptr)(PyObject *);

// Lookup table from an FVC_* conversion code to the function implementing it.
// The array has four slots but only three are named here, so the remaining
// slot is a null pointer and must not be called.
static const convertion_func_ptr CONVERSION_FUNCTIONS[4] = {
[FVC_STR] = PyObject_Str,
[FVC_REPR] = PyObject_Repr,
[FVC_ASCII] = PyObject_ASCII
};

#define UNBOUNDLOCAL_ERROR_MSG \
"cannot access local variable '%s' where it is not associated with a value"
#define UNBOUNDFREE_ERROR_MSG \
Expand Down Expand Up @@ -2771,3 +2764,131 @@ void Py_LeaveRecursiveCall(void)
{
_Py_LeaveRecursiveCall();
}

///////////////////// Experimental UOp Interpreter /////////////////////

// UPDATE_MISS_STATS (called by DEOPT_IF) uses next_instr
// TODO: Make it do something useful
#undef UPDATE_MISS_STATS
#define UPDATE_MISS_STATS(INSTNAME) ((void)0)

/* Experimental "tier 2" interpreter: execute the uop trace of `executor`.
 *
 * Parameters:
 *   executor       the executor being run; it is treated as a
 *                  _PyUOpExecutorObject and its `trace` array is interpreted
 *                  starting at index 0.
 *   frame          the frame the trace operates on.
 *   stack_pointer  the current top of `frame`'s value stack.
 *
 * Returns:
 *   `frame`  when the trace reaches EXIT_TRACE, or when a uop deoptimizes
 *            (the last instruction is then repeated by the caller);
 *   NULL     on error; the caller recovers the frame from
 *            cframe.current_frame.
 *
 * On every return path the stack pointer is written back into the frame and
 * one reference to the executor is released.
 *
 * When built with LLTRACE, the first character of the PYTHONUOPSDEBUG
 * environment variable (a decimal digit) selects the amount of debug output
 * written to stderr.
 *
 * NOTE(review): the dispatch loop does not bounds-check `pc` against
 * _Py_UOP_MAX_TRACE_LENGTH; it presumably relies on every trace ending in
 * EXIT_TRACE -- confirm against the code that builds traces.
 */
_PyInterpreterFrame *
_PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject **stack_pointer)
{
    // Declared and zeroed up front: the eval-breaker check below may
    // `goto error` before the dispatch loop has loaded the first uop, and the
    // LLTRACE output at `error` prints both values.
    int opcode = 0;
    uint64_t operand = 0;

#ifdef LLTRACE
    char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG");
    int lltrace = 0;
    // Only a leading decimal digit is a valid level; anything else (e.g. a
    // letter) leaves debug output disabled instead of yielding a bogus level.
    if (uop_debug != NULL && *uop_debug >= '0' && *uop_debug <= '9') {
        lltrace = *uop_debug - '0';  // TODO: Parse an int and all that
    }
    if (lltrace >= 2) {
        PyCodeObject *code = _PyFrame_GetCode(frame);
        _Py_CODEUNIT *instr = frame->prev_instr + 1;
        fprintf(stderr,
                "Entering _PyUopExecute for %s (%s:%d) at offset %ld\n",
                PyUnicode_AsUTF8(code->co_qualname),
                PyUnicode_AsUTF8(code->co_filename),
                code->co_firstlineno,
                (long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
    }
#endif

    PyThreadState *tstate = _PyThreadState_GET();
    _PyUOpExecutorObject *self = (_PyUOpExecutorObject *)executor;

    // Equivalent to CHECK_EVAL_BREAKER()
    _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY();
    if (_Py_atomic_load_relaxed_int32(&tstate->interp->ceval.eval_breaker)) {
        if (_Py_HandlePending(tstate) != 0) {
            goto error;
        }
    }

    OBJECT_STAT_INC(optimization_traces_executed);
    // SET_IP stores `ip_offset + oparg` into frame->prev_instr, so this is
    // the code's first instruction minus one.
    _Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive - 1;
    int pc = 0;
    int oparg;
    for (;;) {
        // Fetch the next uop; `oparg` is the operand narrowed to int for
        // instruction bodies that expect a classic oparg.
        opcode = self->trace[pc].opcode;
        operand = self->trace[pc].operand;
        oparg = (int)operand;
#ifdef LLTRACE
        if (lltrace >= 3) {
            const char *opname = opcode < 256 ? _PyOpcode_OpName[opcode] : "";
            int stack_level = (int)(stack_pointer - _PyFrame_Stackbase(frame));
            fprintf(stderr, "  uop %s %d, operand %" PRIu64 ", stack_level %d\n",
                    opname, opcode, operand, stack_level);
        }
#endif
        pc++;
        OBJECT_STAT_INC(optimization_uops_executed);
        switch (opcode) {

#undef ENABLE_SPECIALIZATION
#define ENABLE_SPECIALIZATION 0
#include "executor_cases.c.h"

            case SET_IP:
            {
                frame->prev_instr = ip_offset + oparg;
                break;
            }

            case EXIT_TRACE:
            {
                _PyFrame_SetStackPointer(frame, stack_pointer);
                Py_DECREF(self);
                return frame;
            }

            default:
            {
                fprintf(stderr, "Unknown uop %d, operand %" PRIu64 "\n", opcode, operand);
                Py_FatalError("Unknown uop");
                abort();  // Unreachable
                for (;;) {}
                // Really unreachable
            }

        }
    }

    // Error exits: each pop_N_error label drops one stack item and falls
    // through to the next, so entering at pop_N_error shrinks the stack by N.
pop_4_error:
    STACK_SHRINK(1);
pop_3_error:
    STACK_SHRINK(1);
pop_2_error:
    STACK_SHRINK(1);
pop_1_error:
    STACK_SHRINK(1);
error:
    // On ERROR_IF we return NULL as the frame.
    // The caller recovers the frame from cframe.current_frame.
#ifdef LLTRACE
    if (lltrace >= 2) {
        fprintf(stderr, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
    }
#endif
    _PyFrame_SetStackPointer(frame, stack_pointer);
    Py_DECREF(self);
    return NULL;

PREDICTED(UNPACK_SEQUENCE)
PREDICTED(COMPARE_OP)
PREDICTED(LOAD_SUPER_ATTR)
PREDICTED(STORE_SUBSCR)
PREDICTED(BINARY_SUBSCR)
PREDICTED(BINARY_OP)
    // On DEOPT_IF we just repeat the last instruction.
    // This presumes nothing was popped from the stack (nor pushed).
#ifdef LLTRACE
    if (lltrace >= 2) {
        fprintf(stderr, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
    }
#endif
    _PyFrame_SetStackPointer(frame, stack_pointer);
    Py_DECREF(self);
    return frame;
}
10 changes: 9 additions & 1 deletion Python/ceval_macros.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Macros needed by ceval.c and bytecodes.c
// Macros and other things needed by ceval.c and bytecodes.c

/* Computed GOTOs, or
the-optimization-commonly-but-improperly-known-as-"threaded code"
Expand Down Expand Up @@ -339,3 +339,11 @@ do { \
goto error; \
} \
} while (0);

// Signature shared by the conversion callbacks below: takes one object and
// returns a PyObject *.
typedef PyObject *(*convertion_func_ptr)(PyObject *);

// Lookup table from an FVC_* conversion code to the function implementing it.
// The array has four slots but only three are named here, so the remaining
// slot is a null pointer and must not be called.
// NOTE(review): this is a `static` definition in a header, so each
// translation unit including it gets its own copy -- confirm that is intended.
static const convertion_func_ptr CONVERSION_FUNCTIONS[4] = {
[FVC_STR] = PyObject_Str,
[FVC_REPR] = PyObject_Repr,
[FVC_ASCII] = PyObject_ASCII
};
Loading

0 comments on commit 51fc725

Please sign in to comment.