From 4e41510b4f4ce71a444df078126b51ab31ac961e Mon Sep 17 00:00:00 2001
From: Mark Shannon <mark@hotpy.org>
Date: Wed, 17 Apr 2024 16:53:45 +0100
Subject: [PATCH] Tidy up tier 2 optimizer. Merge peephole pass into main pass

---
 Python/optimizer_analysis.c  | 163 +++++++++--------------------------
 Python/optimizer_bytecodes.c |  83 ++++++++++++++++--
 Python/optimizer_cases.c.h   |  55 +++++++++++-
 3 files changed, 169 insertions(+), 132 deletions(-)

diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c
index a21679f366a74e..90444e33568b9d 100644
--- a/Python/optimizer_analysis.c
+++ b/Python/optimizer_analysis.c
@@ -362,6 +362,30 @@ eliminate_pop_guard(_PyUOpInstruction *this_instr, bool exit)
     }
 }
 
+/* _PUSH_FRAME/_POP_FRAME's operand can be 0, a PyFunctionObject *, or a
+ * PyCodeObject *. Retrieve the code object if possible.
+ */
+static PyCodeObject *
+get_code(_PyUOpInstruction *op)
+{
+    assert(op->opcode == _PUSH_FRAME || op->opcode == _POP_FRAME);
+    PyCodeObject *co = NULL;
+    uint64_t operand = op->operand;
+    if (operand == 0) {
+        return NULL;
+    }
+    if (operand & 1) {
+        co = (PyCodeObject *)(operand & ~1);
+    }
+    else {
+        PyFunctionObject *func = (PyFunctionObject *)operand;
+        assert(PyFunction_Check(func));
+        co = (PyCodeObject *)func->func_code;
+    }
+    assert(PyCode_Check(co));
+    return co;
+}
+
 /* 1 for success, 0 for not ready, cannot error at the moment. */
 static int
 optimize_uops(
@@ -376,6 +400,10 @@ optimize_uops(
     _Py_UOpsContext context;
     _Py_UOpsContext *ctx = &context;
     uint32_t opcode = UINT16_MAX;
+    int curr_space = 0;
+    int max_space = 0;
+    _PyUOpInstruction *first_valid_check_stack = NULL;
+    _PyUOpInstruction *corresponding_check_stack = NULL;
 
     if (_Py_uop_abstractcontext_init(ctx) < 0) {
         goto out_of_space;
@@ -416,8 +444,7 @@ optimize_uops(
         ctx->frame->stack_pointer = stack_pointer;
         assert(STACK_LEVEL() >= 0);
     }
-    _Py_uop_abstractcontext_fini(ctx);
-    return trace_len;
+    Py_UNREACHABLE();
 
 out_of_space:
     DPRINTF(3, "\n");
@@ -443,9 +470,17 @@ optimize_uops(
     _Py_uop_abstractcontext_fini(ctx);
     return 0;
 done:
-    /* Cannot optimize further, but there would be no benefit
-     * in retrying later */
+    /* Either reached the end or cannot optimize further, but there
+     * would be no benefit in retrying later */
     _Py_uop_abstractcontext_fini(ctx);
+    if (first_valid_check_stack != NULL) {
+        assert(first_valid_check_stack->opcode == _CHECK_STACK_SPACE);
+        assert(max_space > 0);
+        assert(max_space <= INT_MAX);
+        assert(max_space <= INT32_MAX);
+        first_valid_check_stack->opcode = _CHECK_STACK_SPACE_OPERAND;
+        first_valid_check_stack->operand = max_space;
+    }
     return trace_len;
 }
 
@@ -532,124 +567,6 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size)
     Py_UNREACHABLE();
 }
 
-/* _PUSH_FRAME/_POP_FRAME's operand can be 0, a PyFunctionObject *, or a
- * PyCodeObject *. Retrieve the code object if possible.
- */
-static PyCodeObject *
-get_co(_PyUOpInstruction *op)
-{
-    assert(op->opcode == _PUSH_FRAME || op->opcode == _POP_FRAME);
-    PyCodeObject *co = NULL;
-    uint64_t operand = op->operand;
-    if (operand == 0) {
-        return NULL;
-    }
-    if (operand & 1) {
-        co = (PyCodeObject *)(operand & ~1);
-    }
-    else {
-        PyFunctionObject *func = (PyFunctionObject *)operand;
-        assert(PyFunction_Check(func));
-        co = (PyCodeObject *)func->func_code;
-    }
-    assert(PyCode_Check(co));
-    return co;
-}
-
-static void
-peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size)
-{
-    PyCodeObject *co = _PyFrame_GetCode(frame);
-    int curr_space = 0;
-    int max_space = 0;
-    _PyUOpInstruction *first_valid_check_stack = NULL;
-    _PyUOpInstruction *corresponding_check_stack = NULL;
-    for (int pc = 0; pc < buffer_size; pc++) {
-        int opcode = buffer[pc].opcode;
-        switch(opcode) {
-            case _LOAD_CONST: {
-                assert(co != NULL);
-                PyObject *val = PyTuple_GET_ITEM(co->co_consts, buffer[pc].oparg);
-                buffer[pc].opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE;
-                buffer[pc].operand = (uintptr_t)val;
-                break;
-            }
-            case _CHECK_PEP_523: {
-                /* Setting the eval frame function invalidates
-                 * all executors, so no need to check dynamically */
-                if (_PyInterpreterState_GET()->eval_frame == NULL) {
-                    buffer[pc].opcode = _NOP;
-                }
-                break;
-            }
-            case _CHECK_STACK_SPACE: {
-                assert(corresponding_check_stack == NULL);
-                corresponding_check_stack = &buffer[pc];
-                break;
-            }
-            case _PUSH_FRAME: {
-                assert(corresponding_check_stack != NULL);
-                co = get_co(&buffer[pc]);
-                if (co == NULL) {
-                    // should be about to _EXIT_TRACE anyway
-                    goto finish;
-                }
-                int framesize = co->co_framesize;
-                assert(framesize > 0);
-                curr_space += framesize;
-                if (curr_space < 0 || curr_space > INT32_MAX) {
-                    // won't fit in signed 32-bit int
-                    goto finish;
-                }
-                max_space = curr_space > max_space ? curr_space : max_space;
-                if (first_valid_check_stack == NULL) {
-                    first_valid_check_stack = corresponding_check_stack;
-                }
-                else {
-                    // delete all but the first valid _CHECK_STACK_SPACE
-                    corresponding_check_stack->opcode = _NOP;
-                }
-                corresponding_check_stack = NULL;
-                break;
-            }
-            case _POP_FRAME: {
-                assert(corresponding_check_stack == NULL);
-                assert(co != NULL);
-                int framesize = co->co_framesize;
-                assert(framesize > 0);
-                assert(framesize <= curr_space);
-                curr_space -= framesize;
-                co = get_co(&buffer[pc]);
-                if (co == NULL) {
-                    // might be impossible, but bailing is still safe
-                    goto finish;
-                }
-                break;
-            }
-            case _JUMP_TO_TOP:
-            case _EXIT_TRACE:
-                goto finish;
-#ifdef Py_DEBUG
-            case _CHECK_STACK_SPACE_OPERAND: {
-                /* We should never see _CHECK_STACK_SPACE_OPERANDs.
-                 * They are only created at the end of this pass. */
-                Py_UNREACHABLE();
-            }
-#endif
-        }
-    }
-    Py_UNREACHABLE();
-finish:
-    if (first_valid_check_stack != NULL) {
-        assert(first_valid_check_stack->opcode == _CHECK_STACK_SPACE);
-        assert(max_space > 0);
-        assert(max_space <= INT_MAX);
-        assert(max_space <= INT32_MAX);
-        first_valid_check_stack->opcode = _CHECK_STACK_SPACE_OPERAND;
-        first_valid_check_stack->operand = max_space;
-    }
-}
-
 //  0 - failure, no error raised, just fall back to Tier 1
 // -1 - failure, and raise error
 //  > 0 - length of optimized trace
@@ -669,8 +586,6 @@ _Py_uop_analyze_and_optimize(
         return err;
     }
 
-    peephole_opt(frame, buffer, length);
-
     length = optimize_uops(
         _PyFrame_GetCode(frame), buffer,
         length, curr_stacklen, dependencies);
diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c
index e38428af108893..f119b8e20719fa 100644
--- a/Python/optimizer_bytecodes.c
+++ b/Python/optimizer_bytecodes.c
@@ -38,12 +38,14 @@ optimize_to_bool(
     _Py_UopsSymbol **result_ptr);
 
 extern void
-eliminate_pop_guard(_PyUOpInstruction *this_instr, bool exit)
+eliminate_pop_guard(_PyUOpInstruction *this_instr, bool exit);
+
+extern PyCodeObject *get_code(_PyUOpInstruction *op);
 
 static int
 dummy_func(void) {
 
-    PyCodeObject *code;
+    PyCodeObject *co;
     int oparg;
     _Py_UopsSymbol *flag;
     _Py_UopsSymbol *left;
@@ -54,10 +56,15 @@ dummy_func(void) {
     _Py_UopsSymbol *top;
     _Py_UopsSymbol *bottom;
     _Py_UOpsAbstractFrame *frame;
+    _Py_UOpsAbstractFrame *new_frame;
     _Py_UOpsContext *ctx;
     _PyUOpInstruction *this_instr;
     _PyBloomFilter *dependencies;
     int modified;
+    int curr_space;
+    int max_space;
+    _PyUOpInstruction *first_valid_check_stack;
+    _PyUOpInstruction *corresponding_check_stack;
 
 // BEGIN BYTECODES //
 
@@ -393,9 +400,10 @@ dummy_func(void) {
     }
 
     op(_LOAD_CONST, (-- value)) {
-        // There should be no LOAD_CONST. It should be all
-        // replaced by peephole_opt.
-        Py_UNREACHABLE();
+        PyObject *val = PyTuple_GET_ITEM(co->co_consts, this_instr->oparg);
+        int opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE;
+        REPLACE_OP(this_instr, opcode, 0, (uintptr_t)val);
+        OUT_OF_SPACE_IF_NULL(value = sym_new_const(ctx, val));
     }
 
     op(_LOAD_CONST_INLINE, (ptr/4 -- value)) {
@@ -590,6 +598,32 @@ dummy_func(void) {
         frame_pop(ctx);
         stack_pointer = ctx->frame->stack_pointer;
         res = retval;
+
+        /* Stack space handling */
+        assert(corresponding_check_stack == NULL);
+        assert(co != NULL);
+        int framesize = co->co_framesize;
+        assert(framesize > 0);
+        assert(framesize <= curr_space);
+        curr_space -= framesize;
+
+        co = get_code(this_instr);
+        if (co == NULL) {
+            // might be impossible, but bailing is still safe
+            goto done;
+        }
+    }
+
+    op(_CHECK_STACK_SPACE, ( --)) {
+        assert(corresponding_check_stack == NULL);
+        corresponding_check_stack = this_instr;
+    }
+
+    op (_CHECK_STACK_SPACE_OPERAND, ( -- )) {
+        (void)framesize;
+        /* We should never see _CHECK_STACK_SPACE_OPERANDs.
+        * They are only created at the end of this pass. */
+        Py_UNREACHABLE();
     }
 
     op(_PUSH_FRAME, (new_frame: _Py_UOpsAbstractFrame * -- unused if (0))) {
@@ -598,6 +632,29 @@ dummy_func(void) {
         ctx->frame = new_frame;
         ctx->curr_frame_depth++;
         stack_pointer = new_frame->stack_pointer;
+        co = get_code(this_instr);
+        if (co == NULL) {
+            // should be about to _EXIT_TRACE anyway
+            goto done;
+        }
+
+        /* Stack space handling */
+        int framesize = co->co_framesize;
+        assert(framesize > 0);
+        curr_space += framesize;
+        if (curr_space < 0 || curr_space > INT32_MAX) {
+            // won't fit in signed 32-bit int
+            goto done;
+        }
+        max_space = curr_space > max_space ? curr_space : max_space;
+        if (first_valid_check_stack == NULL) {
+            first_valid_check_stack = corresponding_check_stack;
+        }
+        else {
+            // delete all but the first valid _CHECK_STACK_SPACE
+            corresponding_check_stack->opcode = _NOP;
+        }
+        corresponding_check_stack = NULL;
     }
 
     op(_UNPACK_SEQUENCE, (seq -- values[oparg])) {
@@ -662,6 +719,22 @@ dummy_func(void) {
         }
     }
 
+    op(_CHECK_PEP_523, (--)) {
+        /* Setting the eval frame function invalidates
+        * all executors, so no need to check dynamically */
+        if (_PyInterpreterState_GET()->eval_frame == NULL) {
+            REPLACE_OP(this_instr, _NOP, 0 ,0);
+        }
+    }
+
+    op(_JUMP_TO_TOP, (--)) {
+        goto done;
+    }
+
+    op(_EXIT_TRACE, (--)) {
+        goto done;
+    }
+
 
 // END BYTECODES //
 
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
index 209be370c4aa38..50f335e0c8a0a2 100644
--- a/Python/optimizer_cases.c.h
+++ b/Python/optimizer_cases.c.h
@@ -46,9 +46,10 @@
 
         case _LOAD_CONST: {
             _Py_UopsSymbol *value;
-            // There should be no LOAD_CONST. It should be all
-            // replaced by peephole_opt.
-            Py_UNREACHABLE();
+            PyObject *val = PyTuple_GET_ITEM(co->co_consts, this_instr->oparg);
+            int opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE;
+            REPLACE_OP(this_instr, opcode, 0, (uintptr_t)val);
+            OUT_OF_SPACE_IF_NULL(value = sym_new_const(ctx, val));
             stack_pointer[0] = value;
             stack_pointer += 1;
             break;
@@ -597,6 +598,18 @@
             frame_pop(ctx);
             stack_pointer = ctx->frame->stack_pointer;
             res = retval;
+            /* Stack space handling */
+            assert(corresponding_check_stack == NULL);
+            assert(co != NULL);
+            int framesize = co->co_framesize;
+            assert(framesize > 0);
+            assert(framesize <= curr_space);
+            curr_space -= framesize;
+            co = get_code(this_instr);
+            if (co == NULL) {
+                // might be impossible, but bailing is still safe
+                goto done;
+            }
             stack_pointer[0] = res;
             stack_pointer += 1;
             break;
@@ -1529,6 +1542,11 @@
         }
 
         case _CHECK_PEP_523: {
+            /* Setting the eval frame function invalidates
+             * all executors, so no need to check dynamically */
+            if (_PyInterpreterState_GET()->eval_frame == NULL) {
+                REPLACE_OP(this_instr, _NOP, 0 ,0);
+            }
             break;
         }
 
@@ -1547,6 +1565,8 @@
         }
 
         case _CHECK_STACK_SPACE: {
+            assert(corresponding_check_stack == NULL);
+            corresponding_check_stack = this_instr;
             break;
         }
 
@@ -1610,6 +1630,28 @@
             ctx->frame = new_frame;
             ctx->curr_frame_depth++;
             stack_pointer = new_frame->stack_pointer;
+            co = get_code(this_instr);
+            if (co == NULL) {
+                // should be about to _EXIT_TRACE anyway
+                goto done;
+            }
+            /* Stack space handling */
+            int framesize = co->co_framesize;
+            assert(framesize > 0);
+            curr_space += framesize;
+            if (curr_space < 0 || curr_space > INT32_MAX) {
+                // won't fit in signed 32-bit int
+                goto done;
+            }
+            max_space = curr_space > max_space ? curr_space : max_space;
+            if (first_valid_check_stack == NULL) {
+                first_valid_check_stack = corresponding_check_stack;
+            }
+            else {
+                // delete all but the first valid _CHECK_STACK_SPACE
+                corresponding_check_stack->opcode = _NOP;
+            }
+            corresponding_check_stack = NULL;
             break;
         }
 
@@ -1899,6 +1941,7 @@
         }
 
         case _JUMP_TO_TOP: {
+            goto done;
             break;
         }
 
@@ -1907,6 +1950,11 @@
         }
 
         case _CHECK_STACK_SPACE_OPERAND: {
+            uint32_t framesize = (uint32_t)this_instr->operand;
+            (void)framesize;
+            /* We should never see _CHECK_STACK_SPACE_OPERANDs.
+             * They are only created at the end of this pass. */
+            Py_UNREACHABLE();
             break;
         }
 
@@ -1915,6 +1963,7 @@
         }
 
         case _EXIT_TRACE: {
+            goto done;
             break;
         }