From 5f53ebd3c1a47471d2b5b6d8ca68bc3a3e488323 Mon Sep 17 00:00:00 2001
From: Sam Gross <colesbury@gmail.com>
Date: Thu, 1 Feb 2024 20:21:42 +0000
Subject: [PATCH 1/3] gh-112529: Make the GC scheduling thread-safe

The GC keeps track of the number of allocations (less deallocations)
since the last GC. This buffers the count in thread-local state and uses
atomic operations to modify the per-interpreter count. The thread-local
buffering avoids contention on shared state.

A consequence is that the GC scheduling is not as precise, so
"test_sneaky_frame_object" is skipped because it requires that the GC be
run exactly after allocating a frame object.
---
 Include/internal/pycore_gc.h     |  7 ++++
 Include/internal/pycore_tstate.h |  1 +
 Lib/test/test_frame.py           |  3 +-
 Modules/gcmodule.c               | 10 +++++
 Objects/typeobject.c             |  2 +
 Python/gc_free_threading.c       | 63 ++++++++++++++++++++++++--------
 6 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h
index b362a294a59042..f9d2e8ce7595c6 100644
--- a/Include/internal/pycore_gc.h
+++ b/Include/internal/pycore_gc.h
@@ -243,6 +243,13 @@ struct _gc_runtime_state {
     Py_ssize_t long_lived_pending;
 };
 
+#ifdef Py_GIL_DISABLED
+struct _gc_thread_state {
+    /* Thread-local allocation count. */
+    Py_ssize_t alloc_count;
+};
+#endif
+
 
 extern void _PyGC_InitState(struct _gc_runtime_state *);
 
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 472fa08154e8f9..84a3b59d2442e0 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -20,6 +20,7 @@ typedef struct _PyThreadStateImpl {
     PyThreadState base;
 
 #ifdef Py_GIL_DISABLED
+    struct _gc_thread_state gc;
     struct _mimalloc_thread_state mimalloc;
     struct _Py_freelist_state freelist_state;
 #endif
diff --git a/Lib/test/test_frame.py b/Lib/test/test_frame.py
index 244ce8af7cdf08..4789c17e07e189 100644
--- a/Lib/test/test_frame.py
+++ b/Lib/test/test_frame.py
@@ -13,7 +13,7 @@
     _testcapi = None
 
 from test import support
-from test.support import threading_helper
+from test.support import threading_helper, Py_GIL_DISABLED
 from test.support.script_helper import assert_python_ok
 
 
@@ -295,6 +295,7 @@ def gen():
         assert_python_ok("-c", code)
 
     @support.cpython_only
+    @unittest.skipIf(Py_GIL_DISABLED, "test requires precise GC scheduling")
     def test_sneaky_frame_object(self):
 
         def trace(frame, event, arg):
diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c
index ffddef34ecce7a..df324b510969de 100644
--- a/Modules/gcmodule.c
+++ b/Modules/gcmodule.c
@@ -201,6 +201,16 @@ gc_get_count_impl(PyObject *module)
 /*[clinic end generated code: output=354012e67b16398f input=a392794a08251751]*/
 {
     GCState *gcstate = get_gc_state();
+
+#ifdef Py_GIL_DISABLED
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    struct _gc_thread_state *gc = &tstate->gc;
+
+    // Flush the local allocation count to the global count
+    _Py_atomic_add_int(&gcstate->generations[0].count, gc->alloc_count);
+    gc->alloc_count = 0;
+#endif
+
     return Py_BuildValue("(iii)",
                          gcstate->generations[0].count,
                          gcstate->generations[1].count,
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index a850473cad813d..7937f0911f9a61 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -1739,6 +1739,8 @@ _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems)
     if (presize) {
         ((PyObject **)alloc)[0] = NULL;
         ((PyObject **)alloc)[1] = NULL;
+    }
+    if (PyType_IS_GC(type)) {
         _PyObject_GC_Link(obj);
     }
     memset(obj, '\0', size);
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 53f927bfa65310..647d2e2861371a 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -22,6 +22,11 @@ typedef struct _gc_runtime_state GCState;
 #  define GC_DEBUG
 #endif
 
+// Each thread buffers the count of allocated objects in a thread-local
+// variable up to +/- this amount to reduce the overhead of updating
+// the global count.
+#define LOCAL_ALLOC_COUNT_THRESHOLD 512
+
 // Automatically choose the generation that needs collecting.
 #define GENERATION_AUTO (-1)
 
@@ -923,6 +928,41 @@ gc_should_collect(GCState *gcstate)
             gcstate->generations[1].threshold == 0);
 }
 
+static void
+record_allocation(PyThreadState *tstate)
+{
+    struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;
+
+    // We buffer the allocation count to avoid the overhead of atomic
+    // operations for every allocation.
+    gc->alloc_count++;
+    if (gc->alloc_count >= LOCAL_ALLOC_COUNT_THRESHOLD) {
+        // TODO: Use Py_ssize_t for the generation count.
+        GCState *gcstate = &tstate->interp->gc;
+        _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+        gc->alloc_count = 0;
+
+        if (gc_should_collect(gcstate) &&
+            !_Py_atomic_load_int_relaxed(&gcstate->collecting))
+        {
+            _Py_ScheduleGC(tstate->interp);
+        }
+    }
+}
+
+static void
+record_deallocation(PyThreadState *tstate)
+{
+    struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;
+
+    gc->alloc_count--;
+    if (gc->alloc_count <= -LOCAL_ALLOC_COUNT_THRESHOLD) {
+        GCState *gcstate = &tstate->interp->gc;
+        _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+        gc->alloc_count = 0;
+    }
+}
+
 static void
 gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
 {
@@ -942,6 +982,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
         }
     }
 
+    // Record the number of live GC objects
+    interp->gc.long_lived_total = state->long_lived_total;
+
     // Clear weakrefs and enqueue callbacks (but do not call them).
     clear_weakrefs(state);
     _PyEval_StartTheWorld(interp);
@@ -1048,7 +1091,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
 
     m = state.collected;
     n = state.uncollectable;
-    gcstate->long_lived_total = state.long_lived_total;
 
     if (gcstate->debug & _PyGC_DEBUG_STATS) {
         double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
@@ -1488,15 +1530,7 @@ _Py_ScheduleGC(PyInterpreterState *interp)
 void
 _PyObject_GC_Link(PyObject *op)
 {
-    PyThreadState *tstate = _PyThreadState_GET();
-    GCState *gcstate = &tstate->interp->gc;
-    gcstate->generations[0].count++;
-
-    if (gc_should_collect(gcstate) &&
-        !_Py_atomic_load_int_relaxed(&gcstate->collecting))
-    {
-        _Py_ScheduleGC(tstate->interp);
-    }
+    record_allocation(_PyThreadState_GET());
 }
 
 void
@@ -1522,7 +1556,7 @@ gc_alloc(PyTypeObject *tp, size_t basicsize, size_t presize)
         ((PyObject **)mem)[1] = NULL;
     }
     PyObject *op = (PyObject *)(mem + presize);
-    _PyObject_GC_Link(op);
+    record_allocation(tstate);
     return op;
 }
 
@@ -1604,10 +1638,9 @@ PyObject_GC_Del(void *op)
         PyErr_SetRaisedException(exc);
 #endif
     }
-    GCState *gcstate = get_gc_state();
-    if (gcstate->generations[0].count > 0) {
-        gcstate->generations[0].count--;
-    }
+
+    record_deallocation(_PyThreadState_GET());
+
     PyObject_Free(((char *)op)-presize);
 }
 

From e832dd93dfaa3a2e2a8fbf5a0cad48852d58848a Mon Sep 17 00:00:00 2001
From: Sam Gross <colesbury@gmail.com>
Date: Fri, 2 Feb 2024 21:11:59 +0000
Subject: [PATCH 2/3] Fix warning

---
 Modules/gcmodule.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c
index df324b510969de..d03f5fc18394c5 100644
--- a/Modules/gcmodule.c
+++ b/Modules/gcmodule.c
@@ -207,7 +207,7 @@ gc_get_count_impl(PyObject *module)
     struct _gc_thread_state *gc = &tstate->gc;
 
     // Flush the local allocation count to the global count
-    _Py_atomic_add_int(&gcstate->generations[0].count, gc->alloc_count);
+    _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
     gc->alloc_count = 0;
 #endif
 

From 456b778adc35211a9a3f5c773b349396d2b26816 Mon Sep 17 00:00:00 2001
From: Sam Gross <colesbury@gmail.com>
Date: Tue, 6 Feb 2024 23:10:12 +0000
Subject: [PATCH 3/3] Skip test_gc.test_get_count() in  builds

---
 Lib/test/test_gc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Lib/test/test_gc.py b/Lib/test/test_gc.py
index 0002852fce9643..6d1af7e4d77e6c 100644
--- a/Lib/test/test_gc.py
+++ b/Lib/test/test_gc.py
@@ -363,6 +363,7 @@ def __del__(self):
     # To minimize variations, though, we first store the get_count() results
     # and check them at the end.
     @refcount_test
+    @unittest.skipIf(Py_GIL_DISABLED, 'needs precise allocation counts')
     def test_get_count(self):
         gc.collect()
         a, b, c = gc.get_count()