From 5f53ebd3c1a47471d2b5b6d8ca68bc3a3e488323 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 1 Feb 2024 20:21:42 +0000 Subject: [PATCH 1/3] gh-112529: Make the GC scheduling thread-safe The GC keeps track of the number of allocations (less deallocations) since the last GC. This buffers the count in thread-local state and uses atomic operations to modify the per-interpreter count. The thread-local buffering avoids contention on shared state. A consequence is that the GC scheduling is not as precise, so "test_sneaky_frame_object" is skipped because it requires that the GC be run exactly after allocating a frame object. --- Include/internal/pycore_gc.h | 7 ++++ Include/internal/pycore_tstate.h | 1 + Lib/test/test_frame.py | 3 +- Modules/gcmodule.c | 10 +++++ Objects/typeobject.c | 2 + Python/gc_free_threading.c | 63 ++++++++++++++++++++++++-------- 6 files changed, 70 insertions(+), 16 deletions(-) diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index b362a294a59042..f9d2e8ce7595c6 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -243,6 +243,13 @@ struct _gc_runtime_state { Py_ssize_t long_lived_pending; }; +#ifdef Py_GIL_DISABLED +struct _gc_thread_state { + /* Thread-local allocation count. */ + Py_ssize_t alloc_count; +}; +#endif + extern void _PyGC_InitState(struct _gc_runtime_state *); diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 472fa08154e8f9..84a3b59d2442e0 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -20,6 +20,7 @@ typedef struct _PyThreadStateImpl { PyThreadState base; #ifdef Py_GIL_DISABLED + struct _gc_thread_state gc; struct _mimalloc_thread_state mimalloc; struct _Py_freelist_state freelist_state; #endif diff --git a/Lib/test/test_frame.py b/Lib/test/test_frame.py index 244ce8af7cdf08..4789c17e07e189 100644 --- a/Lib/test/test_frame.py +++ b/Lib/test/test_frame.py @@ -13,7 +13,7 @@ _testcapi = None from test import support -from test.support import threading_helper +from test.support import threading_helper, Py_GIL_DISABLED from test.support.script_helper import assert_python_ok @@ -295,6 +295,7 @@ def gen(): assert_python_ok("-c", code) @support.cpython_only + @unittest.skipIf(Py_GIL_DISABLED, "test requires precise GC scheduling") def test_sneaky_frame_object(self): def trace(frame, event, arg): diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c index ffddef34ecce7a..df324b510969de 100644 --- a/Modules/gcmodule.c +++ b/Modules/gcmodule.c @@ -201,6 +201,16 @@ gc_get_count_impl(PyObject *module) /*[clinic end generated code: output=354012e67b16398f input=a392794a08251751]*/ { GCState *gcstate = get_gc_state(); + +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + struct _gc_thread_state *gc = &tstate->gc; + + // Flush the local allocation count to the global count + _Py_atomic_add_int(&gcstate->generations[0].count, gc->alloc_count); + gc->alloc_count = 0; +#endif + return Py_BuildValue("(iii)", gcstate->generations[0].count, gcstate->generations[1].count, diff --git a/Objects/typeobject.c b/Objects/typeobject.c index a850473cad813d..7937f0911f9a61 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -1739,6 +1739,8 @@ _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems) if (presize) { ((PyObject **)alloc)[0] = NULL; ((PyObject **)alloc)[1] = NULL; + } + if (PyType_IS_GC(type)) { _PyObject_GC_Link(obj); } memset(obj, '\0', size); diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 53f927bfa65310..647d2e2861371a 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -22,6 +22,11 @@ typedef struct _gc_runtime_state GCState; # define GC_DEBUG #endif +// Each thread buffers the count of allocated objects in a thread-local +// variable up to +/- this amount to reduce the overhead of updating +// the global count. +#define LOCAL_ALLOC_COUNT_THRESHOLD 512 + // Automatically choose the generation that needs collecting. #define GENERATION_AUTO (-1) @@ -923,6 +928,41 @@ gc_should_collect(GCState *gcstate) gcstate->generations[1].threshold == 0); } +static void +record_allocation(PyThreadState *tstate) +{ + struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc; + + // We buffer the allocation count to avoid the overhead of atomic + // operations for every allocation. + gc->alloc_count++; + if (gc->alloc_count >= LOCAL_ALLOC_COUNT_THRESHOLD) { + // TODO: Use Py_ssize_t for the generation count. + GCState *gcstate = &tstate->interp->gc; + _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count); + gc->alloc_count = 0; + + if (gc_should_collect(gcstate) && + !_Py_atomic_load_int_relaxed(&gcstate->collecting)) + { + _Py_ScheduleGC(tstate->interp); + } + } +} + +static void +record_deallocation(PyThreadState *tstate) +{ + struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc; + + gc->alloc_count--; + if (gc->alloc_count <= -LOCAL_ALLOC_COUNT_THRESHOLD) { + GCState *gcstate = &tstate->interp->gc; + _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count); + gc->alloc_count = 0; + } +} + static void gc_collect_internal(PyInterpreterState *interp, struct collection_state *state) { @@ -942,6 +982,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state) } } + // Record the number of live GC objects + interp->gc.long_lived_total = state->long_lived_total; + // Clear weakrefs and enqueue callbacks (but do not call them). clear_weakrefs(state); _PyEval_StartTheWorld(interp); @@ -1048,7 +1091,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) m = state.collected; n = state.uncollectable; - gcstate->long_lived_total = state.long_lived_total; if (gcstate->debug & _PyGC_DEBUG_STATS) { double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1); @@ -1488,15 +1530,7 @@ _Py_ScheduleGC(PyInterpreterState *interp) void _PyObject_GC_Link(PyObject *op) { - PyThreadState *tstate = _PyThreadState_GET(); - GCState *gcstate = &tstate->interp->gc; - gcstate->generations[0].count++; - - if (gc_should_collect(gcstate) && - !_Py_atomic_load_int_relaxed(&gcstate->collecting)) - { - _Py_ScheduleGC(tstate->interp); - } + record_allocation(_PyThreadState_GET()); } void @@ -1522,7 +1556,7 @@ gc_alloc(PyTypeObject *tp, size_t basicsize, size_t presize) ((PyObject **)mem)[1] = NULL; } PyObject *op = (PyObject *)(mem + presize); - _PyObject_GC_Link(op); + record_allocation(tstate); return op; } @@ -1604,10 +1638,9 @@ PyObject_GC_Del(void *op) PyErr_SetRaisedException(exc); #endif } - GCState *gcstate = get_gc_state(); - if (gcstate->generations[0].count > 0) { - gcstate->generations[0].count--; - } + + record_deallocation(_PyThreadState_GET()); + PyObject_Free(((char *)op)-presize); } From e832dd93dfaa3a2e2a8fbf5a0cad48852d58848a Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 2 Feb 2024 21:11:59 +0000 Subject: [PATCH 2/3] Fix warning --- Modules/gcmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c index df324b510969de..d03f5fc18394c5 100644 --- a/Modules/gcmodule.c +++ b/Modules/gcmodule.c @@ -207,7 +207,7 @@ gc_get_count_impl(PyObject *module) struct _gc_thread_state *gc = &tstate->gc; // Flush the local allocation count to the global count - _Py_atomic_add_int(&gcstate->generations[0].count, gc->alloc_count); + _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count); gc->alloc_count = 0; #endif From 456b778adc35211a9a3f5c773b349396d2b26816 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 6 Feb 2024 23:10:12 +0000 Subject: [PATCH 3/3] Skip test_gc.test_get_count() in builds --- Lib/test/test_gc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_gc.py b/Lib/test/test_gc.py index 0002852fce9643..6d1af7e4d77e6c 100644 --- a/Lib/test/test_gc.py +++ b/Lib/test/test_gc.py @@ -363,6 +363,7 @@ def __del__(self): # To minimize variations, though, we first store the get_count() results # and check them at the end. @refcount_test + @unittest.skipIf(Py_GIL_DISABLED, 'needs precise allocation counts') def test_get_count(self): gc.collect() a, b, c = gc.get_count()