/*
 * Implementation of a safe memory reclamation scheme using
 * quiescent states.
 *
 * This is derived from the "GUS" safe memory reclamation technique
 * in FreeBSD written by Jeffrey Roberson. It is heavily modified. Any bugs
 * in this code are likely due to the modifications.
 *
 * The original copyright is preserved below.
 *
 * Copyright (c) 2019,2020 Jeffrey Roberson <jeff@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "Python.h"
#include "pycore_initconfig.h"    // _PyStatus_NO_MEMORY()
#include "pycore_lock.h"          // PyMutex_Lock()
#include "pycore_qsbr.h"
#include "pycore_pystate.h"       // _PyThreadState_GET()


// Wrap-around safe comparison
#define QSBR_LT(a, b) ((int64_t)((a)-(b)) < 0)
#define QSBR_LEQ(a, b) ((int64_t)((a)-(b)) <= 0)
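// Example: casting the unsigned difference to a signed type keeps the
// ordering correct across wrap-around. QSBR_LT(UINT64_MAX, 3) is true,
// because (int64_t)(UINT64_MAX - 3) == -4 < 0: a sequence that has wrapped
// past zero still compares as newer than one taken just before the wrap.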

// Starting size of the array of qsbr thread states
#define MIN_ARRAY_SIZE 8

// The shared write sequence is always odd and incremented by two. Detached
// threads are indicated by a read sequence of zero.
#define QSBR_OFFLINE 0
#define QSBR_INITIAL 1
#define QSBR_INCR 2
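// Example: the global write sequence starts at QSBR_INITIAL (1) and takes
// the values 3, 5, 7, ... as it advances. A thread's local read sequence is
// either a snapshot of one of those odd values (attached) or QSBR_OFFLINE
// (detached), so zero can never be mistaken for a real sequence number.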

// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
// the write sequence.
#define QSBR_DEFERRED_LIMIT 10

// Allocate a QSBR thread state from the freelist. The caller must hold the
// shared mutex.
static struct _qsbr_thread_state *
qsbr_allocate(struct _qsbr_shared *shared)
{
    struct _qsbr_thread_state *qsbr = shared->freelist;
    if (qsbr == NULL) {
        return NULL;
    }
    shared->freelist = qsbr->freelist_next;
    qsbr->freelist_next = NULL;
    qsbr->shared = shared;
    qsbr->allocated = true;
    return qsbr;
}

// Initialize (or reinitialize) the freelist of QSBR thread states
static void
initialize_freelist(struct _qsbr_shared *shared)
{
    for (Py_ssize_t i = 0; i != shared->size; i++) {
        struct _qsbr_thread_state *qsbr = &shared->array[i].qsbr;
        if (qsbr->tstate != NULL) {
            // Update the thread state's pointer to its QSBR state, which
            // may have moved if the array was reallocated.
            _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)qsbr->tstate;
            tstate->qsbr = qsbr;
        }
        if (!qsbr->allocated) {
            // Push to the freelist
            qsbr->freelist_next = shared->freelist;
            shared->freelist = qsbr;
        }
    }
}

// Grow the array of QSBR thread states. Returns 0 on success, -1 on failure.
static int
grow_thread_array(struct _qsbr_shared *shared)
{
    Py_ssize_t new_size = shared->size * 2;
    if (new_size < MIN_ARRAY_SIZE) {
        new_size = MIN_ARRAY_SIZE;
    }

    struct _qsbr_pad *array = PyMem_RawCalloc(new_size, sizeof(*array));
    if (array == NULL) {
        return -1;
    }

    struct _qsbr_pad *old = shared->array;
    if (old != NULL) {
        memcpy(array, old, shared->size * sizeof(*array));
    }

    shared->array = array;
    shared->size = new_size;
    shared->freelist = NULL;
    initialize_freelist(shared);

    PyMem_RawFree(old);
    return 0;
}

uint64_t
_Py_qsbr_advance(struct _qsbr_shared *shared)
{
    // The atomic addition returns the previous value; add QSBR_INCR again
    // to report the new, post-advance sequence to the caller.
    return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR) + QSBR_INCR;
}

uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr)
{
    // To limit contention on the shared write sequence, only advance it
    // every QSBR_DEFERRED_LIMIT calls; otherwise return the value it will
    // have after the next advance, without modifying it.
    if (++qsbr->deferrals < QSBR_DEFERRED_LIMIT) {
        return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
    }
    qsbr->deferrals = 0;
    return _Py_qsbr_advance(qsbr->shared);
}
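
// Example: with wr_seq currently 9, the first nine calls return 11 without
// touching wr_seq; the tenth call advances wr_seq to 11 and returns it. In
// both cases the returned goal cannot be satisfied until the sequence has
// actually advanced and all attached threads have quiesced past it, so the
// deferral only delays reclamation, never makes it unsafe.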

static uint64_t
qsbr_poll_scan(struct _qsbr_shared *shared)
{
    // Compute the minimum sequence number of all attached threads
    uint64_t min_seq = _Py_atomic_load_uint64(&shared->wr_seq);
    struct _qsbr_pad *array = shared->array;
    for (Py_ssize_t i = 0, size = shared->size; i != size; i++) {
        struct _qsbr_thread_state *qsbr = &array[i].qsbr;

        uint64_t seq = _Py_atomic_load_uint64(&qsbr->seq);
        if (seq != QSBR_OFFLINE && QSBR_LT(seq, min_seq)) {
            min_seq = seq;
        }
    }

    // Update the shared read sequence
    uint64_t rd_seq = _Py_atomic_load_uint64(&shared->rd_seq);
    if (QSBR_LT(rd_seq, min_seq)) {
        // It's okay if the compare-exchange failed: another thread updated it
        (void)_Py_atomic_compare_exchange_uint64(&shared->rd_seq, &rd_seq,
                                                 min_seq);
        rd_seq = min_seq;
    }

    return rd_seq;
}
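
// Example scan: with wr_seq == 9 and three slots whose local sequences are
// {7, QSBR_OFFLINE, 5}, the offline slot is skipped and min_seq becomes 5,
// so rd_seq is raised to 5. Every attached thread has then passed through a
// quiescent state at least as recent as sequence 5, so any goal <= 5 is
// safe to reclaim.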

bool
_Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal)
{
    assert(_PyThreadState_GET()->state == _Py_THREAD_ATTACHED);

    // Fast path: the cached read sequence already covers the goal
    uint64_t rd_seq = _Py_atomic_load_uint64(&qsbr->shared->rd_seq);
    if (QSBR_LEQ(goal, rd_seq)) {
        return true;
    }

    // Slow path: scan all attached threads and recompute the read sequence
    rd_seq = qsbr_poll_scan(qsbr->shared);
    return QSBR_LEQ(goal, rd_seq);
}
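
/*
 * Illustrative sketch of the intended delayed-free pattern (the pending
 * list and the helper names below are hypothetical, not part of this file):
 *
 *     // Retire an object that concurrent readers may still be using:
 *     mem->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr);
 *     add_to_pending_list(mem);              // hypothetical helper
 *
 *     // Later, once the goal has been reached, actually free it:
 *     if (_Py_qsbr_poll(tstate->qsbr, mem->qsbr_goal)) {
 *         PyMem_Free(mem);                   // no readers can remain
 *     }
 */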

void
_Py_qsbr_attach(struct _qsbr_thread_state *qsbr)
{
    assert(qsbr->seq == 0 && "already attached");

    uint64_t seq = _Py_qsbr_shared_current(qsbr->shared);
    _Py_atomic_store_uint64_relaxed(&qsbr->seq, seq);

    // Ensure the store to the local sequence is visible to other threads
    // before this thread performs any subsequent reads of shared data.
    _Py_atomic_fence_seq_cst();
}

void
_Py_qsbr_detach(struct _qsbr_thread_state *qsbr)
{
    assert(qsbr->seq != 0 && "already detached");

    // The release fence orders this thread's prior memory accesses before
    // the store that marks it offline.
    _Py_atomic_fence_release();
    _Py_atomic_store_uint64_relaxed(&qsbr->seq, QSBR_OFFLINE);
}
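
// Sketch of the expected pairing (an assumption about the callers, which
// live elsewhere in the runtime): a thread detaches before it blocks, so
// that it cannot hold up reclamation, and re-attaches afterwards:
//
//     _Py_qsbr_detach(tstate->qsbr);   // e.g., before a blocking call
//     ... blocking operation ...
//     _Py_qsbr_attach(tstate->qsbr);   // participate in QSBR again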

Py_ssize_t
_Py_qsbr_reserve(PyInterpreterState *interp)
{
    struct _qsbr_shared *shared = &interp->qsbr;

    PyMutex_LockFlags(&shared->mutex, _Py_LOCK_DONT_DETACH);
    struct _qsbr_thread_state *qsbr = qsbr_allocate(shared);

    if (qsbr == NULL) {
        // Growing the array reallocates it, so pause all other threads to
        // keep them from accessing the old array while it is replaced.
        _PyEval_StopTheWorld(interp);
        if (grow_thread_array(shared) == 0) {
            qsbr = qsbr_allocate(shared);
        }
        _PyEval_StartTheWorld(interp);
    }
    PyMutex_Unlock(&shared->mutex);

    if (qsbr == NULL) {
        return -1;
    }

    // Compute the index in the shared array from the pointer
    return (struct _qsbr_pad *)qsbr - shared->array;
}
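
// Sketch of how a slot is expected to be claimed for a new thread state
// (a plausible caller, not shown in this file):
//
//     Py_ssize_t idx = _Py_qsbr_reserve(interp);
//     if (idx < 0) {
//         return _PyStatus_NO_MEMORY();
//     }
//     _Py_qsbr_register(tstate, interp, idx);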

void
_Py_qsbr_register(_PyThreadStateImpl *tstate, PyInterpreterState *interp,
                  Py_ssize_t index)
{
    struct _qsbr_shared *shared = &interp->qsbr;

    // NOTE: this function is called with the runtime lock held, so we must
    // not detach while waiting for the QSBR mutex: a stop-the-world pause
    // while the runtime lock is held could lead to deadlock.
    PyMutex_LockFlags(&shared->mutex, _Py_LOCK_DONT_DETACH);
    struct _qsbr_thread_state *qsbr = &shared->array[index].qsbr;
    assert(qsbr->allocated);
    assert(qsbr->tstate == NULL);
    qsbr->tstate = (PyThreadState *)tstate;
    tstate->qsbr = qsbr;
    PyMutex_Unlock(&shared->mutex);
}

void
_Py_qsbr_unregister(_PyThreadStateImpl *tstate)
{
    struct _qsbr_thread_state *qsbr = tstate->qsbr;
    struct _qsbr_shared *shared = qsbr->shared;

    assert(qsbr->seq == 0 && "thread state must be detached");

    PyMutex_LockFlags(&shared->mutex, _Py_LOCK_DONT_DETACH);
    // Return the slot to the freelist
    qsbr->tstate = NULL;
    qsbr->allocated = false;
    qsbr->freelist_next = shared->freelist;
    shared->freelist = qsbr;
    PyMutex_Unlock(&shared->mutex);
}

void
_Py_qsbr_fini(PyInterpreterState *interp)
{
    struct _qsbr_shared *shared = &interp->qsbr;
    PyMem_RawFree(shared->array);
    shared->array = NULL;
    shared->size = 0;
    shared->freelist = NULL;
}

void
_Py_qsbr_after_fork(struct _qsbr_shared *shared,
                    struct _qsbr_thread_state *this_qsbr)
{
    _PyMutex_at_fork_reinit(&shared->mutex);

    // Only the forking thread survives in the child process: release every
    // other thread's slot back to the freelist.
    for (Py_ssize_t i = 0; i != shared->size; i++) {
        struct _qsbr_thread_state *qsbr = &shared->array[i].qsbr;
        if (qsbr != this_qsbr && qsbr->tstate != NULL) {
            qsbr->tstate = NULL;
            qsbr->allocated = false;
            qsbr->freelist_next = shared->freelist;
            shared->freelist = qsbr;
        }
    }
}