Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-110481: Implement inter-thread queue for biased reference counting #114824

Merged
merged 11 commits into from
Feb 9, 2024
74 changes: 74 additions & 0 deletions Include/internal/pycore_brc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#ifndef Py_INTERNAL_BRC_H
#define Py_INTERNAL_BRC_H

#include <stdint.h>
#include "pycore_llist.h" // struct llist_node
#include "pycore_lock.h" // PyMutex
#include "pycore_object_stack.h" // _PyObjectStack

#ifdef __cplusplus
extern "C" {
#endif

#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif

#ifdef Py_GIL_DISABLED

// Prime number to avoid correlations with memory addresses.
#define _Py_BRC_NUM_BUCKETS 257

// Hash table bucket
struct _brc_bucket {
// Mutex protects both the bucket and thread state queues in this bucket.
PyMutex mutex;

// Linked list of _PyThreadStateImpl objects hashed to this bucket.
struct llist_node root;
};

// Per-interpreter biased reference counting state
struct _brc_state {
// Hash table of thread states by thread-id. Thread states within a bucket
// are chained using a doubly-linked list.
struct _brc_bucket table[_Py_BRC_NUM_BUCKETS];
};

// Per-thread biased reference counting state
struct _brc_thread_state {
// Linked-list of thread states per hash bucket
struct llist_node bucket_node;

// Thread-id as determined by _PyThread_Id()
uintptr_t tid;

// Objects with refcounts to be merged (protected by bucket mutex)
_PyObjectStack objects_to_merge;

// Local stack of objects to be merged (not accessed by other threads)
_PyObjectStack local_objects_to_merge;
};

// Initialize/finalize the per-thread biased reference counting state
void _Py_brc_init_thread(PyThreadState *tstate);
void _Py_brc_remove_thread(PyThreadState *tstate);

// Initialize per-interpreter state
void _Py_brc_init_state(PyInterpreterState *interp);

void _Py_brc_after_fork(PyInterpreterState *interp);

// Enqueues an object to be merged by it's owning thread (tid). This
// steals a reference to the object.
void _Py_brc_queue_object(PyObject *ob);

// Merge the refcounts of queued objects for the current thread.
void _Py_brc_merge_refcounts(PyThreadState *tstate);

#endif /* Py_GIL_DISABLED */

#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_BRC_H */
1 change: 1 addition & 0 deletions Include/internal/pycore_ceval.h
Original file line number Diff line number Diff line change
@@ -206,6 +206,7 @@ void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame)
#define _PY_ASYNC_EXCEPTION_BIT 3
#define _PY_GC_SCHEDULED_BIT 4
#define _PY_EVAL_PLEASE_STOP_BIT 5
#define _PY_EVAL_EXPLICIT_MERGE_BIT 6

/* Reserve a few bits for future use */
#define _PY_EVAL_EVENTS_BITS 8
1 change: 1 addition & 0 deletions Include/internal/pycore_interp.h
Original file line number Diff line number Diff line change
@@ -201,6 +201,7 @@ struct _is {

#if defined(Py_GIL_DISABLED)
struct _mimalloc_interp_state mimalloc;
struct _brc_state brc; // biased reference counting state
#endif

// Per-interpreter state for the obmalloc allocator. For the main
6 changes: 6 additions & 0 deletions Include/internal/pycore_object_stack.h
Original file line number Diff line number Diff line change
@@ -32,6 +32,8 @@ _PyObjectStackChunk_New(void);
extern void
_PyObjectStackChunk_Free(_PyObjectStackChunk *);

typedef struct _Py_freelist_state _PyFreeListState;

extern void
_PyObjectStackChunk_ClearFreeList(_PyFreeListState *state, int is_finalization);

@@ -74,6 +76,10 @@ _PyObjectStack_Pop(_PyObjectStack *stack)
return obj;
}

// Merge src into dst, leaving src empty
extern void
_PyObjectStack_Merge(_PyObjectStack *dst, _PyObjectStack *src);

// Remove all items from the stack
extern void
_PyObjectStack_Clear(_PyObjectStack *stack);
2 changes: 2 additions & 0 deletions Include/internal/pycore_tstate.h
Original file line number Diff line number Diff line change
@@ -10,6 +10,7 @@ extern "C" {

#include "pycore_freelist.h" // struct _Py_freelist_state
#include "pycore_mimalloc.h" // struct _mimalloc_thread_state
#include "pycore_brc.h" // struct _brc_thread_state


// Every PyThreadState is actually allocated as a _PyThreadStateImpl. The
@@ -22,6 +23,7 @@ typedef struct _PyThreadStateImpl {
#ifdef Py_GIL_DISABLED
struct _mimalloc_thread_state mimalloc;
struct _Py_freelist_state freelist_state;
struct _brc_thread_state brc;
#endif

} _PyThreadStateImpl;
1 change: 1 addition & 0 deletions Lib/test/test_code.py
Original file line number Diff line number Diff line change
@@ -865,6 +865,7 @@ def __init__(self, f, test):
self.test = test
def run(self):
del self.f
gc_collect()
self.test.assertEqual(LAST_FREED, 500)

SetExtra(f.__code__, FREE_INDEX, ctypes.c_voidp(500))
17 changes: 15 additions & 2 deletions Lib/test/test_concurrent_futures/executor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import threading
import time
import unittest
import weakref
from concurrent import futures
from test import support
from test.support import Py_GIL_DISABLED


def mul(x, y):
@@ -83,10 +85,21 @@ def test_no_stale_references(self):
my_object_collected = threading.Event()
my_object_callback = weakref.ref(
my_object, lambda obj: my_object_collected.set())
# Deliberately discarding the future.
self.executor.submit(my_object.my_method)
fut = self.executor.submit(my_object.my_method)
del my_object

if Py_GIL_DISABLED:
# Due to biased reference counting, my_object might only be
# deallocated while the thread that created it runs -- if the
# thread is paused waiting on an event, it may not merge the
# refcount of the queued object. For that reason, we wait for the
# task to finish (so that it's no longer referenced) and force a
# GC to ensure that it is collected.
fut.result() # Wait for the task to finish.
support.gc_collect()
else:
del fut # Deliberately discard the future.

collected = my_object_collected.wait(timeout=support.SHORT_TIMEOUT)
self.assertTrue(collected,
"Stale reference not collected within timeout.")
1 change: 1 addition & 0 deletions Lib/test/test_concurrent_futures/test_process_pool.py
Original file line number Diff line number Diff line change
@@ -98,6 +98,7 @@ def test_ressources_gced_in_workers(self):

# explicitly destroy the object to ensure that EventfulGCObj.__del__()
# is called while manager is still running.
support.gc_collect()
obj = None
support.gc_collect()

2 changes: 2 additions & 0 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
@@ -405,6 +405,7 @@ PYTHON_OBJS= \
Python/ast_opt.o \
Python/ast_unparse.o \
Python/bltinmodule.o \
Python/brc.o \
Python/ceval.o \
Python/codecs.o \
Python/compile.o \
@@ -1808,6 +1809,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_ast_state.h \
$(srcdir)/Include/internal/pycore_atexit.h \
$(srcdir)/Include/internal/pycore_bitutils.h \
$(srcdir)/Include/internal/pycore_brc.h \
$(srcdir)/Include/internal/pycore_bytes_methods.h \
$(srcdir)/Include/internal/pycore_bytesobject.h \
$(srcdir)/Include/internal/pycore_call.h \
4 changes: 4 additions & 0 deletions Modules/posixmodule.c
Original file line number Diff line number Diff line change
@@ -647,6 +647,10 @@ PyOS_AfterFork_Child(void)
goto fatal_error;
}

#ifdef Py_GIL_DISABLED
_Py_brc_after_fork(tstate->interp);
#endif

_PySignal_AfterFork();

status = _PyInterpreterState_DeleteExceptMain(runtime);
3 changes: 2 additions & 1 deletion Objects/dictobject.c
Original file line number Diff line number Diff line change
@@ -5581,7 +5581,8 @@ _PyObject_MakeInstanceAttributesFromDict(PyObject *obj, PyDictOrValues *dorv)
// Don't try this at home, kids:
dict->ma_keys = NULL;
dict->ma_values = NULL;
Py_DECREF(dict);
Py_SET_REFCNT(dict, 0);
_Py_Dealloc((PyObject *)dict);
return true;
}

8 changes: 2 additions & 6 deletions Objects/object.c
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@
/* Generic object operations; and implementation of None */

#include "Python.h"
#include "pycore_brc.h" // _Py_brc_queue_object()
#include "pycore_call.h" // _PyObject_CallNoArgs()
#include "pycore_ceval.h" // _Py_EnterRecursiveCallTstate()
#include "pycore_context.h" // _PyContextTokenMissing_Type
@@ -344,12 +345,7 @@ _Py_DecRefSharedDebug(PyObject *o, const char *filename, int lineno)
&shared, new_shared));

if (should_queue) {
// TODO: the inter-thread queue is not yet implemented. For now,
// we just merge the refcount here.
Py_ssize_t refcount = _Py_ExplicitMergeRefcount(o, -1);
if (refcount == 0) {
_Py_Dealloc(o);
}
_Py_brc_queue_object(o);
}
else if (new_shared == _Py_REF_MERGED) {
// refcount is zero AND merged
1 change: 1 addition & 0 deletions PCbuild/_freeze_module.vcxproj
Original file line number Diff line number Diff line change
@@ -191,6 +191,7 @@
<ClCompile Include="..\Python\ast_opt.c" />
<ClCompile Include="..\Python\ast_unparse.c" />
<ClCompile Include="..\Python\bltinmodule.c" />
<ClCompile Include="..\Python\brc.c" />
<ClCompile Include="..\Python\bootstrap_hash.c" />
<ClCompile Include="..\Python\ceval.c" />
<ClCompile Include="..\Python\codecs.c" />
3 changes: 3 additions & 0 deletions PCbuild/_freeze_module.vcxproj.filters
Original file line number Diff line number Diff line change
@@ -46,6 +46,9 @@
<ClCompile Include="..\Python\bltinmodule.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Python\brc.c">
<Filter>Python</Filter>
</ClCompile>
<ClCompile Include="..\Objects\boolobject.c">
<Filter>Source Files</Filter>
</ClCompile>
2 changes: 2 additions & 0 deletions PCbuild/pythoncore.vcxproj
Original file line number Diff line number Diff line change
@@ -206,6 +206,7 @@
<ClInclude Include="..\Include\internal\pycore_ast_state.h" />
<ClInclude Include="..\Include\internal\pycore_atexit.h" />
<ClInclude Include="..\Include\internal\pycore_bitutils.h" />
<ClInclude Include="..\Include\internal\pycore_brc.h" />
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h" />
<ClInclude Include="..\Include\internal\pycore_bytesobject.h" />
<ClInclude Include="..\Include\internal\pycore_call.h" />
@@ -553,6 +554,7 @@
<ClCompile Include="..\Python\ast_unparse.c" />
<ClCompile Include="..\Python\bltinmodule.c" />
<ClCompile Include="..\Python\bootstrap_hash.c" />
<ClCompile Include="..\Python\brc.c" />
<ClCompile Include="..\Python\ceval.c" />
<ClCompile Include="..\Python\codecs.c" />
<ClCompile Include="..\Python\compile.c" />
6 changes: 6 additions & 0 deletions PCbuild/pythoncore.vcxproj.filters
Original file line number Diff line number Diff line change
@@ -546,6 +546,9 @@
<ClInclude Include="..\Include\internal\pycore_bitutils.h">
<Filter>Include\internal</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_brc.h">
<Filter>Include\internal</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h">
<Filter>Include\internal</Filter>
</ClInclude>
@@ -1253,6 +1256,9 @@
<ClCompile Include="..\Python\bltinmodule.c">
<Filter>Python</Filter>
</ClCompile>
<ClCompile Include="..\Python\brc.c">
<Filter>Python</Filter>
</ClCompile>
<ClCompile Include="..\Python\ceval.c">
<Filter>Python</Filter>
</ClCompile>
Loading