Use thread_local for loader_life_support to improve performance

swolchok · swolchok · commit a91264859827 · 2025-09-05T10:20:07.000-07:00
As explained in a new code comment, `loader_life_support` needs to be `thread_local` but does not need to be isolated to a particular interpreter because any given function call is already going to only happen on a single interpreter by definition. Performance before: - on M4 Max using pybind/pybind11_benchmark unmodified repo: ``` > python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' 5000000 loops, best of 5: 63.8 nsec per loop ``` - Linux server: ``` python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' (pytorch) 2000000 loops, best of 5: 120 nsec per loop ``` After: - M4 Max: ``` python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' 5000000 loops, best of 5: 53.1 nsec per loop ``` - Linux server: ``` > python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' (pytorch) 2000000 loops, best of 5: 101 nsec per loop ``` A quick profile with perf shows that pthread_setspecific and pthread_getspecific are gone. Open questions: - How do we determine whether we can safely use `thread_local`? I see concerns about old iOS versions on #5705 (comment) and #5709; is there anything else? - Do we have a test that covers "function called in one interpreter calls a C++ function that causes a function call in another interpreter"? I think it's fine, but can it happen? - Are we happy with what we think will happen in the case where multiple extensions compiled with and without this PR interoperate? I think it's fine -- each dispatch pushes and cleans up its own state -- but a second opinion is certainly welcome.
diff --git a/include/pybind11/detail/common.h b/include/pybind11/detail/common.h
@@ -1344,5 +1344,8 @@ constexpr
 #    define PYBIND11_BACKWARD_COMPATIBILITY_TP_DICTOFFSET
 #endif
 
+// TODO: determine which platforms cannot use thread_local.
+#define PYBIND11_CAN_USE_THREAD_LOCAL 1
+
 PYBIND11_NAMESPACE_END(detail)
 PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h
@@ -45,17 +45,48 @@ class loader_life_support {
     loader_life_support *parent = nullptr;
     std::unordered_set<PyObject *> keep_alive;
 
+#if defined(PYBIND11_CAN_USE_THREAD_LOCAL)
+    struct fake_thread_specific_storage {
+        loader_life_support *instance = nullptr;
+        loader_life_support *get() const { return instance; }
+
+        fake_thread_specific_storage &operator=(loader_life_support *pval) {
+            instance = pval;
+            return *this;
+        }
+    };
+    using loader_storage = fake_thread_specific_storage;
+#else
+    using loader_storage = thread_specific_storage<loader_life_support>;
+#endif
+
+    static loader_storage &get_stack_top() {
+#if defined(PYBIND11_CAN_USE_THREAD_LOCAL)
+        // Without this branch, loader_life_support destruction is a
+        // significant cost per function call.
+        //
+        // Observation: loader_life_support needs to be thread-local, but
+        // we don't need to go to extra effort to keep it per-interpreter
+        // (i.e., by putting it in internals) since function calls are
+        // already isolated to a single interpreter.
+        static thread_local fake_thread_specific_storage storage;
+        return storage;
+#else
+        return get_internals().loader_life_support_tls;
+#endif
+    }
+
 public:
     /// A new patient frame is created when a function is entered
     loader_life_support() {
-        auto &stack_top = get_internals().loader_life_support_tls;
+        auto &stack_top = get_stack_top();
         parent = stack_top.get();
         stack_top = this;
     }
 
     /// ... and destroyed after it returns
     ~loader_life_support() {
-        auto &stack_top = get_internals().loader_life_support_tls;
+        auto &stack_top = get_stack_top();
         if (stack_top.get() != this) {
             pybind11_fail("loader_life_support: internal error");
         }
@@ -68,7 +99,7 @@ class loader_life_support {
     /// This can only be used inside a pybind11-bound function, either by `argument_loader`
     /// at argument preparation time or by `py::cast()` at execution time.
     PYBIND11_NOINLINE static void add_patient(handle h) {
-        loader_life_support *frame = get_internals().loader_life_support_tls.get();
+        loader_life_support *frame = get_stack_top().get();
         if (!frame) {
             // NOTE: It would be nice to include the stack frames here, as this indicates
             // use of pybind11::cast<> outside the normal call framework, finding such