@@ -23,6 +23,11 @@ typedef struct _gc_runtime_state GCState;
 #  define GC_DEBUG
 #endif
 
+// Each thread buffers the count of allocated objects in a thread-local
+// variable up to +/- this amount to reduce the overhead of updating
+// the global count.
+#define LOCAL_ALLOC_COUNT_THRESHOLD 512
+
 // Automatically choose the generation that needs collecting.
 #define GENERATION_AUTO (-1)
 
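For context, the constant added above caps how far each thread's private tally may drift before it is folded into the shared counter, so the global generations[0].count can lag the true total by at most LOCAL_ALLOC_COUNT_THRESHOLD - 1 per thread. The following standalone sketch is only an illustration of that buffering pattern under assumed names and C11 atomics; it is not CPython code:

// Illustrative sketch: a thread-local delta flushed into a shared atomic
// counter once it reaches the threshold, so the shared counter is touched
// roughly once per 512 events instead of on every event.
#include <stdatomic.h>

#define LOCAL_ALLOC_COUNT_THRESHOLD 512

static atomic_int shared_count;          // stand-in for generations[0].count
static _Thread_local int local_delta;    // stand-in for the per-thread alloc_count

static void
buffered_increment(void)
{
    if (++local_delta >= LOCAL_ALLOC_COUNT_THRESHOLD) {
        // One atomic add per threshold's worth of increments.
        atomic_fetch_add(&shared_count, local_delta);
        local_delta = 0;
    }
}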
@@ -959,6 +964,41 @@ gc_should_collect(GCState *gcstate)
             gcstate->generations[1].threshold == 0);
 }
 
+static void
+record_allocation(PyThreadState *tstate)
+{
+    struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;
+
+    // We buffer the allocation count to avoid the overhead of atomic
+    // operations for every allocation.
+    gc->alloc_count++;
+    if (gc->alloc_count >= LOCAL_ALLOC_COUNT_THRESHOLD) {
+        // TODO: Use Py_ssize_t for the generation count.
+        GCState *gcstate = &tstate->interp->gc;
+        _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+        gc->alloc_count = 0;
+
+        if (gc_should_collect(gcstate) &&
+            !_Py_atomic_load_int_relaxed(&gcstate->collecting))
+        {
+            _Py_ScheduleGC(tstate->interp);
+        }
+    }
+}
+
+static void
+record_deallocation(PyThreadState *tstate)
+{
+    struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;
+
+    gc->alloc_count--;
+    if (gc->alloc_count <= -LOCAL_ALLOC_COUNT_THRESHOLD) {
+        GCState *gcstate = &tstate->interp->gc;
+        _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+        gc->alloc_count = 0;
+    }
+}
+
 static void
 gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
 {
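The two helpers above cover both directions of the per-thread buffer: record_allocation flushes on a positive threshold and may schedule a collection, record_deallocation flushes on a negative one. Below is a rough, self-contained analogue of that behavior under several threads, hedged as a demo only: it uses pthreads and C11 atomics rather than CPython's internals, and it adds an explicit flush at thread exit so the demo's final count is exact.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define THRESHOLD 512
#define OPS_PER_THREAD 100000
#define NUM_THREADS 4

static atomic_long shared_count;        // analogue of the global generation count
static _Thread_local long local_delta;  // analogue of the per-thread alloc_count

static void
buffered_add(long n)
{
    // Accumulate locally; only touch the shared counter when the local
    // delta reaches +/- THRESHOLD, mirroring the record_* helpers above.
    local_delta += n;
    if (local_delta >= THRESHOLD || local_delta <= -THRESHOLD) {
        atomic_fetch_add(&shared_count, local_delta);
        local_delta = 0;
    }
}

static void *
worker(void *arg)
{
    (void)arg;
    for (int i = 0; i < OPS_PER_THREAD; i++) {
        buffered_add(+1);               // "allocation"
    }
    for (int i = 0; i < OPS_PER_THREAD / 2; i++) {
        buffered_add(-1);               // "deallocation"
    }
    // Flush the remainder so the shared count is exact at thread exit.
    atomic_fetch_add(&shared_count, local_delta);
    local_delta = 0;
    return NULL;
}

int
main(void)
{
    pthread_t threads[NUM_THREADS];
    for (int i = 0; i < NUM_THREADS; i++) {
        pthread_create(&threads[i], NULL, worker, NULL);
    }
    for (int i = 0; i < NUM_THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
    printf("shared_count = %ld (expected %ld)\n",
           atomic_load(&shared_count),
           (long)NUM_THREADS * OPS_PER_THREAD / 2);
    return 0;
}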
@@ -981,6 +1021,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
         }
     }
 
+    // Record the number of live GC objects
+    interp->gc.long_lived_total = state->long_lived_total;
+
     // Clear weakrefs and enqueue callbacks (but do not call them).
     clear_weakrefs(state);
     _PyEval_StartTheWorld(interp);
@@ -1090,7 +1133,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
 
     m = state.collected;
     n = state.uncollectable;
-    gcstate->long_lived_total = state.long_lived_total;
 
     if (gcstate->debug & _PyGC_DEBUG_STATS) {
         double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
@@ -1530,15 +1572,7 @@ _Py_ScheduleGC(PyInterpreterState *interp)
 void
 _PyObject_GC_Link(PyObject *op)
 {
-    PyThreadState *tstate = _PyThreadState_GET();
-    GCState *gcstate = &tstate->interp->gc;
-    gcstate->generations[0].count++;
-
-    if (gc_should_collect(gcstate) &&
-        !_Py_atomic_load_int_relaxed(&gcstate->collecting))
-    {
-        _Py_ScheduleGC(tstate->interp);
-    }
+    record_allocation(_PyThreadState_GET());
 }
 
 void
@@ -1564,7 +1598,7 @@ gc_alloc(PyTypeObject *tp, size_t basicsize, size_t presize)
         ((PyObject **)mem)[1] = NULL;
     }
     PyObject *op = (PyObject *)(mem + presize);
-    _PyObject_GC_Link(op);
+    record_allocation(tstate);
     return op;
 }
 
@@ -1646,10 +1680,9 @@ PyObject_GC_Del(void *op)
         PyErr_SetRaisedException(exc);
 #endif
     }
-    GCState *gcstate = get_gc_state();
-    if (gcstate->generations[0].count > 0) {
-        gcstate->generations[0].count--;
-    }
+
+    record_deallocation(_PyThreadState_GET());
+
     PyObject_Free(((char *)op)-presize);
 }
 