[NativeAOT] A few scalability fixes for the fat Lock #88633
@@ -25,13 +25,7 @@ private static void MonitorEnter(object obj, ref bool lockTaken)
ObjectHeader.GetLockObject(obj) :
SyncTable.GetLockObject(resultOrIndex);

if (lck.TryAcquire(0))
{
lockTaken = true;
return;
}

Monitor.TryAcquireContended(lck, obj, Timeout.Infinite);
Author: I've renamed TryAcquireContended to TryAcquireSlow.
Monitor.TryAcquireSlow(lck, obj, Timeout.Infinite);
lockTaken = true;
}

private static void MonitorExit(object obj, ref bool lockTaken)
@@ -59,13 +53,7 @@ private static unsafe void MonitorEnterStatic(MethodTable* pMT, ref bool lockTaken)
ObjectHeader.GetLockObject(obj) :
SyncTable.GetLockObject(resultOrIndex);

if (lck.TryAcquire(0))
{
lockTaken = true;
return;
}

Monitor.TryAcquireContended(lck, obj, Timeout.Infinite);
Monitor.TryAcquireSlow(lck, obj, Timeout.Infinite);
lockTaken = true;
}

private static unsafe void MonitorExitStatic(MethodTable* pMT, ref bool lockTaken)
@@ -16,10 +16,20 @@ public sealed class Lock : IDisposable
// before going to sleep. The amount of spinning is dynamically adjusted based on past
// history of the lock and will stay in the following range.
//
private const uint MaxSpinLimit = 200;
private const uint MinSpinLimit = 10;
private const uint SpinningNotInitialized = MaxSpinLimit + 1;
private const uint SpinningDisabled = 0;
// We use doubling-up delays with a cap while spinning (1,2,4,8,16,32,64,64,64,64, ...)
// Thus 20 iterations is about 1000 spinwaits (20-50 ns each).
// Context switch costs may vary and are typically in the 2-20 usec range.
// Even if we are the only thread trying to acquire the lock, at 20-50 usec the cost of being
// blocked+awoken may not be more than 2x of what we have already spent, so that is the max CPU time
// that we will allow to burn while spinning.
//
// This may not always be optimal, but should be close enough.
// E.g. in a system consisting of exactly 2 threads, unlimited spinning may work better, but we
// will not optimize specifically for that.
private const ushort MaxSpinLimit = 20;
private const ushort MinSpinLimit = 3;
private const ushort SpinningNotInitialized = MaxSpinLimit + 1;
private const ushort SpinningDisabled = 0;

//
// We will use exponential backoff in rare cases when we need to change state atomically and cannot
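As a sanity check of the arithmetic in the new comment above (this snippet is not part of the PR, just an illustration): summing the capped doubling-up pauses over MaxSpinLimit = 20 iterations gives roughly 1000 spinwaits.

    using System;

    // Illustration only (not from the PR): total spinwaits over 20 iterations
    // with doubling-up pauses capped at 64 (1, 2, 4, ..., 64, 64, ...).
    int total = 0, pause = 1;
    for (int i = 0; i < 20; i++)
    {
        total += pause;
        pause = Math.Min(pause * 2, 64);
    }
    Console.WriteLine(total);   // 959, i.e. ~1000 spinwaits; at 20-50 ns each that is ~20-50 us,
                                // in the same ballpark as a 2-20 us context switch.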
@@ -71,8 +81,8 @@ public sealed class Lock : IDisposable
private int _owningThreadId;
private uint _recursionCount;
private int _state;
private uint _spinLimit = SpinningNotInitialized;
private int _wakeWatchDog;
private ushort _spinLimit = SpinningNotInitialized;
private short _wakeWatchDog;

// used to transfer the state when inflating thin locks
internal void InitializeLocked(int threadId, int recursionCount)
@@ -112,7 +122,7 @@ public void Acquire()
//
// Fall back to the slow path for contention
//
bool success = TryAcquireContended(currentThreadId, Timeout.Infinite);
bool success = TryAcquireSlow(currentThreadId, Timeout.Infinite);
Debug.Assert(success);
}
@@ -121,36 +131,48 @@ public bool TryAcquire(TimeSpan timeout)
return TryAcquire(WaitHandle.ToTimeoutMilliseconds(timeout));
}

public bool TryAcquire(int millisecondsTimeout, bool trackContentions = false)
public bool TryAcquire(int millisecondsTimeout)
{
ArgumentOutOfRangeException.ThrowIfLessThan(millisecondsTimeout, -1);

int currentThreadId = CurrentThreadId;
if (TryAcquireOneShot(currentThreadId))
return true;

//
// Fall back to the slow path for contention
//
return TryAcquireSlow(currentThreadId, millisecondsTimeout, trackContentions: false);
}

internal bool TryAcquireNoSpin()
{
//
// Make one quick attempt to acquire an uncontended lock
//
if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended)
{
Debug.Assert(_owningThreadId == 0);
Debug.Assert(_recursionCount == 0);
_owningThreadId = currentThreadId;
int currentThreadId = CurrentThreadId;
if (TryAcquireOneShot(currentThreadId))
return true;
}

//
// Fall back to the slow path for contention
// If we already own the lock, just increment the recursion count.
//
return TryAcquireContended(currentThreadId, millisecondsTimeout, trackContentions);
if (_owningThreadId == currentThreadId)
{
checked { _recursionCount++; }
return true;
}

return false;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool TryAcquireOneShot(int currentThreadId)
{
//
// Make one quick attempt to acquire an uncontended lock
//
if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended)
int origState = _state;
int expectedState = origState & ~(YieldToWaiters | Locked);
int newState = origState | Locked;
if (Interlocked.CompareExchange(ref _state, newState, expectedState) == expectedState)
{
Debug.Assert(_owningThreadId == 0);
Debug.Assert(_recursionCount == 0);
@@ -169,13 +191,14 @@ private static unsafe void ExponentialBackoff(uint iteration)
uint rand = ((uint)&iteration + iteration) * 2654435769u;
// set the highest bit to ensure minimum number of spins is exponentially increasing
// that is in case some stack location results in a sequence of very low spin counts
rand |= (1 << 32);
// it basically guarantees that we spin at least 1, 2, 4, 8, 16 times, and so on
rand |= (1u << 31);
Reviewer: Not relevant to this particular change, but why is this additional randomization necessary? Given that the upper bit is set, it seems the randomization effect would be minuscule, especially at higher iterations. To me it doesn't seem like the added complication would be necessary.

Author: With the upper bit set and the other bits random, the range of possible waits should still be ~50% of the minimum for the given iteration. The cost of this "randomization" is basically free next to the cost of occupying the core even for one spinwait cycle. It is cheap insurance against threads arriving at the same time or synchronizing with each other via the effects of cache misses. Some scenarios/environments may not need this, and those that do may need it less after this PR, since we are trying to ensure that we do not see too much contentious spinning in the first place.

Reviewer: I see, it seems the spin-wait backoff can yield fairly large gaps between checks at higher iterations, whereas CoreCLR's monitor's spin-wait backoff maxes out fairly quickly in favor of reducing latency. I guess a small amount of randomization may not hurt, and as you said it would probably be difficult to prove or disprove whether it actually helps.
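To make the effect concrete, here is a standalone sketch (not the PR's code; it reuses the shift-and-mask logic shown above, and assumes MaxExponentialBackoffBits is 10, the value mentioned later in the discussion). With the top bit forced on, the computed pause for iteration k always lands in [2^(k-1), 2^k), so the random input only varies the pause within that band.

    using System;

    // Standalone sketch (not the PR's code): how forcing the top bit bounds the
    // randomized pause. MaxExponentialBackoffBits = 10 is assumed here.
    static uint Spins(uint rand, uint iteration)
    {
        const uint MaxExponentialBackoffBits = 10;
        rand |= 1u << 31;                                  // guarantee the high bit
        return rand >> (byte)(32 - Math.Min(iteration, MaxExponentialBackoffBits));
    }

    // For iteration 6 the pause is always between 32 and 63 spinwaits,
    // no matter what the "random" input is:
    Console.WriteLine(Spins(0x00000000u, 6));   // 32
    Console.WriteLine(Spins(0xFFFFFFFFu, 6));   // 63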
uint spins = rand >> (byte)(32 - Math.Min(iteration, MaxExponentialBackoffBits));
Thread.SpinWaitInternal((int)spins);
}
}

private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
internal bool TryAcquireSlow(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
{
//
// If we already own the lock, just increment the recursion count.
@@ -192,20 +215,32 @@ private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
if (millisecondsTimeout == 0)
return false;

// since we have just made an attempt to acquire and failed, do a small pause
Thread.SpinWaitInternal(1);

if (_spinLimit == SpinningNotInitialized)
{
// Use RhGetProcessCpuCount directly to avoid Environment.ProcessorCount->ClassConstructorRunner->Lock->Environment.ProcessorCount cycle
if (s_processorCount == 0)
s_processorCount = RuntimeImports.RhGetProcessCpuCount();

_spinLimit = (s_processorCount > 1) ? MaxSpinLimit : SpinningDisabled;
_spinLimit = (s_processorCount > 1) ? MinSpinLimit : SpinningDisabled;
}

bool hasWaited = false;
// we will retry after waking up
while (true)
{
uint iteration = 0;

// We will count when we failed to change the state of the lock and increase pauses
// so that bursts of activity are better tolerated. This should not happen often.
uint collisions = 0;

// We will track the changes of ownership while we are trying to acquire the lock.
int oldOwner = _owningThreadId;
uint ownerChanged = 0;

uint localSpinLimit = _spinLimit;
// inner loop where we try acquiring the lock or registering as a waiter
while (true)
@@ -224,18 +259,34 @@ private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
{
int newState = oldState | Locked;
if (hasWaited)
newState = (newState - WaiterCountIncrement) & ~WaiterWoken & ~YieldToWaiters;
newState = (newState - WaiterCountIncrement) & ~(WaiterWoken | YieldToWaiters);

if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState)
{
// GOT THE LOCK!!
if (hasWaited)
_wakeWatchDog = 0;
else
// if spinning was successful, update spin count
if (iteration < localSpinLimit && localSpinLimit < MaxSpinLimit)
_spinLimit = localSpinLimit + 1;

// GOT THE LOCK!!
// now we can estimate how busy the lock is and adjust spinning accordingly
ushort spinLimit = _spinLimit;
if (ownerChanged != 0)
{
// The lock has changed ownership while we were trying to acquire it.
// It is a signal that we might want to spin less next time.
// Pursuing a lock that is being "stolen" by other threads is inefficient
// due to cache misses and unnecessary sharing of state that keeps invalidating.
if (spinLimit > MinSpinLimit)
{
_spinLimit = (ushort)(spinLimit - 1);
}
}
else if (spinLimit < MaxSpinLimit && iteration > spinLimit / 2)
{
// we used more than 50% of allowed iterations, but the lock does not look very contested,
// we can allow a bit more spinning.
_spinLimit = (ushort)(spinLimit + 1);
}
Comment on lines +270 to +288

Reviewer: A useful aspect of a spin-tuning heuristic would be to avoid spin-waiting when it just uses CPU time but does not help to avoid context-switching. I'm not sure that this is addressed by the spin-tuning heuristic in this change. I suggest creating a test case where spin-waiting is wasted, and comparing the CPU time with profiling, with and without spin-waiting. Ideally, a self-tuning spin-wait heuristic would show roughly the same CPU time usage as disabling spin-waiting.

Author: As I understand the concern, this is about a lock that is typically long-held. Note that even if a thread decides to block right away, it will still pay thousands of cycles for blocking/waking. To a first approximation, if the max spinning that we allow does not cost much more than the cost of block/awake, then the overall overhead even with the max spin should not be much worse than ideal. We may have an occasional new thread arrive in the precise time window around a lock release, spin and acquire, and thus cause the spin count to slowly drift up and reach the max. But it is also possible for such "new" threads to contest the lock with an awoken waiter and reduce the spin. Since the lock is long-held, there cannot be many spinners. If the lock is very popular, eventually all the interested threads will block. I think the possibility of really bad behavior is not high, but it may be possible to further improve the heuristics for long-held locks. It could be interesting to make some adversarial scenarios to see if we can get in trouble intentionally, and see how likely and how bad it can get.

Reviewer: It seems like the max spin count would use a not-insignificant amount of CPU time relative to context switching, though I haven't measured with this heuristic. True, but if some other less-frequent lock contention on the same lock happens to increase the spin count, the spin-tuning would not tune down the spin-waiting for a more-frequent lock contention that doesn't benefit from spin-waiting. It may not be a significant issue at the moment since the spin-tuning heuristic seems to favor less spin-waiting anyway.
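A rough shape of the experiment being suggested, as a hypothetical sketch (not part of the PR): make the lock long-held so spinning cannot help, drive it through ordinary lock statements, and compare wall-clock time against total process CPU time; CPU time well above the useful work is an estimate of what spin-waiting burned, and ideally it would be close to what the same run shows with spinning disabled.

    using System;
    using System.Diagnostics;
    using System.Threading;
    using System.Threading.Tasks;

    // Hypothetical test sketch (not from the PR): a long-held lock where spinning is wasted.
    object gate = new object();
    var sw = Stopwatch.StartNew();
    var tasks = new Task[Environment.ProcessorCount];
    for (int i = 0; i < tasks.Length; i++)
    {
        tasks[i] = Task.Run(() =>
        {
            for (int n = 0; n < 500; n++)
            {
                lock (gate)
                {
                    Thread.Sleep(1);   // hold the lock far longer than any reasonable spin
                }
            }
        });
    }
    Task.WaitAll(tasks);
    sw.Stop();

    // The lock holders mostly sleep, so nearly all measured CPU time is contention
    // overhead (spinning plus context switching).
    Console.WriteLine($"wall {sw.Elapsed}, cpu {Process.GetCurrentProcess().TotalProcessorTime}");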
Debug.Assert((_state | Locked) != 0);
Debug.Assert(_owningThreadId == 0);
Debug.Assert(_recursionCount == 0);
@@ -244,13 +295,26 @@ private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
}
}

// spinning was unsuccessful. reduce spin count.
if (iteration == localSpinLimit && localSpinLimit > MinSpinLimit)
_spinLimit = localSpinLimit - 1;

if (iteration++ < localSpinLimit)
{
Thread.SpinWaitInternal(1);
int newOwner = _owningThreadId;
if (newOwner != 0 && newOwner != oldOwner)
{
ownerChanged++;
oldOwner = newOwner;
}

if (canAcquire)
{
collisions++;
}

// We failed to acquire the lock and want to retry after a pause.
// Ideally we will retry right when the lock becomes free, but we cannot know when that will happen.
// We will use a pause that doubles up on every iteration. It will not be more than 2x worse
// than the ideal guess, while minimizing the number of retries.
// We will allow pauses up to 64~128 spinwaits, or more if there are collisions.
ExponentialBackoff(Math.Min(iteration, 6) + collisions);
Reviewer: It seems the added collisions counter would make this exponential backoff a lot more than power-of-2-exponential, and perhaps fairly quickly depending on the situation. Is there a reason for this change?

Author: There are two factors that inform the backoff here - iteration and collisions. When we see collisions (i.e. our interlocked operations start failing), it is a signal that we may still have more competition at the lock than we would like, so we back off more.

Reviewer: Should there be a cap on the collisions too, similarly to the iteration, so as to avoid perhaps rare but potentially very large backoffs?

Author: ExponentialBackoff has a total limit controlled by MaxExponentialBackoffBits.

Reviewer: Unless I'm missing something, 10 bits seems like a lot to me as a cap for one spin iteration. If we assume ...

Author: Because of the double-up increases, by the time the pause reaches 5us, we have already spent 5us spinning. Basically, the horribly long pauses will take a horribly long time to achieve. The math kind of tells us that the limit is unnecessary, yet for many people, me included, an unbounded spin would trigger an alarm, so we need some limit. Because, who knows, we may not be considering everything ... There is not much science behind this number though. I wanted to pick something that does not stand in the way in a typical scenario, yet protects from degenerate cases (i.e. not spinning for seconds). If there is any way to estimate a better number, we could change it. It would be interesting to get access to a 1024 core machine, or however high the core count goes these days, and see how high the collision numbers could possibly get when trying intentionally for it (also to see if randomization helps).
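To put numbers on the author's argument (an illustration, not from the PR; it assumes MaxExponentialBackoffBits = 10 and reuses the 20-50 ns per spinwait estimate from the comment above): with power-of-two pauses, the total time spent in all earlier pauses is about equal to the current pause, so a pause near the cap is only reachable after a comparable amount of spinning has already happened.

    using System;

    // Illustration only (assumes the 10-bit cap discussed above).
    const int MaxExponentialBackoffBits = 10;
    uint minPauseAtCap = 1u << (MaxExponentialBackoffBits - 1);   // 512 spinwaits
    uint maxPauseAtCap = (1u << MaxExponentialBackoffBits) - 1;   // 1023 spinwaits
    uint spentBeforeCap = minPauseAtCap - 1;                      // 1+2+...+256 = 511 spinwaits

    // At 20-50 ns per spinwait, a single capped pause is roughly 10-50 us, on the
    // order of a context switch, and getting there already cost about as much again.
    Console.WriteLine($"{minPauseAtCap}-{maxPauseAtCap} spinwaits per capped pause, ~{spentBeforeCap} spent before reaching it");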
continue;
}
else if (!canAcquire)
@@ -267,10 +331,11 @@ private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)

if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState)
break;

collisions++;
}

Debug.Assert(iteration >= localSpinLimit);
ExponentialBackoff(iteration - localSpinLimit);
ExponentialBackoff(collisions);
}

//
@@ -390,39 +455,31 @@ private void ReleaseCore()
{
Debug.Assert(_recursionCount == 0);
_owningThreadId = 0;

//
// Make one quick attempt to release an uncontended lock
//
if (Interlocked.CompareExchange(ref _state, Uncontended, Locked) == Locked)
int origState = Interlocked.Decrement(ref _state);
if (origState < WaiterCountIncrement || (origState & WaiterWoken) != 0)
{
return;
}

//
// We have waiters; take the slow path.
//
ReleaseContended();
AwakeWaiterIfNeeded();
}

private void ReleaseContended()
private void AwakeWaiterIfNeeded()
{
Debug.Assert(_recursionCount == 0);
Debug.Assert(_owningThreadId == 0);

uint iteration = 0;
while (true)
{
int oldState = _state;

// clear the lock bit.
int newState = oldState & ~Locked;

if (oldState >= WaiterCountIncrement && (oldState & WaiterWoken) == 0)
{
// there are waiters, and nobody has woken one.
newState |= WaiterWoken;
int newState = oldState | WaiterWoken;

int lastWakeTicks = _wakeWatchDog;
if (lastWakeTicks != 0 && Environment.TickCount - lastWakeTicks > 100)
short lastWakeTicks = _wakeWatchDog;
if (lastWakeTicks != 0 && (short)Environment.TickCount - lastWakeTicks > WaiterWatchdogTicks)
{
newState |= YieldToWaiters;
}
@@ -432,7 +489,7 @@ private void ReleaseContended()
if (lastWakeTicks == 0)
{
// nonzero timestamp of the last wake
_wakeWatchDog = Environment.TickCount | 1;
_wakeWatchDog = (short)(Environment.TickCount | 1);
}

Event.Set();
@@ -442,8 +499,7 @@ private void ReleaseContended()
else
{
// no need to wake a waiter.
if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState)
return;
return;
}

ExponentialBackoff(iteration++);
Comment (on the removed lck.TryAcquire(0) attempt in the Monitor helpers): This acquire attempt is unnecessary and is also too soon. It was probably here to pick up "unusual" cases like recursive locking, because TryAcquire(0) would defer internally to the slow path for one iteration. Calling the slow path directly is simpler and just as good.