From d4dcde13c969ad20deb20c99e91880de44aac9cd Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 16 Jul 2021 10:49:42 -0400 Subject: [PATCH] Streamline rent/return on ArrayPool (#55710) * Streamline rent/return on ArrayPool - Stop storing and using a _bucketArraySizes. It's cheaper to recompute the shift on each use than it is to index into the array (with a bounds check). Plus less memory. - The 99% case is renting a positive length for pooled array sizes (especially now that we've bumped the limit up to a gig). Move the checks for lengths <= 0 to after the check for whether the length is poolable. - Move arrays into locals to enable the JIT to eliminate some bounds checks. - Use ThrowHelpers where we already have them - Move non-generic helpers out of generic class into Utilities - Consolidate buffer allocation in Rent to a single line - Reorganize TLS checks to be as early as possible - Use FastMod instead of % in per-core stacks * Address PR feedback --- .../TlsOverPerCoreLockedStacksArrayPool.cs | 286 +++++++----------- .../src/System/Buffers/Utilities.cs | 29 +- 2 files changed, 136 insertions(+), 179 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/TlsOverPerCoreLockedStacksArrayPool.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/TlsOverPerCoreLockedStacksArrayPool.cs index 438ca8e9cb204..3f75953261426 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/TlsOverPerCoreLockedStacksArrayPool.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/TlsOverPerCoreLockedStacksArrayPool.cs @@ -34,37 +34,18 @@ internal sealed partial class TlsOverPerCoreLockedStacksArrayPool : ArrayPool /// The maximum number of buffers to store in a bucket's global queue. private const int MaxBuffersPerArraySizePerCore = 8; - /// The length of arrays stored in the corresponding indices in and . - private readonly int[] _bucketArraySizes; - /// - /// An array of per-core array stacks. The slots are lazily initialized to avoid creating - /// lots of overhead for unused array sizes. - /// - private readonly PerCoreLockedStacks?[] _buckets = new PerCoreLockedStacks[NumBuckets]; /// A per-thread array of arrays, to cache one array per array size per thread. [ThreadStatic] private static T[]?[]? t_tlsBuckets; - - private int _callbackCreated; - - private static readonly bool s_trimBuffers = GetTrimBuffers(); - + /// Used to keep track of all thread local buckets for trimming if needed. + private readonly ConditionalWeakTable _allTlsBuckets = new ConditionalWeakTable(); /// - /// Used to keep track of all thread local buckets for trimming if needed + /// An array of per-core array stacks. The slots are lazily initialized to avoid creating + /// lots of overhead for unused array sizes. /// - private static readonly ConditionalWeakTable? s_allTlsBuckets = - s_trimBuffers ? new ConditionalWeakTable() : null; - - /// Initialize the pool. - public TlsOverPerCoreLockedStacksArrayPool() - { - var sizes = new int[NumBuckets]; - for (int i = 0; i < sizes.Length; i++) - { - sizes[i] = Utilities.GetMaxSizeForBucket(i); - } - _bucketArraySizes = sizes; - } + private readonly PerCoreLockedStacks?[] _buckets = new PerCoreLockedStacks[NumBuckets]; + /// Whether the callback to trim arrays in response to memory pressure has been created. + private int _trimCallbackCreated; /// Allocate a new PerCoreLockedStacks and try to store it into the array. private PerCoreLockedStacks CreatePerCoreLockedStacks(int bucketIndex) @@ -78,51 +59,38 @@ private PerCoreLockedStacks CreatePerCoreLockedStacks(int bucketIndex) public override T[] Rent(int minimumLength) { - // Arrays can't be smaller than zero. We allow requesting zero-length arrays (even though - // pooling such an array isn't valuable) as it's a valid length array, and we want the pool - // to be usable in general instead of using `new`, even for computed lengths. - if (minimumLength < 0) - { - throw new ArgumentOutOfRangeException(nameof(minimumLength)); - } - else if (minimumLength == 0) - { - // No need to log the empty array. Our pool is effectively infinite - // and we'll never allocate for rents and never store for returns. - return Array.Empty(); - } - ArrayPoolEventSource log = ArrayPoolEventSource.Log; T[]? buffer; - // Get the bucket number for the array length + // Get the bucket number for the array length. The result may be out of range of buckets, + // either for too large a value or for 0 and negative values. int bucketIndex = Utilities.SelectBucketIndex(minimumLength); - // If the array could come from a bucket... - if (bucketIndex < _buckets.Length) + // First, try to get an array from TLS if possible. + T[]?[]? tlsBuckets = t_tlsBuckets; + if (tlsBuckets is not null && (uint)bucketIndex < (uint)tlsBuckets.Length) { - // First try to get it from TLS if possible. - T[]?[]? tlsBuckets = t_tlsBuckets; - if (tlsBuckets != null) + buffer = tlsBuckets[bucketIndex]; + if (buffer is not null) { - buffer = tlsBuckets[bucketIndex]; - if (buffer != null) + tlsBuckets[bucketIndex] = null; + if (log.IsEnabled()) { - tlsBuckets[bucketIndex] = null; - if (log.IsEnabled()) - { - log.BufferRented(buffer.GetHashCode(), buffer.Length, Id, bucketIndex); - } - return buffer; + log.BufferRented(buffer.GetHashCode(), buffer.Length, Id, bucketIndex); } + return buffer; } + } - // We couldn't get a buffer from TLS, so try the global stack. - PerCoreLockedStacks? b = _buckets[bucketIndex]; - if (b != null) + // Next, try to get an array from one of the per-core stacks. + PerCoreLockedStacks?[] perCoreBuckets = _buckets; + if ((uint)bucketIndex < (uint)perCoreBuckets.Length) + { + PerCoreLockedStacks? b = perCoreBuckets[bucketIndex]; + if (b is not null) { buffer = b.TryPop(); - if (buffer != null) + if (buffer is not null) { if (log.IsEnabled()) { @@ -132,16 +100,24 @@ public override T[] Rent(int minimumLength) } } - // No buffer available. Allocate a new buffer with a size corresponding to the appropriate bucket. - buffer = GC.AllocateUninitializedArray(_bucketArraySizes[bucketIndex]); + // No buffer available. Ensure the length we'll allocate matches that of a bucket + // so we can later return it. + minimumLength = Utilities.GetMaxSizeForBucket(bucketIndex); } - else + else if (minimumLength == 0) { - // The request was for a size too large for the pool. Allocate an array of exactly the requested length. - // When it's returned to the pool, we'll simply throw it away. - buffer = GC.AllocateUninitializedArray(minimumLength); + // We allow requesting zero-length arrays (even though pooling such an array isn't valuable) + // as it's a valid length array, and we want the pool to be usable in general instead of using + // `new`, even for computed lengths. But, there's no need to log the empty array. Our pool is + // effectively infinite for empty arrays and we'll never allocate for rents and never store for returns. + return Array.Empty(); + } + else if (minimumLength < 0) + { + throw new ArgumentOutOfRangeException(nameof(minimumLength)); } + buffer = GC.AllocateUninitializedArray(minimumLength); if (log.IsEnabled()) { int bufferId = buffer.GetHashCode(); @@ -150,73 +126,58 @@ public override T[] Rent(int minimumLength) ArrayPoolEventSource.BufferAllocatedReason.OverMaximumSize : ArrayPoolEventSource.BufferAllocatedReason.PoolExhausted); } - return buffer; } public override void Return(T[] array, bool clearArray = false) { - if (array == null) + if (array is null) { - throw new ArgumentNullException(nameof(array)); + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.array); } // Determine with what bucket this array length is associated int bucketIndex = Utilities.SelectBucketIndex(array.Length); - // If we can tell that the buffer was allocated (or empty), drop it. Otherwise, check if we have space in the pool. + // Make sure our TLS buckets are initialized. Technically we could avoid doing + // this if the array being returned is erroneous or too large for the pool, but the + // former condition is an error we don't need to optimize for, and the latter is incredibly + // rare, given a max size of 1B elements. + T[]?[] tlsBuckets = t_tlsBuckets ?? InitializeTlsBucketsAndTrimming(); + + bool haveBucket = false; bool returned = true; - bool haveBucket = bucketIndex < _buckets.Length; - if (haveBucket) + if ((uint)bucketIndex < (uint)tlsBuckets.Length) { - // Clear the array if the user requests. + haveBucket = true; + + // Clear the array if the user requested it. if (clearArray) { Array.Clear(array); } - // Check to see if the buffer is the correct size for this bucket - if (array.Length != _bucketArraySizes[bucketIndex]) + // Check to see if the buffer is the correct size for this bucket. + if (array.Length != Utilities.GetMaxSizeForBucket(bucketIndex)) { throw new ArgumentException(SR.ArgumentException_BufferNotFromPool, nameof(array)); } - // Write through the TLS bucket. If there weren't any buckets, create them - // and store this array into it. If there were, store this into it, and - // if there was a previous one there, push that to the global stack. This - // helps to keep LIFO access such that the most recently pushed stack will - // be in TLS and the first to be popped next. - T[]?[]? tlsBuckets = t_tlsBuckets; - if (tlsBuckets == null) - { - t_tlsBuckets = tlsBuckets = new T[NumBuckets][]; - tlsBuckets[bucketIndex] = array; - if (s_trimBuffers) - { - Debug.Assert(s_allTlsBuckets != null, "Should be non-null iff s_trimBuffers is true"); - s_allTlsBuckets.Add(tlsBuckets, null); - if (Interlocked.Exchange(ref _callbackCreated, 1) != 1) - { - Gen2GcCallback.Register(Gen2GcCallbackFunc, this); - } - } - } - else + // Store the array into the TLS bucket. If there's already an array in it, + // push that array down into the per-core stacks, preferring to keep the latest + // one in TLS for better locality. + T[]? prev = tlsBuckets[bucketIndex]; + tlsBuckets[bucketIndex] = array; + if (prev is not null) { - T[]? prev = tlsBuckets[bucketIndex]; - tlsBuckets[bucketIndex] = array; - - if (prev != null) - { - PerCoreLockedStacks stackBucket = _buckets[bucketIndex] ?? CreatePerCoreLockedStacks(bucketIndex); - returned = stackBucket.TryPush(prev); - } + PerCoreLockedStacks stackBucket = _buckets[bucketIndex] ?? CreatePerCoreLockedStacks(bucketIndex); + returned = stackBucket.TryPush(prev); } } // Log that the buffer was returned ArrayPoolEventSource log = ArrayPoolEventSource.Log; - if (log.IsEnabled()) + if (log.IsEnabled() && array.Length != 0) { log.BufferReturned(array.GetHashCode(), array.Length, Id); if (!(haveBucket & returned)) @@ -230,11 +191,8 @@ public override void Return(T[] array, bool clearArray = false) public bool Trim() { - Debug.Assert(s_trimBuffers); - Debug.Assert(s_allTlsBuckets != null); - int milliseconds = Environment.TickCount; - MemoryPressure pressure = GetMemoryPressure(); + Utilities.MemoryPressure pressure = Utilities.GetMemoryPressure(); ArrayPoolEventSource log = ArrayPoolEventSource.Log; if (log.IsEnabled()) @@ -245,21 +203,21 @@ public bool Trim() PerCoreLockedStacks?[] perCoreBuckets = _buckets; for (int i = 0; i < perCoreBuckets.Length; i++) { - perCoreBuckets[i]?.Trim((uint)milliseconds, Id, pressure, _bucketArraySizes[i]); + perCoreBuckets[i]?.Trim((uint)milliseconds, Id, pressure, Utilities.GetMaxSizeForBucket(i)); } - if (pressure == MemoryPressure.High) + if (pressure == Utilities.MemoryPressure.High) { // Under high pressure, release all thread locals if (log.IsEnabled()) { - foreach (KeyValuePair tlsBuckets in s_allTlsBuckets) + foreach (KeyValuePair tlsBuckets in _allTlsBuckets) { T[]?[] buckets = tlsBuckets.Key; for (int i = 0; i < buckets.Length; i++) { T[]? buffer = Interlocked.Exchange(ref buckets[i], null); - if (buffer != null) + if (buffer is not null) { // As we don't want to take a perf hit in the rent path it // is possible that a buffer could be rented as we "free" it. @@ -270,10 +228,9 @@ public bool Trim() } else { - foreach (KeyValuePair tlsBuckets in s_allTlsBuckets) + foreach (KeyValuePair tlsBuckets in _allTlsBuckets) { - T[]?[] buckets = tlsBuckets.Key; - Array.Clear(buckets); + Array.Clear(tlsBuckets.Key); } } } @@ -281,61 +238,27 @@ public bool Trim() return true; } - /// - /// This is the static function that is called from the gen2 GC callback. - /// The input object is the instance we want the callback on. - /// - /// - /// The reason that we make this function static and take the instance as a parameter is that - /// we would otherwise root the instance to the Gen2GcCallback object, leaking the instance even when - /// the application no longer needs it. - /// - private static bool Gen2GcCallbackFunc(object target) - { - return ((TlsOverPerCoreLockedStacksArrayPool)(target)).Trim(); - } - - private enum MemoryPressure + private T[]?[] InitializeTlsBucketsAndTrimming() { - Low, - Medium, - High - } + Debug.Assert(t_tlsBuckets is null); - private static MemoryPressure GetMemoryPressure() - { - const double HighPressureThreshold = .90; // Percent of GC memory pressure threshold we consider "high" - const double MediumPressureThreshold = .70; // Percent of GC memory pressure threshold we consider "medium" + T[]?[]? tlsBuckets = new T[NumBuckets][]; + t_tlsBuckets = tlsBuckets; - GCMemoryInfo memoryInfo = GC.GetGCMemoryInfo(); - if (memoryInfo.MemoryLoadBytes >= memoryInfo.HighMemoryLoadThresholdBytes * HighPressureThreshold) + _allTlsBuckets.Add(tlsBuckets, null); + if (Interlocked.Exchange(ref _trimCallbackCreated, 1) == 0) { - return MemoryPressure.High; + Gen2GcCallback.Register(s => ((TlsOverPerCoreLockedStacksArrayPool)s).Trim(), this); } - else if (memoryInfo.MemoryLoadBytes >= memoryInfo.HighMemoryLoadThresholdBytes * MediumPressureThreshold) - { - return MemoryPressure.Medium; - } - return MemoryPressure.Low; - } - private static bool GetTrimBuffers() - { - // Environment uses ArrayPool, so we have to hit the API directly. -#if !CORECLR - // P/Invokes are different for CoreCLR/RT- for RT we'll not allow - // enabling/disabling for now. - return true; -#else - return CLRConfig.GetBoolValueWithFallbacks("System.Buffers.ArrayPool.TrimShared", "DOTNET_SYSTEM_BUFFERS_ARRAYPOOL_TRIMSHARED", defaultValue: true); -#endif + return tlsBuckets; } - /// - /// Stores a set of stacks of arrays, with one stack per core. - /// + /// Stores a set of stacks of arrays, with one stack per core. private sealed class PerCoreLockedStacks { + /// Number of locked stacks to employ. + private static readonly int s_lockedStackCount = Math.Min(Environment.ProcessorCount, MaxPerCorePerArraySizeStacks); /// The stacks. private readonly LockedStack[] _perCoreStacks; @@ -343,7 +266,7 @@ private sealed class PerCoreLockedStacks public PerCoreLockedStacks() { // Create the stacks. We create as many as there are processors, limited by our max. - var stacks = new LockedStack[Math.Min(Environment.ProcessorCount, MaxPerCorePerArraySizeStacks)]; + var stacks = new LockedStack[s_lockedStackCount]; for (int i = 0; i < stacks.Length; i++) { stacks[i] = new LockedStack(); @@ -372,20 +295,19 @@ public bool TryPush(T[] array) [MethodImpl(MethodImplOptions.AggressiveInlining)] public T[]? TryPop() { - // Try to pop from the associated stack first. If that fails, - // round-robin through the other stacks. + // Try to pop from the associated stack first. If that fails, round-robin through the other stacks. T[]? arr; LockedStack[] stacks = _perCoreStacks; - int index = Thread.GetCurrentProcessorId() % stacks.Length; + int index = Thread.GetCurrentProcessorId() % s_lockedStackCount; // when ProcessorCount is a power of two, the JIT can optimize this in tier 1 for (int i = 0; i < stacks.Length; i++) { - if ((arr = stacks[index].TryPop()) != null) return arr; + if ((arr = stacks[index].TryPop()) is not null) return arr; if (++index == stacks.Length) index = 0; } return null; } - public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize) + public void Trim(uint tickCount, int id, Utilities.MemoryPressure pressure, int bucketSize) { LockedStack[] stacks = _perCoreStacks; for (int i = 0; i < stacks.Length; i++) @@ -395,7 +317,7 @@ public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize } } - /// Provides a simple stack of arrays, protected by a lock. + /// Provides a simple, bounded stack of arrays, protected by a lock. private sealed class LockedStack { private readonly T[]?[] _arrays = new T[MaxBuffersPerArraySizePerCore][]; @@ -407,15 +329,18 @@ public bool TryPush(T[] array) { bool enqueued = false; Monitor.Enter(this); - if (_count < MaxBuffersPerArraySizePerCore) + T[]?[] arrays = _arrays; + int count = _count; + if ((uint)count < (uint)arrays.Length) { - if (s_trimBuffers && _count == 0) + if (count == 0) { // Stash the time the bottom of the stack was filled _firstStackItemMS = (uint)Environment.TickCount; } - _arrays[_count++] = array; + arrays[count] = array; + _count = count + 1; enqueued = true; } Monitor.Exit(this); @@ -427,16 +352,19 @@ public bool TryPush(T[] array) { T[]? arr = null; Monitor.Enter(this); - if (_count > 0) + T[]?[] arrays = _arrays; + int count = _count - 1; + if ((uint)count < (uint)arrays.Length) { - arr = _arrays[--_count]; - _arrays[_count] = null; + arr = arrays[count]; + arrays[count] = null; + _count = count; } Monitor.Exit(this); return arr; } - public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize) + public void Trim(uint tickCount, int id, Utilities.MemoryPressure pressure, int bucketSize) { const uint StackTrimAfterMS = 60 * 1000; // Trim after 60 seconds for low/moderate pressure const uint StackHighTrimAfterMS = 10 * 1000; // Trim after 10 seconds for high pressure @@ -449,8 +377,11 @@ public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize const int StackLargeTypeSize = 32; // If T is larger than this we'll trim an extra (additional) when under high pressure if (_count == 0) + { return; - uint trimTicks = pressure == MemoryPressure.High ? StackHighTrimAfterMS : StackTrimAfterMS; + } + + uint trimTicks = pressure == Utilities.MemoryPressure.High ? StackHighTrimAfterMS : StackTrimAfterMS; lock (this) { @@ -464,7 +395,7 @@ public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize int trimCount = StackLowTrimCount; switch (pressure) { - case MemoryPressure.High: + case Utilities.MemoryPressure.High: trimCount = StackHighTrimCount; // When pressure is high, aggressively trim larger arrays. @@ -481,7 +412,8 @@ public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize trimCount++; } break; - case MemoryPressure.Medium: + + case Utilities.MemoryPressure.Medium: trimCount = StackMediumTrimCount; break; } @@ -489,7 +421,7 @@ public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize while (_count > 0 && trimCount-- > 0) { T[]? array = _arrays[--_count]; - Debug.Assert(array != null, "No nulls should have been present in slots < _count."); + Debug.Assert(array is not null, "No nulls should have been present in slots < _count."); _arrays[_count] = null; if (log.IsEnabled()) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Utilities.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Utilities.cs index b7b71ec51624a..c2ca163a421b4 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Utilities.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Utilities.cs @@ -12,8 +12,6 @@ internal static class Utilities [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int SelectBucketIndex(int bufferSize) { - Debug.Assert(bufferSize >= 0); - // Buffers are bucketed so that a request between 2^(n-1) + 1 and 2^n is given a buffer of 2^n // Bucket index is log2(bufferSize - 1) with the exception that buffers between 1 and 16 bytes // are combined, and the index is slid down by 3 to compensate. @@ -29,5 +27,32 @@ internal static int GetMaxSizeForBucket(int binIndex) Debug.Assert(maxSize >= 0); return maxSize; } + + internal enum MemoryPressure + { + Low, + Medium, + High + } + + internal static MemoryPressure GetMemoryPressure() + { + const double HighPressureThreshold = .90; // Percent of GC memory pressure threshold we consider "high" + const double MediumPressureThreshold = .70; // Percent of GC memory pressure threshold we consider "medium" + + GCMemoryInfo memoryInfo = GC.GetGCMemoryInfo(); + + if (memoryInfo.MemoryLoadBytes >= memoryInfo.HighMemoryLoadThresholdBytes * HighPressureThreshold) + { + return MemoryPressure.High; + } + + if (memoryInfo.MemoryLoadBytes >= memoryInfo.HighMemoryLoadThresholdBytes * MediumPressureThreshold) + { + return MemoryPressure.Medium; + } + + return MemoryPressure.Low; + } } }