Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[resubmit] BigInteger parsing optimization for large decimal string #55121

Merged
merged 16 commits into from
Mar 23, 2022
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public static unsafe uint[] Square(uint[] value)
private static int SquareThreshold = 32;
private static int AllocationThreshold = 256;

private static unsafe void Square(uint* value, int valueLength,
internal static unsafe void Square(uint* value, int valueLength,
uint* bits, int bitsLength)
{
Debug.Assert(valueLength >= 0);
Expand Down Expand Up @@ -208,7 +208,7 @@ public static unsafe uint[] Multiply(uint[] left, uint[] right)
// Mutable for unit testing...
private static int MultiplyThreshold = 32;

private static unsafe void Multiply(uint* left, int leftLength,
internal static unsafe void Multiply(uint* left, int leftLength,
uint* right, int rightLength,
uint* bits, int bitsLength)
{
Expand Down
319 changes: 253 additions & 66 deletions src/libraries/System.Runtime.Numerics/src/System/Numerics/BigNumber.cs
Original file line number Diff line number Diff line change
Expand Up @@ -494,35 +494,277 @@ private static bool HexNumberToBigInteger(ref BigNumberBuffer number, out BigInt
}
}

//
// This threshold is for choosing the algorithm to use based on the number of digits.
//
// Let N be the number of digits. If N is less than or equal to the threshold, use a naive
// algorithm with a running time of O(N^2). If it is greater than the threshold, use
// a divide-and-conquer algorithm with a running time of O(NlogN).
//
private static int s_naiveThreshold = 20000;
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
private static bool NumberToBigInteger(ref BigNumberBuffer number, out BigInteger result)
{
Span<uint> stackBuffer = stackalloc uint[BigInteger.StackallocUInt32Limit];

Span<uint> currentBuffer = stackBuffer;
int currentBufferSize = 0;

int[]? arrayFromPool = null;
int[]? rentedBuffer = null;

uint partialValue = 0;
int partialDigitCount = 0;
int totalDigitCount = 0;
int numberScale = number.scale;

const int MaxPartialDigits = 9;
const uint TenPowMaxPartial = 1000000000;

try
{
foreach (ReadOnlyMemory<char> digitsChunk in number.digits.GetChunks())
if (number.digits.Length <= s_naiveThreshold)
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given how big each path is, I wonder if it would be better to break them into 2 helper methods. Basically leaving:

if (number.digits.Length <= s_naiveThreshold)
{
    AlgorithmA(...);
}
else
{
    AlgorithmB(...);
}

-- The method is getting pretty big, which means the JIT might give up on optimizing it otherwise (haven't confirmed if it actually does).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the late reply. I think this is worth doing in terms of improving readability. I will implement it as soon as possible.

Copy link
Contributor Author

@key-moon key-moon Oct 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have implemented and benchmarked this. The results show no significant difference in speed or memory. However, readability has definitely improved.

Benchmark result
  • Job-VEBSQX: before split to methods (a9942c5)
  • Job-HLAVXS: after split to methods (460664f)
BenchmarkDotNet=v0.13.1.1611-nightly, OS=Windows 10.0.22000
11th Gen Intel Core i7-1165G7 2.80GHz, 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.100-rc.1.21463.6
  [Host]     : .NET 5.0.9 (5.0.921.35908), X64 RyuJIT
  Job-VEBSQX : .NET 6.0.0 (42.42.42.42424), X64 RyuJIT
  Job-HLAVXS : .NET 6.0.0 (42.42.42.42424), X64 RyuJIT

PowerPlanMode=00000000-0000-0000-0000-000000000000  Arguments=/p:DebugType=portable,-bl:benchmarkdotnet.binlog  IterationTime=250.0000 ms  
MaxIterationCount=20  MinIterationCount=15  WarmupCount=1  
Method Job Toolchain numberString Mean Error StdDev Median Min Max Ratio RatioSD Gen 0 Gen 1 Gen 2 Allocated
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [50000] 10.50 ms 0.350 ms 0.389 ms 10.56 ms 9.902 ms 11.42 ms 1.00 0.00 312.5000 93.7500 - 2 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [50000] 10.29 ms 0.384 ms 0.427 ms 10.34 ms 9.563 ms 11.13 ms 0.98 0.03 312.5000 93.7500 - 2 MB
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [100000] 36.77 ms 1.531 ms 1.702 ms 36.58 ms 33.684 ms 40.74 ms 1.00 0.00 1125.0000 250.0000 - 7 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [100000] 36.64 ms 1.445 ms 1.606 ms 36.23 ms 33.401 ms 39.95 ms 1.00 0.07 1166.6667 166.6667 - 7 MB
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [150000] 41.08 ms 1.350 ms 1.555 ms 40.75 ms 38.486 ms 44.25 ms 1.00 0.00 1000.0000 500.0000 - 7 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [150000] 40.97 ms 0.742 ms 0.762 ms 41.26 ms 39.155 ms 42.20 ms 1.01 0.04 1000.0000 500.0000 - 7 MB
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [200000] 129.14 ms 3.434 ms 3.527 ms 129.89 ms 120.135 ms 133.89 ms 1.00 0.00 4500.0000 500.0000 - 28 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [200000] 130.05 ms 2.571 ms 2.279 ms 130.45 ms 125.832 ms 134.43 ms 1.01 0.04 4500.0000 500.0000 - 28 MB
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [250000] 160.84 ms 8.920 ms 9.544 ms 158.21 ms 146.879 ms 179.63 ms 1.00 0.00 5500.0000 500.0000 - 34 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [250000] 162.85 ms 7.046 ms 8.114 ms 161.09 ms 145.608 ms 178.37 ms 1.01 0.08 5500.0000 500.0000 - 34 MB
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [300000] 132.28 ms 3.731 ms 3.992 ms 131.79 ms 122.929 ms 137.93 ms 1.00 0.00 3500.0000 1500.0000 1000.0000 24 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [300000] 134.72 ms 5.300 ms 6.103 ms 133.63 ms 123.651 ms 147.27 ms 1.03 0.06 3500.0000 1500.0000 1000.0000 24 MB
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [350000] 356.66 ms 20.078 ms 21.484 ms 351.60 ms 321.365 ms 406.99 ms 1.00 0.00 14000.0000 12000.0000 11000.0000 71 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [350000] 354.54 ms 17.497 ms 19.448 ms 348.05 ms 323.012 ms 396.16 ms 0.99 0.07 15000.0000 13000.0000 12000.0000 71 MB
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [400000] 524.19 ms 47.271 ms 54.437 ms 510.63 ms 461.465 ms 640.54 ms 1.00 0.00 17000.0000 3000.0000 2000.0000 107 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [400000] 553.00 ms 91.333 ms 105.179 ms 487.60 ms 444.111 ms 710.82 ms 1.06 0.21 17000.0000 3000.0000 2000.0000 107 MB
Parse Job-VEBSQX \artifacts-a9942c\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [450000] 627.40 ms 91.780 ms 105.695 ms 570.04 ms 526.129 ms 800.22 ms 1.00 0.00 24000.0000 1000.0000 - 149 MB
Parse Job-HLAVXS \artifacts-460664\bin\testhost\net6.0-windows-Release-x64\shared\Microsoft.NETCore.App\6.0.0\corerun.exe 12345678901(...)01234567890 [450000] 554.17 ms 49.428 ms 56.921 ms 542.68 ms 472.918 ms 657.23 ms 0.90 0.14 24000.0000 1000.0000 - 149 MB

if (!ProcessChunk(digitsChunk.Span, ref currentBuffer))
uint partialValue = 0;
int partialDigitCount = 0;

foreach (ReadOnlyMemory<char> digitsChunk in number.digits.GetChunks())
{
if (!ProcessChunk(digitsChunk.Span, ref currentBuffer))
{
result = default;
return false;
}
}

if (partialDigitCount > 0)
{
MultiplyAdd(ref currentBuffer, s_uint32PowersOfTen[partialDigitCount], partialValue);
}

tannergooding marked this conversation as resolved.
Show resolved Hide resolved
// Processes one chunk of digit characters.
// Digits that belong to the integer part (positions below numberScale) are accumulated
// into a partial value; every MaxPartialDigits digits that value is folded into
// currentBuffer via MultiplyAdd. Any digits in the chunk beyond the integer part must be '0'.
// A '\0' character ends processing of the chunk.
// Returns false if a nonzero digit is found past the integer part, in which case the
// captured partialValue, partialDigitCount and totalDigitCount are left unmodified;
// otherwise writes the updated values back to them and returns true.
bool ProcessChunk(ReadOnlySpan<char> chunkDigits, ref Span<uint> currentBuffer)
{
// Number of integer-part digits still expected, clamped at zero once all have been consumed.
int remainingIntDigitCount = Math.Max(numberScale - totalDigitCount, 0);
// Leading part of this chunk that still belongs to the integer part.
ReadOnlySpan<char> intDigitsSpan = chunkDigits.Slice(0, Math.Min(remainingIntDigitCount, chunkDigits.Length));

// Set when a '\0' is seen among the integer digits; the fractional check is then skipped.
bool endReached = false;

// Storing these captured variables in locals for faster access in the loop.
uint _partialValue = partialValue;
int _partialDigitCount = partialDigitCount;
int _totalDigitCount = totalDigitCount;

for (int i = 0; i < intDigitsSpan.Length; i++)
{
// intDigitsSpan is a prefix of chunkDigits, so index i addresses the same character in both.
char digitChar = chunkDigits[i];
if (digitChar == '\0')
{
endReached = true;
break;
}

// Append the decimal digit to the partial value.
_partialValue = _partialValue * 10 + (uint)(digitChar - '0');
_partialDigitCount++;
_totalDigitCount++;

// Update the buffer when enough partial digits have been accumulated.
if (_partialDigitCount == MaxPartialDigits)
{
MultiplyAdd(ref currentBuffer, TenPowMaxPartial, _partialValue);
_partialValue = 0;
_partialDigitCount = 0;
}
}

// Check for nonzero digits after the decimal point.
if (!endReached)
{
// Everything in the chunk after the integer-part prefix.
ReadOnlySpan<char> fracDigitsSpan = chunkDigits.Slice(intDigitsSpan.Length);
for (int i = 0; i < fracDigitsSpan.Length; i++)
{
char digitChar = fracDigitsSpan[i];
if (digitChar == '\0')
{
break;
}
if (digitChar != '0')
{
// A nonzero digit past the integer part: report failure to the caller.
return false;
}
}
}

// Publish the loop-local state back to the captured variables.
partialValue = _partialValue;
partialDigitCount = _partialDigitCount;
totalDigitCount = _totalDigitCount;

return true;
}
}
else
{
if (numberScale < 0)
key-moon marked this conversation as resolved.
Show resolved Hide resolved
{
result = default;
return false;
}
}
totalDigitCount = Math.Min(number.digits.Length - 1, numberScale);
int bufferSize = (totalDigitCount + MaxPartialDigits - 1) / MaxPartialDigits;

if (partialDigitCount > 0)
{
MultiplyAdd(ref currentBuffer, s_uint32PowersOfTen[partialDigitCount], partialValue);
Span<uint> buffer = new uint[bufferSize];
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
rentedBuffer = ArrayPool<int>.Shared.Rent(bufferSize);
Span<uint> newBuffer = MemoryMarshal.Cast<int, uint>(rentedBuffer);
newBuffer.Clear();

// Swap the buffers in advance so that newBuffer ends up being the rented buffer once merging is done.
int blockSize = 1;
do
{
Span<uint> tmp = buffer;
buffer = newBuffer;
newBuffer = tmp;
blockSize *= 2;
} while (blockSize < bufferSize);
tannergooding marked this conversation as resolved.
Show resolved Hide resolved

// Separate every MaxPartialDigits digits and store them in the buffer.
// Buffers are treated as little-endian. That means, the array { 234567890, 1 }
// represents the number 1234567890.
int bufferIndex = bufferSize - 1;
uint currentBlock = 0;
int shiftUntil = (totalDigitCount - 1) % MaxPartialDigits;
int remainingIntDigitCount = totalDigitCount;
foreach (ReadOnlyMemory<char> digitsChunk in number.digits.GetChunks())
{
ReadOnlySpan<char> digitsChunkSpan = digitsChunk.Span;
ReadOnlySpan<char> intDigitsSpan = digitsChunkSpan.Slice(0, Math.Min(remainingIntDigitCount, digitsChunkSpan.Length));

for (int i = 0; i < intDigitsSpan.Length; i++)
{
char digitChar = intDigitsSpan[i];
Debug.Assert(char.IsDigit(digitChar));
currentBlock *= 10;
currentBlock += unchecked((uint)(digitChar - '0'));
if (shiftUntil == 0)
{
buffer[bufferIndex] = currentBlock;
currentBlock = 0;
bufferIndex--;
shiftUntil = MaxPartialDigits;
}
shiftUntil--;
}
remainingIntDigitCount -= intDigitsSpan.Length;
key-moon marked this conversation as resolved.
Show resolved Hide resolved
Debug.Assert(0 <= remainingIntDigitCount);

ReadOnlySpan<char> fracDigitsSpan = digitsChunkSpan.Slice(intDigitsSpan.Length);
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
for (int i = 0; i < fracDigitsSpan.Length; i++)
{
char digitChar = fracDigitsSpan[i];
if (digitChar == '\0')
{
break;
}
if (digitChar != '0')
{
result = default;
return false;
}
}
}
Debug.Assert(currentBlock == 0);
Debug.Assert(bufferIndex == -1);

unsafe
{
arrayFromPool = ArrayPool<int>.Shared.Rent(1);
Span<uint> multiplier = MemoryMarshal.Cast<int, uint>(arrayFromPool);
multiplier[0] = TenPowMaxPartial;
bartonjs marked this conversation as resolved.
Show resolved Hide resolved

blockSize = 1;
while (true)
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
{
fixed (uint* bufPtr = buffer, newBufPtr = newBuffer, mulPtr = multiplier)
{
uint* curBufPtr = bufPtr;
uint* curNewBufPtr = newBufPtr;
// Merge each pair of adjacent blocks.
// When buffer represents:
// | A | B | C | D |
// Make newBuffer like:
// | A + B * multiplier | C + D * multiplier |
for (int i = 0; i < bufferSize; i += blockSize * 2)
{
int len = Math.Min(bufferSize - i, blockSize * 2);
int lowerLen = Math.Min(len, blockSize);
int upperLen = len - lowerLen;
if (upperLen != 0)
{
BigIntegerCalculator.Multiply(mulPtr, blockSize, curBufPtr + blockSize, upperLen, curNewBufPtr, len);
}

long carry = 0;
int j = 0;
for (; j < lowerLen; j++)
{
long digit = (curBufPtr[j] + carry) + curNewBufPtr[j];
curNewBufPtr[j] = unchecked((uint)digit);
carry = digit >> 32;
}
if (carry != 0)
{
while (true)
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
{
curNewBufPtr[j]++;
if (curNewBufPtr[j] != 0)
{
break;
}
j++;
}
}

curBufPtr += blockSize * 2;
curNewBufPtr += blockSize * 2;
}
}

Span<uint> tmp = buffer;
buffer = newBuffer;
newBuffer = tmp;
blockSize *= 2;

if (bufferSize <= blockSize)
{
break;
}
newBuffer.Clear();
int[]? arrayToReturn = arrayFromPool;

arrayFromPool = ArrayPool<int>.Shared.Rent(blockSize);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the last rented array is never returned, nor stored anywhere.

Copy link
Contributor Author

@key-moon key-moon Oct 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is returned in the finally block. That is very confusing, so I should make it more readable. I'm going to do that as part of splitting the function into separate methods (#55121 (comment)).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a sense of how many times this process runs... there's a lot of renting and returning going on.

Assuming that BigIntegerCalculator can't square in place, can we get it down to two rents and two returns by just renting two things that are big enough at the beginning, returning them at the end, and toggling between them in the square step?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This loop is executed ceil(log_2 bufferSize) times, which is at most 30 times. Maybe I should note that in a comment.
This costs practically nothing compared to the other operations executed in the loop, such as the square operation.

Span<uint> newMultiplier = MemoryMarshal.Cast<int, uint>(arrayFromPool);
newMultiplier.Clear();
fixed (uint* mulPtr = multiplier, newMulPtr = newMultiplier)
{
BigIntegerCalculator.Square(mulPtr, blockSize / 2, newMulPtr, blockSize);
}
multiplier = newMultiplier;
if (arrayToReturn is not null)
{
ArrayPool<int>.Shared.Return(arrayToReturn);
}
}
}

Debug.Assert(MemoryMarshal.Cast<int, uint>(rentedBuffer) == newBuffer);

// shrink buffer to the currently used portion.
// First, calculate the rough size of the buffer from the ratio that the number
// of digits follows. Then, shrink the size until there is no more space left.
// The Ratio is calculated as: log_{2^32}(10^9)
const double digitRatio = 0.934292276687070661;
currentBufferSize = Math.Min((int)(bufferSize * digitRatio) + 1, bufferSize);
Debug.Assert(buffer.Length == currentBufferSize || buffer[currentBufferSize] == 0);
while (0 <= currentBufferSize - 1 && buffer[currentBufferSize - 1] == 0)
key-moon marked this conversation as resolved.
Show resolved Hide resolved
{
currentBufferSize--;
}
currentBuffer = buffer.Slice(0, currentBufferSize);
}

int trailingZeroCount = numberScale - totalDigitCount;
Expand Down Expand Up @@ -566,65 +808,10 @@ private static bool NumberToBigInteger(ref BigNumberBuffer number, out BigIntege
{
ArrayPool<int>.Shared.Return(arrayFromPool);
}
}

bool ProcessChunk(ReadOnlySpan<char> chunkDigits, ref Span<uint> currentBuffer)
{
int remainingIntDigitCount = Math.Max(numberScale - totalDigitCount, 0);
ReadOnlySpan<char> intDigitsSpan = chunkDigits.Slice(0, Math.Min(remainingIntDigitCount, chunkDigits.Length));

bool endReached = false;

// Storing these captured variables in locals for faster access in the loop.
uint _partialValue = partialValue;
int _partialDigitCount = partialDigitCount;
int _totalDigitCount = totalDigitCount;

for (int i = 0; i < intDigitsSpan.Length; i++)
{
char digitChar = chunkDigits[i];
if (digitChar == '\0')
{
endReached = true;
break;
}

_partialValue = _partialValue * 10 + (uint)(digitChar - '0');
_partialDigitCount++;
_totalDigitCount++;

// Update the buffer when enough partial digits have been accumulated.
if (_partialDigitCount == MaxPartialDigits)
{
MultiplyAdd(ref currentBuffer, TenPowMaxPartial, _partialValue);
_partialValue = 0;
_partialDigitCount = 0;
}
}

// Check for nonzero digits after the decimal point.
if (!endReached)
if (rentedBuffer != null)
{
ReadOnlySpan<char> fracDigitsSpan = chunkDigits.Slice(intDigitsSpan.Length);
for (int i = 0; i < fracDigitsSpan.Length; i++)
{
char digitChar = fracDigitsSpan[i];
if (digitChar == '\0')
{
break;
}
if (digitChar != '0')
{
return false;
}
}
ArrayPool<int>.Shared.Return(rentedBuffer);
}

partialValue = _partialValue;
partialDigitCount = _partialDigitCount;
totalDigitCount = _totalDigitCount;

return true;
}

void MultiplyAdd(ref Span<uint> currentBuffer, uint multiplier, uint addValue)
Expand Down
Loading