Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce allocations in string.Normalize #34774

Merged
merged 9 commits into from
Apr 10, 2020
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ internal static partial class Interop
internal static partial class Globalization
{
[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_IsNormalized")]
internal static extern int IsNormalized(NormalizationForm normalizationForm, string src, int srcLen);
internal static extern unsafe int IsNormalized(NormalizationForm normalizationForm, char* src, int srcLen);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_NormalizeString")]
internal static extern int NormalizeString(NormalizationForm normalizationForm, string src, int srcLen, [Out] char[] dstBuffer, int dstBufferCapacity);
internal static extern unsafe int NormalizeString(NormalizationForm normalizationForm, char* src, int srcLen, char* dstBuffer, int dstBufferCapacity);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@
// See the LICENSE file in the project root for more information.

using System.Runtime.InteropServices;
using System.Text;

internal static partial class Interop
{
internal static partial class Normaliz
{
[DllImport("Normaliz.dll", CharSet = CharSet.Unicode, SetLastError = true)]
internal static extern bool IsNormalizedString(int normForm, string source, int length);
internal static extern unsafe BOOL IsNormalizedString(NormalizationForm normForm, char* source, int length);

[DllImport("Normaliz.dll", CharSet = CharSet.Unicode, SetLastError = true)]
internal static extern int NormalizeString(
int normForm,
string source,
internal static extern unsafe int NormalizeString(
NormalizationForm normForm,
char* source,
int sourceLength,
[System.Runtime.InteropServices.OutAttribute]
char[]? destination,
char* destination,
int destinationLength);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;

namespace System.Globalization
Expand All @@ -20,7 +22,14 @@ internal static bool IsNormalized(string strInput, NormalizationForm normalizati

ValidateArguments(strInput, normalizationForm);

int ret = Interop.Globalization.IsNormalized(normalizationForm, strInput, strInput.Length);
int ret;
unsafe
MihaZupan marked this conversation as resolved.
Show resolved Hide resolved
{
fixed (char* pInput = strInput)
{
ret = Interop.Globalization.IsNormalized(normalizationForm, pInput, strInput.Length);
}
}

if (ret == -1)
{
Expand All @@ -41,26 +50,60 @@ internal static string Normalize(string strInput, NormalizationForm normalizatio

ValidateArguments(strInput, normalizationForm);

char[] buf = new char[strInput.Length];

for (int attempts = 2; attempts > 0; attempts--)
char[]? toReturn = null;
try
{
int realLen = Interop.Globalization.NormalizeString(normalizationForm, strInput, strInput.Length, buf, buf.Length);
Span<char> buffer = strInput.Length <= 512
? stackalloc char[512]
: (toReturn = ArrayPool<char>.Shared.Rent(strInput.Length));

if (realLen == -1)
for (int attempt = 0; attempt < 2; attempt++)
{
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
int realLen;
unsafe
{
fixed (char* pInput = strInput)
fixed (char* pDest = &MemoryMarshal.GetReference(buffer))
{
realLen = Interop.Globalization.NormalizeString(normalizationForm, pInput, strInput.Length, pDest, buffer.Length);
}
}

if (realLen == -1)
{
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
}

if (realLen <= buffer.Length)
{
return new string(buffer.Slice(0, realLen));
}

Debug.Assert(realLen > 512);
MihaZupan marked this conversation as resolved.
Show resolved Hide resolved

if (attempt == 0)
{
if (toReturn != null)
{
// Clear toReturn first to ensure we don't return the same buffer twice
char[] temp = toReturn;
toReturn = null;
ArrayPool<char>.Shared.Return(temp);
}

buffer = toReturn = ArrayPool<char>.Shared.Rent(realLen);
}
}

if (realLen <= buf.Length)
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
}
finally
{
if (toReturn != null)
{
return new string(buf, 0, realLen);
ArrayPool<char>.Shared.Return(toReturn);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heads up: we generally don't return buffers to the shared pool inside finally blocks. The sole exception is where we're in 100% control of every single code path that might be invoked inside the preceding try.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heads up: we generally don't return buffers to the shared pool inside finally blocks. The sole exception is where we're in 100% control of every single code path that might be invoked inside the preceding try.

We're inconsistent on this, e.g.

private async Task CopyToAsyncInternal(Stream destination, int bufferSize, CancellationToken cancellationToken)
{
byte[] buffer = ArrayPool<byte>.Shared.Rent(bufferSize);
try
{
while (true)
{
int bytesRead = await ReadAsync(new Memory<byte>(buffer), cancellationToken).ConfigureAwait(false);
if (bytesRead == 0) break;
await destination.WriteAsync(new ReadOnlyMemory<byte>(buffer, 0, bytesRead), cancellationToken).ConfigureAwait(false);
}
}
finally
{
ArrayPool<byte>.Shared.Return(buffer);
}
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know, but we should try to get it under check for new code (like this). The GitHub UI is misbehaving for me so I can't see the rest of the code. So this usage might not be problematic at all but somebody should double check.

}

buf = new char[realLen];
}

throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
}

// -----------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;
Expand All @@ -24,7 +25,14 @@ internal static bool IsNormalized(string strInput, NormalizationForm normalizati
// The only way to know if IsNormalizedString failed is through checking the Win32 last error
// IsNormalizedString pinvoke has SetLastError attribute property which will set the last error
// to 0 (ERROR_SUCCESS) before executing the calls.
bool result = Interop.Normaliz.IsNormalizedString((int)normalizationForm, strInput, strInput.Length);
Interop.BOOL result;
unsafe
{
fixed (char* pInput = strInput)
{
result = Interop.Normaliz.IsNormalizedString(normalizationForm, pInput, strInput.Length);
}
}

int lastError = Marshal.GetLastWin32Error();
switch (lastError)
Expand All @@ -51,7 +59,7 @@ internal static bool IsNormalized(string strInput, NormalizationForm normalizati
throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError));
}

return result;
return result == Interop.BOOL.TRUE;
MihaZupan marked this conversation as resolved.
Show resolved Hide resolved
}

internal static string Normalize(string strInput, NormalizationForm normalizationForm)
Expand All @@ -65,84 +73,86 @@ internal static string Normalize(string strInput, NormalizationForm normalizatio

Debug.Assert(strInput != null);

// we depend on Win32 last error when calling NormalizeString
// NormalizeString pinvoke has SetLastError attribute property which will set the last error
// to 0 (ERROR_SUCCESS) before executing the calls.

// Guess our buffer size first
int iLength = Interop.Normaliz.NormalizeString((int)normalizationForm, strInput, strInput.Length, null, 0);
if (strInput.Length == 0)
{
return string.Empty;
}

int lastError = Marshal.GetLastWin32Error();
// Could have an error (actually it'd be quite hard to have an error here)
if ((lastError != Interop.Errors.ERROR_SUCCESS) || iLength < 0)
char[]? toReturn = null;
try
{
if (lastError == Interop.Errors.ERROR_INVALID_PARAMETER)
Span<char> buffer = strInput.Length <= 512
? stackalloc char[512]
: (toReturn = ArrayPool<char>.Shared.Rent(strInput.Length));

while (true)
{
if (normalizationForm != NormalizationForm.FormC &&
normalizationForm != NormalizationForm.FormD &&
normalizationForm != NormalizationForm.FormKC &&
normalizationForm != NormalizationForm.FormKD)
// we depend on Win32 last error when calling NormalizeString
// NormalizeString pinvoke has SetLastError attribute property which will set the last error
// to 0 (ERROR_SUCCESS) before executing the calls.
int realLength;
unsafe
{
throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm));
fixed (char* pInput = strInput)
fixed (char* pDest = &MemoryMarshal.GetReference(buffer))
{
realLength = Interop.Normaliz.NormalizeString(normalizationForm, pInput, strInput.Length, pDest, buffer.Length);
}
}
int lastError = Marshal.GetLastWin32Error();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this valid even if realLength > 0?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I see the above comment now)


throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
switch (lastError)
{
case Interop.Errors.ERROR_SUCCESS:
if (realLength == 0)
{
return string.Empty;
MihaZupan marked this conversation as resolved.
Show resolved Hide resolved
}
return new string(buffer.Slice(0, realLength));

// Do appropriate stuff for the individual errors:
case Interop.Errors.ERROR_INSUFFICIENT_BUFFER:
realLength = Math.Abs(realLength);
Debug.Assert(realLength > buffer.Length, "Buffer overflow should have iLength > cBuffer.Length");
if (toReturn != null)
{
// Clear toReturn first to ensure we don't return the same buffer twice
char[] temp = toReturn;
toReturn = null;
ArrayPool<char>.Shared.Return(temp);
}
buffer = toReturn = ArrayPool<char>.Shared.Rent(realLength);
continue;

case Interop.Errors.ERROR_INVALID_PARAMETER:
case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION:
if (normalizationForm != NormalizationForm.FormC &&
normalizationForm != NormalizationForm.FormD &&
normalizationForm != NormalizationForm.FormKC &&
normalizationForm != NormalizationForm.FormKD)
{
throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm));
}

// Illegal code point or order found. Ie: FFFE or D800 D800, etc.
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));

case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY:
throw new OutOfMemoryException();

default:
// We shouldn't get here...
throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError));
}
}

// We shouldn't really be able to get here..., guessing length is
// a trivial math function...
// Can't really be Out of Memory, but just in case:
if (lastError == Interop.Errors.ERROR_NOT_ENOUGH_MEMORY)
throw new OutOfMemoryException();

// Who knows what happened? Not us!
throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError));
}

// Don't break for empty strings (only possible for D & KD and not really possible at that)
if (iLength == 0) return string.Empty;

// Someplace to stick our buffer
char[] cBuffer;

while (true)
finally
{
// (re)allocation buffer and normalize string
cBuffer = new char[iLength];

// NormalizeString pinvoke has SetLastError attribute property which will set the last error
// to 0 (ERROR_SUCCESS) before executing the calls.
iLength = Interop.Normaliz.NormalizeString((int)normalizationForm, strInput, strInput.Length, cBuffer, cBuffer.Length);
lastError = Marshal.GetLastWin32Error();

if (lastError == Interop.Errors.ERROR_SUCCESS)
break;

// Could have an error (actually it'd be quite hard to have an error here)
switch (lastError)
if (toReturn != null)
{
// Do appropriate stuff for the individual errors:
case Interop.Errors.ERROR_INSUFFICIENT_BUFFER:
iLength = Math.Abs(iLength);
Debug.Assert(iLength > cBuffer.Length, "Buffer overflow should have iLength > cBuffer.Length");
continue;

case Interop.Errors.ERROR_INVALID_PARAMETER:
case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION:
// Illegal code point or order found. Ie: FFFE or D800 D800, etc.
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));

case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY:
throw new OutOfMemoryException();

default:
// We shouldn't get here...
throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError));
ArrayPool<char>.Shared.Return(toReturn);
}
}

// Copy our buffer into our new string, which will be the appropriate size
return new string(cBuffer, 0, iLength);
}
}
}