Skip to content

Commit

Permalink
Improve XmlDictionaryWriter UTF8 encoding performance (#73336)
Browse files Browse the repository at this point in the history
* Speed up text encoding

* Update implementation

* Add tests for binary xml strings

* limit counting code to 256 bit vectors

* reword comment

* rename test

* move bytesmax

* Fix bytesMax after moving variable initialization

* use unicode escape value in test

* fix test typo "*" -> "+"

* Update src/libraries/System.Private.DataContractSerialization/src/System/Xml/XmlStreamNodeWriter.cs

Co-authored-by: Stephen Toub <stoub@microsoft.com>

* Remove vectorized code from UnsafeGetUTF8Length

* Fix overflow

* use for loop which seems faster

* remove vector loop

* make sealed encoding to allow devirtualisation

* Back out some changes

* use uint for UnsafeGetUTF8Chars comparison

* revert more changes

* Fix cutoff based on new measurements

* use BinaryPrimitives.ReverseEndianness as suggested

* Update cutoff from 24 to 32 chars before calling, due to regression for text based DataContractSerializer

* Remove sealed encoding since it only improves XmlConvert

---------

Co-authored-by: Stephen Toub <stoub@microsoft.com>
  • Loading branch information
Daniel-Svensson and stephentoub authored Apr 4, 2023
1 parent b54d6ef commit e0c94f8
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 41 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers.Binary;
using System.IO;
using System.Text;
using System.Runtime.InteropServices;
using System.Runtime.Serialization;
using System.Threading.Tasks;
using System.Diagnostics;
Expand Down Expand Up @@ -330,34 +332,26 @@ protected unsafe void UnsafeWriteUnicodeChars(char* chars, int charCount)
}
}

protected unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
protected static unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
{
char* charsMax = chars + charCount;
while (chars < charsMax)
if (BitConverter.IsLittleEndian)
{
char value = *chars++;
buffer[offset++] = (byte)value;
value >>= 8;
buffer[offset++] = (byte)value;
new ReadOnlySpan<char>(chars, charCount)
.CopyTo(MemoryMarshal.Cast<byte, char>(buffer.AsSpan(offset)));
}
else
{
BinaryPrimitives.ReverseEndianness(new ReadOnlySpan<short>(chars, charCount),
MemoryMarshal.Cast<byte, short>(buffer.AsSpan(offset)));
}

return charCount * 2;
}

protected unsafe int UnsafeGetUTF8Length(char* chars, int charCount)
{
char* charsMax = chars + charCount;
while (chars < charsMax)
{
if (*chars >= 0x80)
break;

chars++;
}

if (chars == charsMax)
return charCount;

return (int)(chars - (charsMax - charCount)) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, (int)(charsMax - chars));
// Length will always be at least ( 128 / maxBytesPerChar) = 42
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, charCount);
}

protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffer, int offset)
Expand All @@ -366,39 +360,32 @@ protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffe
{
fixed (byte* _bytes = &buffer[offset])
{
byte* bytes = _bytes;
byte* bytesMax = &bytes[buffer.Length - offset];
char* charsMax = &chars[charCount];

while (true)
// Fast path for small strings, use Encoding.GetBytes for larger strings since it is faster when vectorization is possible
if ((uint)charCount < 32)
{
byte* bytes = _bytes;
char* charsMax = &chars[charCount];

while (chars < charsMax)
{
char t = *chars;
if (t >= 0x80)
break;
goto NonAscii;

*bytes = (byte)t;
bytes++;
chars++;
}
return charCount;

if (chars >= charsMax)
break;

char* charsStart = chars;
while (chars < charsMax && *chars >= 0x80)
{
chars++;
}

bytes += (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(charsStart, (int)(chars - charsStart), bytes, (int)(bytesMax - bytes));

if (chars >= charsMax)
break;
NonAscii:
byte* bytesMax = _bytes + buffer.Length - offset;
return (int)(bytes - _bytes) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, (int)(charsMax - chars), bytes, (int)(bytesMax - bytes));
}
else
{
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, charCount, _bytes, buffer.Length - offset);
}

return (int)(bytes - _bytes);
}
}
return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,71 @@ void AssertBytesWritten(Action<XmlDictionaryWriter> action, XmlBinaryNodeType no
}
}

// Verifies that WriteString on a binary XmlDictionaryWriter emits the same UTF-8
// payload as Encoding.UTF8 for strings of several lengths, both pure ASCII and
// with a single two-byte character at the very end.
[Fact]
public static void XmlBaseWriter_WriteString()
{
    // Record-type bytes this test expects in front of the text payload:
    // Chars8Text is followed by a 1-byte length, Chars16Text by a 2-byte
    // little-endian length (see the checks in ValidateWriteText below).
    const byte Chars8Text = 152;
    const byte Chars16Text = 154;

    using MemoryStream ms = new MemoryStream();
    using XmlDictionaryWriter writer = XmlDictionaryWriter.CreateBinaryWriter(ms);
    writer.WriteStartElement("root");

    // Lengths sit just below, at, and just above 8, 16 and 32 chars, plus one
    // (258) whose byte count no longer fits in a single length byte.
    int[] lengths = new[] { 7, 8, 9, 15, 16, 17, 31, 32, 36, 258 };

    // +1 because the multi-byte variant encodes to one byte more than its char count.
    byte[] buffer = new byte[lengths.Max() + 1];

    foreach (int length in lengths)
    {
        string allAscii = string.Create(length, null, (Span<char> chars, object _) =>
        {
            for (int i = 0; i < chars.Length; ++i)
            {
                chars[i] = (char)(i % 128);
            }
        });
        string multiByteLast = string.Create(length, null, (Span<char> chars, object _) =>
        {
            for (int i = 0; i < chars.Length; ++i)
            {
                chars[i] = (char)(i % 128);
            }

            chars[^1] = '\u00E4'; // 'ä' - Latin Small Letter a with Diaeresis. Latin-1 Supplement.
        });

        int numBytes = Encoding.UTF8.GetBytes(allAscii, buffer);
        Assert.True(numBytes == length, "Test setup wrong - allAscii");
        ValidateWriteText(ms, writer, allAscii, expected: buffer.AsSpan(0, numBytes));

        numBytes = Encoding.UTF8.GetBytes(multiByteLast, buffer);
        Assert.True(numBytes == length + 1, "Test setup wrong - multiByte");
        ValidateWriteText(ms, writer, multiByteLast, expected: buffer.AsSpan(0, numBytes));
    }

    // Writes `text` as the only content of a freshly emptied stream and checks
    // the record header (type byte + length) and the UTF-8 payload that follow.
    static void ValidateWriteText(MemoryStream ms, XmlDictionaryWriter writer, string text, ReadOnlySpan<byte> expected)
    {
        // Push out anything still pending, then drop it so the stream holds
        // nothing but the text record written next.
        writer.Flush();
        ms.Seek(0, SeekOrigin.Begin);
        ms.SetLength(0);
        writer.WriteString(text);
        writer.Flush();

        // Fail with a clear message instead of an index error on an empty
        // segment if the stream's buffer cannot be obtained.
        Assert.True(ms.TryGetBuffer(out ArraySegment<byte> arraySegment), "Could not access the MemoryStream buffer");
        ReadOnlySpan<byte> buffer = arraySegment;

        if (expected.Length <= byte.MaxValue)
        {
            Assert.Equal(Chars8Text, buffer[0]);
            Assert.Equal(expected.Length, buffer[1]);
            buffer = buffer.Slice(2);
        }
        else if (expected.Length <= ushort.MaxValue)
        {
            Assert.Equal(Chars16Text, buffer[0]);
            Assert.Equal(expected.Length, (int)(buffer[1]) | ((int)buffer[2] << 8));
            buffer = buffer.Slice(3);
        }
        else
        {
            Assert.Fail("Test uses a text length that is too long");
        }

        AssertExtensions.SequenceEqual(expected, buffer);
    }
}

private static bool ReadTest(MemoryStream ms, Encoding encoding, ReaderWriterFactory.ReaderWriterType rwType, byte[] byteArray)
{
ms.Position = 0;
Expand Down

0 comments on commit e0c94f8

Please sign in to comment.