From fb56f80bd07d0d122286bbfe3848dfcce5979a6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Wed, 30 Jun 2021 19:35:13 +0200 Subject: [PATCH 1/2] Use inline Vector128.Create for constants --- .../ServerInfrastructure/StringUtilities.cs | 46 ++++++++----------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs index 87fb890d5da6..5c3746de3072 100644 --- a/src/Shared/ServerInfrastructure/StringUtilities.cs +++ b/src/Shared/ServerInfrastructure/StringUtilities.cs @@ -132,13 +132,13 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count Debug.Assert((long)end >= Vector256.Count); // PERF: so the JIT can reuse the zero from a register - Vector128 zero = Vector128.Zero; + var zero = Vector128.Zero; if (Sse2.IsSupported) { if (Avx2.IsSupported && input <= end - Vector256.Count) { - Vector256 avxZero = Vector256.Zero; + var avxZero = Vector256.Zero; do { @@ -233,8 +233,8 @@ out Unsafe.AsRef>(output), // BMI2 could be used, but this variant is faster on both Intel and AMD. if (Sse2.X64.IsSupported) { - Vector128 vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte(); - Vector128 vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64(); + var vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte(); + var vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64(); Sse2.Store((ulong*)output, vecWide); } else @@ -570,8 +570,8 @@ private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* outp // BMI2 could be used, but this variant is faster on both Intel and AMD. 
if (Sse2.X64.IsSupported) { - Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte(); - Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64(); + var vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte(); + var vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64(); Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide)); } else @@ -598,8 +598,8 @@ private static bool WidenFourAsciiBytesToUtf16AndCompareToChars(ref char charSta // BMI2 could be used, but this variant is faster on both Intel and AMD. if (Sse2.X64.IsSupported) { - Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte(); - Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64(); + var vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte(); + var vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64(); return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) == Sse2.X64.ConvertToUInt64(vecWide); } @@ -637,8 +637,8 @@ private static bool WidenTwoAsciiBytesToUtf16AndCompareToChars(ref char charStar // BMI2 could be used, but this variant is faster on both Intel and AMD. if (Sse2.IsSupported) { - Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte(); - Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32(); + var vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte(); + var vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32(); return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) == Sse2.ConvertToUInt32(vecWide); } @@ -725,34 +725,24 @@ private static void PopulateSpanWithHexSuffix(Span<char> buffer, (string? str, c if (Ssse3.IsSupported) { - // These must be explicity typed as ReadOnlySpan<byte> - // They then become a non-allocating mappings to the data section of the assembly. - // This uses C# compiler's ability to refer to static data directly. 
For more information see https://vcsjones.dev/2019/02/01/csharp-readonly-span-bytes-static - ReadOnlySpan<byte> shuffleMaskData = new byte[16] - { + var lowNibbles = Ssse3.Shuffle(Vector128.CreateScalarUnsafe(tupleNumber).AsByte(), Vector128.Create( 0xF, 0xF, 3, 0xF, 0xF, 0xF, 2, 0xF, 0xF, 0xF, 1, 0xF, 0xF, 0xF, 0, 0xF - }; + ).AsByte()); - ReadOnlySpan<byte> asciiUpperCaseData = new byte[16] - { + var highNibbles = Sse2.ShiftRightLogical(Sse2.ShiftRightLogical128BitLane(lowNibbles, 2).AsInt32(), 4).AsByte(); + var indices = Sse2.And(Sse2.Or(lowNibbles, highNibbles), Vector128.Create((byte)0xF)); + + // Lookup the hex values at the positions of the indices + var hex = Ssse3.Shuffle(Vector128.Create( (byte)'0', (byte)'1', (byte)'2', (byte)'3', (byte)'4', (byte)'5', (byte)'6', (byte)'7', (byte)'8', (byte)'9', (byte)'A', (byte)'B', (byte)'C', (byte)'D', (byte)'E', (byte)'F' - }; - - // Load from data section memory into Vector128 registers - var shuffleMask = Unsafe.ReadUnaligned<Vector128<byte>>(ref MemoryMarshal.GetReference(shuffleMaskData)); - var asciiUpperCase = Unsafe.ReadUnaligned<Vector128<byte>>(ref MemoryMarshal.GetReference(asciiUpperCaseData)); + ), indices); - var lowNibbles = Ssse3.Shuffle(Vector128.CreateScalarUnsafe(tupleNumber).AsByte(), shuffleMask); - var highNibbles = Sse2.ShiftRightLogical(Sse2.ShiftRightLogical128BitLane(lowNibbles, 2).AsInt32(), 4).AsByte(); - var indices = Sse2.And(Sse2.Or(lowNibbles, highNibbles), Vector128.Create((byte)0xF)); - // Lookup the hex values at the positions of the indices - var hex = Ssse3.Shuffle(asciiUpperCase, indices); // The high bytes (0x00) of the chars have also been converted to ascii hex '0', so clear them out. 
hex = Sse2.And(hex, Vector128.Create((ushort)0xFF).AsByte()); From 8f1e949d7a5bb3f0d2f9109456f190162300fc1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Fri, 2 Jul 2021 10:27:14 +0200 Subject: [PATCH 2/2] Add comment --- src/Shared/ServerInfrastructure/StringUtilities.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs index 5c3746de3072..fa802f8d66a9 100644 --- a/src/Shared/ServerInfrastructure/StringUtilities.cs +++ b/src/Shared/ServerInfrastructure/StringUtilities.cs @@ -725,6 +725,9 @@ private static void PopulateSpanWithHexSuffix(Span buffer, (string? str, c if (Ssse3.IsSupported) { + // The constant inline vectors are read from the data section without any additional + // moves. See https://github.com/dotnet/runtime/issues/44115 Case 1.1 for further details. + var lowNibbles = Ssse3.Shuffle(Vector128.CreateScalarUnsafe(tupleNumber).AsByte(), Vector128.Create( 0xF, 0xF, 3, 0xF, 0xF, 0xF, 2, 0xF,