diff --git a/src/libraries/System.Collections/src/System/Collections/BitArray.cs b/src/libraries/System.Collections/src/System/Collections/BitArray.cs index 9eed0c8150c43..c2811a3000c25 100644 --- a/src/libraries/System.Collections/src/System/Collections/BitArray.cs +++ b/src/libraries/System.Collections/src/System/Collections/BitArray.cs @@ -190,6 +190,8 @@ public unsafe BitArray(bool[] values) // Same logic as SSE2 path, however we lack MoveMask (equivalent) instruction // As a workaround, mask out the relevant bit after comparison // and combine by ORing all of them together (In this case, adding all of them does the same thing) + + // Future opportunity: those two loads can be combined into a single `LDP` (See dotnet/runtime#37014) Vector128 lowerVector = AdvSimd.LoadVector128((byte*)ptr + i); Vector128 lowerIsFalse = AdvSimd.CompareEqual(lowerVector, zero); Vector128 bitsExtracted1 = AdvSimd.And(lowerIsFalse, s_bitMask128); @@ -634,7 +636,7 @@ public unsafe BitArray Not() uint i = 0; if (Avx2.IsSupported) { - Vector256 ones = Vector256.Create(-1); + Vector256 ones = Vector256.AllBitsSet; fixed (int* ptr = thisArray) { for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) @@ -646,7 +648,7 @@ public unsafe BitArray Not() } else if (Sse2.IsSupported) { - Vector128 ones = Vector128.Create(-1); + Vector128 ones = Vector128.AllBitsSet; fixed (int* ptr = thisArray) { for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) @@ -839,11 +841,14 @@ public int Length } } - // The mask used when shuffling a single int into Vector128/256. + // The shuffle control mask used when shuffling a single int into Vector128/256. // On little endian machines, the lower 8 bits of int belong in the first byte, next lower 8 in the second and so on. + // (And on big endian, upper 8 bits in the first byte, next upper 8 bits in the second...) // We place the bytes that contain the bits to its respective byte so that we can mask out only the relevant bits later. - private static readonly Vector128 s_lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte(); - private static readonly Vector128 s_upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte(); + private static readonly Vector128 s_lowerShuffleMask_CopyToBoolArray = + BitConverter.IsLittleEndian ? Vector128.Create(0, 0x01010101_01010101).AsByte() : Vector128.Create(0x03030303_03030303, 0x02020202_02020202).AsByte(); + private static readonly Vector128 s_upperShuffleMask_CopyToBoolArray = + BitConverter.IsLittleEndian ? Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte() : Vector128.Create(0x01010101_01010101, 0).AsByte(); public unsafe void CopyTo(Array array, int index) { @@ -985,36 +990,23 @@ public unsafe void CopyTo(Array array, int index) else if (AdvSimd.IsSupported) { Vector128 ones = Vector128.Create((byte)1); + Vector128 lowerIndices = s_lowerShuffleMask_CopyToBoolArray; + Vector128 upperIndices = s_upperShuffleMask_CopyToBoolArray; + fixed (bool* destination = &boolArray[index]) { + // Future opportunity: those two stores can be combined into a single `STP` (See dotnet/runtime#33532) for (; (i + Vector128ByteCount * 2u) <= (uint)m_length; i += Vector128ByteCount * 2u) { int bits = m_array[i / (uint)BitsPerInt32]; - // Same logic as SSSE3 path, except we do not have Shuffle instruction. - // (TableVectorLookup could be an alternative - dotnet/runtime#1277) - // Instead we use chained ZIP1/2 instructions: - // (A0 is the byte containing LSB, A3 is the byte containing MSB) - // bits (on Big endian) - A3 A2 A1 A0 - // bits (Little endian) / Byte reversal - A0 A1 A2 A3 - // v1 = Vector128.Create - A0 A1 A2 A3 A0 A1 A2 A3 A0 A1 A2 A3 A0 A1 A2 A3 - // v2 = ZipLow(v1, v1) - A0 A0 A1 A1 A2 A2 A3 A3 A0 A0 A1 A1 A2 A2 A3 A3 - // v3 = ZipLow(v2, v2) - A0 A0 A0 A0 A1 A1 A1 A1 A2 A2 A2 A2 A3 A3 A3 A3 - // shuffledLower = ZipLow(v3, v3) - A0 A0 A0 A0 A0 A0 A0 A0 A1 A1 A1 A1 A1 A1 A1 A1 - // shuffledHigher = ZipHigh(v3, v3) - A2 A2 A2 A2 A2 A2 A2 A2 A3 A3 A3 A3 A3 A3 A3 A3 - if (!BitConverter.IsLittleEndian) - { - bits = BinaryPrimitives.ReverseEndianness(bits); - } - Vector128 vector = Vector128.Create(bits).AsByte(); - vector = AdvSimd.Arm64.ZipLow(vector, vector); - vector = AdvSimd.Arm64.ZipLow(vector, vector); - - Vector128 shuffledLower = AdvSimd.Arm64.ZipLow(vector, vector); + Vector128 vector = Vector128.CreateScalarUnsafe(bits).AsByte(); + + Vector128 shuffledLower = AdvSimd.Arm64.VectorTableLookup(vector, lowerIndices); Vector128 extractedLower = AdvSimd.And(shuffledLower, s_bitMask128); Vector128 normalizedLower = AdvSimd.Min(extractedLower, ones); AdvSimd.Store((byte*)destination + i, normalizedLower); - Vector128 shuffledHigher = AdvSimd.Arm64.ZipHigh(vector, vector); + Vector128 shuffledHigher = AdvSimd.Arm64.VectorTableLookup(vector, upperIndices); Vector128 extractedHigher = AdvSimd.And(shuffledHigher, s_bitMask128); Vector128 normalizedHigher = AdvSimd.Min(extractedHigher, ones); AdvSimd.Store((byte*)destination + i + Vector128.Count, normalizedHigher);