diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs index 8c9700fb0e..1824e595ae 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs @@ -14,6 +14,10 @@ namespace Microsoft.Data.Analysis { internal static class BitmapHelper { + private static ReadOnlySpan BitMask => new byte[] { + 1, 2, 4, 8, 16, 32, 64, 128 + }; + // Faster to use when we already have a span since it avoids indexing public static bool IsValid(ReadOnlySpan bitMapBufferSpan, int index) { @@ -26,6 +30,98 @@ public static bool IsBitSet(byte curBitMap, int index) { return ((curBitMap >> (index & 7)) & 1) != 0; } + + public static bool IsBitClear(byte curBitMap, int index) + { + return ((curBitMap >> (index & 7)) & 1) == 0; + } + + public static bool GetBit(byte data, int index) => + ((data >> index) & 1) != 0; + + public static bool GetBit(ReadOnlySpan data, int index) => + (data[index / 8] & BitMask[index % 8]) != 0; + + public static void ClearBit(Span data, int index) + { + data[index / 8] &= (byte)~BitMask[index % 8]; + } + + public static void SetBit(Span data, int index) + { + data[index / 8] |= BitMask[index % 8]; + } + + public static void SetBit(Span data, int index, bool value) + { + int idx = index / 8; + int mod = index % 8; + data[idx] = value + ? (byte)(data[idx] | BitMask[mod]) + : (byte)(data[idx] & ~BitMask[mod]); + } + + /// <summary> + /// Set a run of bits in a span of bytes to the given value, starting + /// at a specific bit index and covering length bits. + /// </summary> + /// <param name="data">Span in which to set the bits.</param> + /// <param name="index">Bit index to start counting from.</param> + /// <param name="length">Number of bits to set.</param> + /// <param name="value">Value to assign to each bit.</param> 
+ internal static void SetBits(Span data, int index, int length, bool value) + { + if (length == 0) + return; + + int endBitIndex = checked(index + length - 1); + + // Use simpler method if there aren't many values + if (length < 20) + { + for (int i = index; i <= endBitIndex; i++) + { + SetBit(data, i, value); + } + return; + } + + // Otherwise do the work to figure out how to copy whole bytes + int startByteIndex = index / 8; + int startBitOffset = index % 8; + int endByteIndex = endBitIndex / 8; + int endBitOffset = endBitIndex % 8; + + // If the starting index and ending index are not byte-aligned, + // we'll need to set bits the slow way. If they are + // byte-aligned, and for all other bytes in the 'middle', we + // can use a faster byte-aligned set. + int fullByteStartIndex = startBitOffset == 0 ? startByteIndex : startByteIndex + 1; + int fullByteEndIndex = endBitOffset == 7 ? endByteIndex : endByteIndex - 1; + + // Bits we will be using to finish up the first byte + if (startBitOffset != 0) + { + Span slice = data.Slice(startByteIndex, 1); + for (int i = startBitOffset; i <= 7; i++) + SetBit(slice, i, value); + } + + if (fullByteEndIndex >= fullByteStartIndex) + { + Span slice = data.Slice(fullByteStartIndex, fullByteEndIndex - fullByteStartIndex + 1); + byte fill = (byte)(value ? 0xFF : 0x00); + + slice.Fill(fill); + } + + if (endBitOffset != 7) + { + Span slice = data.Slice(endByteIndex, 1); + for (int i = 0; i <= endBitOffset; i++) + SetBit(slice, i, value); + } + } } /// @@ -41,9 +137,6 @@ internal partial class PrimitiveColumnContainer : IEnumerable // A set bit implies a valid value. An unset bit => null value public IList> NullBitMapBuffers = new List>(); - // Need a way to differentiate between columns initialized with default values and those with null values in SetValidityBit - internal bool _modifyNullCountWhileIndexing = true; - public PrimitiveColumnContainer(IEnumerable values) { values = values ?? 
throw new ArgumentNullException(nameof(values)); @@ -168,27 +261,22 @@ public void AppendMany(T? value, long count) } DataFrameBuffer mutableLastBuffer = Buffers.GetOrCreateMutable(Buffers.Count - 1); + DataFrameBuffer lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1); //Calculate how many values we can additionaly allocate and not exceed the MaxCapacity - int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer.MaxCapacity - mutableLastBuffer.Length); + int originalBufferLength = mutableLastBuffer.Length; + int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer.MaxCapacity - originalBufferLength); mutableLastBuffer.IncreaseSize(allocatable); - DataFrameBuffer lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1); - int nullBufferAllocatable = (allocatable + 7) / 8; + //Calculate how many bytes we have to additionally allocate to store allocatable number of bits (need to take into account unused bits inside already allocated bytes) + int nullBufferAllocatable = (originalBufferLength + allocatable + 7) / 8 - lastNullBitMapBuffer.Length; lastNullBitMapBuffer.IncreaseSize(nullBufferAllocatable); - Length += allocatable; if (value.HasValue) { - mutableLastBuffer.RawSpan.Slice(mutableLastBuffer.Length - allocatable, allocatable).Fill(value ?? 
default); - - _modifyNullCountWhileIndexing = false; - for (long i = Length - allocatable; i < Length; i++) - { - SetValidityBit(i, value.HasValue); - } - _modifyNullCountWhileIndexing = true; + mutableLastBuffer.RawSpan.Slice(mutableLastBuffer.Length - allocatable, allocatable).Fill(value.Value); + BitmapHelper.SetBits(lastNullBitMapBuffer.RawSpan, originalBufferLength, allocatable, true); } remaining -= allocatable; @@ -247,7 +335,7 @@ private byte SetBit(byte curBitMap, int index, bool value) if (value) { newBitMap = (byte)(curBitMap | (byte)(1 << (index & 7))); //bit hack for index % 8 - if (_modifyNullCountWhileIndexing && ((curBitMap >> (index & 7)) & 1) == 0 && index < Length && NullCount > 0) + if (BitmapHelper.IsBitClear(curBitMap, index) && index < Length && NullCount > 0) { // Old value was null. NullCount--; @@ -255,12 +343,12 @@ private byte SetBit(byte curBitMap, int index, bool value) } else { - if (_modifyNullCountWhileIndexing && ((curBitMap >> (index & 7)) & 1) == 1 && index < Length) + if (BitmapHelper.IsBitSet(curBitMap, index) && index < Length) { // old value was NOT null and new value is null NullCount++; } - else if (_modifyNullCountWhileIndexing && index == Length) + else if (index == Length) { // New entry from an append NullCount++; diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs index 69eeb2d18b..375f064bbb 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs @@ -6,6 +6,7 @@ using System.Collections; using System.Collections.Generic; using System.Diagnostics; +using System.Linq; using System.Runtime.InteropServices; using Apache.Arrow; using Apache.Arrow.Types; @@ -473,22 +474,23 @@ public PrimitiveDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndice public PrimitiveDataFrameColumn Clone(IEnumerable mapIndices) { IEnumerator rows = mapIndices.GetEnumerator(); - 
PrimitiveDataFrameColumn ret = new PrimitiveDataFrameColumn(Name); - ret._columnContainer._modifyNullCountWhileIndexing = false; + PrimitiveDataFrameColumn ret = CreateNewColumn(Name); long numberOfRows = 0; while (rows.MoveNext() && numberOfRows < Length) { numberOfRows++; - long curRow = rows.Current; - T? value = _columnContainer[curRow]; - ret[curRow] = value; - if (!value.HasValue) - ret._columnContainer.NullCount++; + var curRow = rows.Current; + var value = _columnContainer[curRow]; + ret.Append(value); } - ret._columnContainer._modifyNullCountWhileIndexing = true; return ret; } + public PrimitiveDataFrameColumn Clone(IEnumerable mapIndices) + { + return Clone(mapIndices.Select(x => (long)x)); + } + internal BooleanDataFrameColumn CloneAsBooleanColumn() { PrimitiveColumnContainer newColumnContainer = _columnContainer.CloneAsBoolContainer(); diff --git a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs b/test/Microsoft.Data.Analysis.Tests/BufferTests.cs index 3a88e2eddc..16672818db 100644 --- a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/BufferTests.cs @@ -17,10 +17,6 @@ public class BufferTests [Fact] public void TestNullCounts() { - PrimitiveDataFrameColumn dataFrameColumn1 = new PrimitiveDataFrameColumn("Int1", Enumerable.Range(0, 10).Select(x => x)); - dataFrameColumn1.Append(null); - Assert.Equal(1, dataFrameColumn1.NullCount); - PrimitiveDataFrameColumn column2 = new PrimitiveDataFrameColumn("Int2"); Assert.Equal(0, column2.NullCount); @@ -65,20 +61,6 @@ public void TestNullCounts() Assert.Equal(1, strCol.NullCount); strCol[0] = null; Assert.Equal(1, strCol.NullCount); - - PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int"); - intColumn.Append(0); - intColumn.Append(1); - intColumn.Append(null); - intColumn.Append(2); - intColumn.Append(null); - intColumn.Append(3); - Assert.Equal(0, intColumn[0]); - Assert.Equal(1, intColumn[1]); - Assert.Null(intColumn[2]); - Assert.Equal(2, 
intColumn[3]); - Assert.Null(intColumn[4]); - Assert.Equal(3, intColumn[5]); } [Fact] @@ -105,9 +87,61 @@ public void TestValidity() } [Fact] - public void TestAppendMany() + public void TestAppendNullToEmptyColumn() { PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int1"); + + //Act + intColumn.Append(null); + + Assert.Equal(1, intColumn.NullCount); + Assert.Equal(1, intColumn.Length); + + for (int i = 0; i < intColumn.Length; i++) + { + Assert.False(intColumn.IsValid(i)); + } + } + + [Fact] + public void TestAppendNullToColumnWithValues() + { + PrimitiveDataFrameColumn dataFrameColumn1 = new PrimitiveDataFrameColumn("Int1", Enumerable.Range(0, 10)); + dataFrameColumn1.Append(null); + Assert.Equal(1, dataFrameColumn1.NullCount); + Assert.Equal(11, dataFrameColumn1.Length); + Assert.Null(dataFrameColumn1[10]); + } + + [Fact] + public void TestAppendToColumnWithValues() + { + PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int", Enumerable.Range(0, 10)); + + intColumn.Append(0); + intColumn.Append(1); + intColumn.Append(null); + intColumn.Append(2); + intColumn.Append(null); + intColumn.Append(3); + + Assert.Equal(16, intColumn.Length); + Assert.Equal(2, intColumn.NullCount); + + Assert.Equal(0, intColumn[10]); + Assert.Equal(1, intColumn[11]); + Assert.Null(intColumn[12]); + Assert.Equal(2, intColumn[13]); + Assert.Null(intColumn[14]); + Assert.Equal(3, intColumn[15]); + } + + [Fact] + public void TestAppendManyNullsToEmptyColumn() + { + PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int1"); + + //Act intColumn.AppendMany(null, 5); Assert.Equal(5, intColumn.NullCount); Assert.Equal(5, intColumn.Length); @@ -115,20 +149,93 @@ public void TestAppendMany() { Assert.False(intColumn.IsValid(i)); } + } + + [Fact] + public void TestAppendManyNullsToColumnWithValues() + { + //Arrange + var initialValues = new int?[] { 1, 2, null, 4, 5 }; + PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int1", 
initialValues); + + //Act + intColumn.AppendMany(null, 5); + + //Assert + Assert.Equal(6, intColumn.NullCount); + Assert.Equal(10, intColumn.Length); + + for (int i = 0; i < 5; i++) + { + Assert.Equal(initialValues[i], intColumn[i]); + } + + for (int i = 5; i < 10; i++) + { + Assert.False(intColumn.IsValid(i)); + } + } + + [Fact] + public void TestAppendManyValuesToEmptyColumn() + { + //Arrange + PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int1"); + //Act intColumn.AppendMany(5, 5); - Assert.Equal(5, intColumn.NullCount); + + //Assert + Assert.Equal(0, intColumn.NullCount); + Assert.Equal(5, intColumn.Length); + + for (int i = 0; i < intColumn.Length; i++) + { + Assert.Equal(5, intColumn[i]); + } + } + + [Fact] + public void TestAppendManyValuesToColumnWithValues() + { + //Arrange + PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int1", new int?[] { 1, 2, 3, null, null }); + + //Act + intColumn.AppendMany(5, 5); + + //Assert + Assert.Equal(2, intColumn.NullCount); Assert.Equal(10, intColumn.Length); + + Assert.Equal(3, intColumn[2]); + Assert.Null(intColumn[3]); + Assert.Null(intColumn[4]); + for (int i = 5; i < intColumn.Length; i++) { - Assert.True(intColumn.IsValid(i)); + Assert.Equal(5, intColumn[i]); } + } + + [Fact] + public void TestNullCountChange() + { + //Arrange + var initialValues = new int?[] { null, null, null, null, null, 5, 5, 5, 5, 5 }; + PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int1", initialValues); + //Act intColumn[2] = 10; + + //Assert Assert.Equal(4, intColumn.NullCount); Assert.True(intColumn.IsValid(2)); + //Act intColumn[7] = null; + + //Assert Assert.Equal(5, intColumn.NullCount); Assert.False(intColumn.IsValid(7)); } @@ -147,6 +254,143 @@ public void TestClone() Assert.Equal(intColumn[i], copy[i]); } + [Fact] + public void TestNotNullableColumnClone() + { + //Arrange + var column = new Int32DataFrameColumn("Int column", values: new[] { -1, 2, 3, 2, 1, -2 }); + + //Act + 
var clonedColumn = column.Clone(); + + //Assert + Assert.NotSame(column, clonedColumn); + Assert.Equal(column.Name, clonedColumn.Name); + Assert.Equal(column.DataType, clonedColumn.DataType); + Assert.Equal(column.NullCount, clonedColumn.NullCount); + Assert.Equal(column.Length, clonedColumn.Length); + + for (long i = 0; i < column.Length; i++) + Assert.Equal(column[i], clonedColumn[i]); + } + + [Fact] + public void TestNullableColumnClone() + { + //Arrange + var column = new Int32DataFrameColumn("Int column", values: new int?[] { -1, null, 3, 2, 1, -2 }); + + //Act + var clonedColumn = column.Clone(); + + //Assert + Assert.NotSame(column, clonedColumn); + Assert.Equal(column.Name, clonedColumn.Name); + Assert.Equal(column.DataType, clonedColumn.DataType); + Assert.Equal(column.NullCount, clonedColumn.NullCount); + Assert.Equal(column.Length, clonedColumn.Length); + + for (long i = 0; i < column.Length; i++) + Assert.Equal(column[i], clonedColumn[i]); + + } + + [Fact] + public void TestNotNullableColumnCloneWithIndicesMap() + { + //Arrange + var column = new Int32DataFrameColumn("Int column", values: new[] { 0, 5, 2, 4, 1, 3 }); + var indicesMap = new Int32DataFrameColumn("Indices", new[] { 0, 1, 2, 5, 3, 4 }); + + //Act + var clonedColumn = column.Clone(indicesMap); + + //Assert + Assert.NotSame(column, clonedColumn); + Assert.Equal(column.Name, clonedColumn.Name); + Assert.Equal(column.DataType, clonedColumn.DataType); + Assert.Equal(column.NullCount, clonedColumn.NullCount); + Assert.Equal(indicesMap.Length, clonedColumn.Length); + + for (int i = 0; i < indicesMap.Length; i++) + Assert.Equal(column[indicesMap[i].Value], clonedColumn[i]); + } + + [Fact] + public void TestNotNullableColumnCloneWithIndicesMapAsEnumerableLong() + { + //Arrange + var column = new Int32DataFrameColumn("Int column", values: new[] { 0, 5, 2, 4, 1, 3 }); + var indicesMap = new long[] { 0, 1, 2, 5, 3, 4 }; + + //Act + var clonedColumn = column.Clone(indicesMap); + + //Assert + 
Assert.NotSame(column, clonedColumn); + Assert.Equal(column.Name, clonedColumn.Name); + Assert.Equal(column.DataType, clonedColumn.DataType); + Assert.Equal(column.NullCount, clonedColumn.NullCount); + Assert.Equal(indicesMap.Length, clonedColumn.Length); + + for (int i = 0; i < indicesMap.Length; i++) + Assert.Equal(column[indicesMap[i]], clonedColumn[i]); + } + + [Fact] + public void TestNotNullableColumnCloneWithIndicesMapAsEnumerableInt() + { + //Arrange + var column = new Int32DataFrameColumn("Int column", values: new[] { 0, 5, 2, 4, 1, 3 }); + var indicesMap = new int[] { 0, 1, 2, 5, 3, 4 }; + + //Act + var clonedColumn = column.Clone(indicesMap); + + //Assert + Assert.NotSame(column, clonedColumn); + Assert.Equal(column.Name, clonedColumn.Name); + Assert.Equal(column.DataType, clonedColumn.DataType); + Assert.Equal(column.NullCount, clonedColumn.NullCount); + Assert.Equal(indicesMap.Length, clonedColumn.Length); + + for (int i = 0; i < indicesMap.Length; i++) + Assert.Equal(column[indicesMap[i]], clonedColumn[i]); + } + + + [Fact] + public void TestNullableColumnCloneWithIndicesMapAndSmallerSize() + { + //Arrange + var column = new Int32DataFrameColumn("Int column", values: new int?[] { null, 5, 2, 4, 1, 3 }); + var indicesMap = new Int32DataFrameColumn("Indices", new[] { 0, 4, 2, 5, 3 }); + + //Act + var clonedColumn = column.Clone(indicesMap); + + //Assert + Assert.NotSame(column, clonedColumn); + Assert.Equal(column.Name, clonedColumn.Name); + Assert.Equal(indicesMap.Length, clonedColumn.Length); + Assert.Equal(column.DataType, clonedColumn.DataType); + + for (int i = 0; i < indicesMap.Length; i++) + Assert.Equal(indicesMap.IsValid(i) ? 
column[indicesMap[i].Value] : null, clonedColumn[i]); + } + + [Fact] + public void TestNullableColumnCloneWithIndicesMap_OutOfRange() + { + //Arrange + var column = new Int32DataFrameColumn("Int column", values: new int?[] { null, 1, 1 }); + var indicesMap = new Int32DataFrameColumn("Indices", new[] { 0, 1, 4 }); + + //Act and assert + Assert.Throws(() => column.Clone(indicesMap)); + } + + [Fact] public void TestBasicArrowStringColumn() {