Skip to content

Commit

Permalink
PrimitiveDataFrameColumn.Clone method crashes when is used with IEnum…
Browse files Browse the repository at this point in the history
…erable mapIndices argument (#6822)

* Split Test for AppendMany into 4 different tests

* Block init of null validity buffer instead of setting individual bits

* Add unit tests for PrimitiveDataFrameColumn.Clone

* Fixes #6821

* Fix

* Fix bug with AppendMany values to not empty column

* Restart unit tests

* Add more unit tests

* Fix failing unit test

* Fix code review findings
  • Loading branch information
asmirnov82 authored Sep 27, 2023
1 parent 97926a8 commit 7fe293d
Show file tree
Hide file tree
Showing 3 changed files with 381 additions and 47 deletions.
124 changes: 106 additions & 18 deletions src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ namespace Microsoft.Data.Analysis
{
internal static class BitmapHelper
{
private static ReadOnlySpan<byte> BitMask => new byte[] {
1, 2, 4, 8, 16, 32, 64, 128
};

// Faster to use when we already have a span since it avoids indexing
public static bool IsValid(ReadOnlySpan<byte> bitMapBufferSpan, int index)
{
Expand All @@ -26,6 +30,98 @@ public static bool IsBitSet(byte curBitMap, int index)
{
return ((curBitMap >> (index & 7)) & 1) != 0;
}

public static bool IsBitClear(byte curBitMap, int index)
{
return ((curBitMap >> (index & 7)) & 1) == 0;
}

public static bool GetBit(byte data, int index) =>
((data >> index) & 1) != 0;

public static bool GetBit(ReadOnlySpan<byte> data, int index) =>
(data[index / 8] & BitMask[index % 8]) != 0;

public static void ClearBit(Span<byte> data, int index)
{
data[index / 8] &= (byte)~BitMask[index % 8];
}

public static void SetBit(Span<byte> data, int index)
{
data[index / 8] |= BitMask[index % 8];
}

public static void SetBit(Span<byte> data, int index, bool value)
{
int idx = index / 8;
int mod = index % 8;
data[idx] = value
? (byte)(data[idx] | BitMask[mod])
: (byte)(data[idx] & ~BitMask[mod]);
}

/// <summary>
/// Set the number of bits in a span of bytes starting
/// at a specific index, and limiting to length.
/// </summary>
/// <param name="data">Span to set bits value.</param>
/// <param name="index">Bit index to start counting from.</param>
/// <param name="length">Maximum of bits in the span to consider.</param>
/// <param name="value">Bit value.</param>
internal static void SetBits(Span<byte> data, int index, int length, bool value)
{
if (length == 0)
return;

int endBitIndex = checked(index + length - 1);

// Use simpler method if there aren't many values
if (length < 20)
{
for (int i = index; i <= endBitIndex; i++)
{
SetBit(data, i, value);
}
return;
}

// Otherwise do the work to figure out how to copy whole bytes
int startByteIndex = index / 8;
int startBitOffset = index % 8;
int endByteIndex = endBitIndex / 8;
int endBitOffset = endBitIndex % 8;

// If the starting index and ending index are not byte-aligned,
// we'll need to set bits the slow way. If they are
// byte-aligned, and for all other bytes in the 'middle', we
// can use a faster byte-aligned set.
int fullByteStartIndex = startBitOffset == 0 ? startByteIndex : startByteIndex + 1;
int fullByteEndIndex = endBitOffset == 7 ? endByteIndex : endByteIndex - 1;

// Bits we will be using to finish up the first byte
if (startBitOffset != 0)
{
Span<byte> slice = data.Slice(startByteIndex, 1);
for (int i = startBitOffset; i <= 7; i++)
SetBit(slice, i, value);
}

if (fullByteEndIndex >= fullByteStartIndex)
{
Span<byte> slice = data.Slice(fullByteStartIndex, fullByteEndIndex - fullByteStartIndex + 1);
byte fill = (byte)(value ? 0xFF : 0x00);

slice.Fill(fill);
}

if (endBitOffset != 7)
{
Span<byte> slice = data.Slice(endByteIndex, 1);
for (int i = 0; i <= endBitOffset; i++)
SetBit(slice, i, value);
}
}
}

/// <summary>
Expand All @@ -41,9 +137,6 @@ internal partial class PrimitiveColumnContainer<T> : IEnumerable<T?>
// A set bit implies a valid value. An unset bit => null value
public IList<ReadOnlyDataFrameBuffer<byte>> NullBitMapBuffers = new List<ReadOnlyDataFrameBuffer<byte>>();

// Need a way to differentiate between columns initialized with default values and those with null values in SetValidityBit
internal bool _modifyNullCountWhileIndexing = true;

public PrimitiveColumnContainer(IEnumerable<T> values)
{
values = values ?? throw new ArgumentNullException(nameof(values));
Expand Down Expand Up @@ -168,27 +261,22 @@ public void AppendMany(T? value, long count)
}

DataFrameBuffer<T> mutableLastBuffer = Buffers.GetOrCreateMutable(Buffers.Count - 1);
DataFrameBuffer<byte> lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1);

//Calculate how many values we can additionaly allocate and not exceed the MaxCapacity
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity - mutableLastBuffer.Length);
int originalBufferLength = mutableLastBuffer.Length;
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity - originalBufferLength);
mutableLastBuffer.IncreaseSize(allocatable);

DataFrameBuffer<byte> lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1);
int nullBufferAllocatable = (allocatable + 7) / 8;
//Calculate how many bytes we have additionaly allocate to store allocatable number of bits (need to take into account unused bits inside already allocated bytes)
int nullBufferAllocatable = (originalBufferLength + allocatable + 7) / 8 - lastNullBitMapBuffer.Length;
lastNullBitMapBuffer.IncreaseSize(nullBufferAllocatable);

Length += allocatable;

if (value.HasValue)
{
mutableLastBuffer.RawSpan.Slice(mutableLastBuffer.Length - allocatable, allocatable).Fill(value ?? default);

_modifyNullCountWhileIndexing = false;
for (long i = Length - allocatable; i < Length; i++)
{
SetValidityBit(i, value.HasValue);
}
_modifyNullCountWhileIndexing = true;
mutableLastBuffer.RawSpan.Slice(mutableLastBuffer.Length - allocatable, allocatable).Fill(value.Value);
BitmapHelper.SetBits(lastNullBitMapBuffer.RawSpan, originalBufferLength, allocatable, true);
}

remaining -= allocatable;
Expand Down Expand Up @@ -247,20 +335,20 @@ private byte SetBit(byte curBitMap, int index, bool value)
if (value)
{
newBitMap = (byte)(curBitMap | (byte)(1 << (index & 7))); //bit hack for index % 8
if (_modifyNullCountWhileIndexing && ((curBitMap >> (index & 7)) & 1) == 0 && index < Length && NullCount > 0)
if (BitmapHelper.IsBitClear(curBitMap, index) && index < Length && NullCount > 0)
{
// Old value was null.
NullCount--;
}
}
else
{
if (_modifyNullCountWhileIndexing && ((curBitMap >> (index & 7)) & 1) == 1 && index < Length)
if (BitmapHelper.IsBitSet(curBitMap, index) && index < Length)
{
// old value was NOT null and new value is null
NullCount++;
}
else if (_modifyNullCountWhileIndexing && index == Length)
else if (index == Length)
{
// New entry from an append
NullCount++;
Expand Down
18 changes: 10 additions & 8 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Runtime.InteropServices;
using Apache.Arrow;
using Apache.Arrow.Types;
Expand Down Expand Up @@ -473,22 +474,23 @@ public PrimitiveDataFrameColumn<T> Clone(PrimitiveDataFrameColumn<int> mapIndice
public PrimitiveDataFrameColumn<T> Clone(IEnumerable<long> mapIndices)
{
IEnumerator<long> rows = mapIndices.GetEnumerator();
PrimitiveDataFrameColumn<T> ret = new PrimitiveDataFrameColumn<T>(Name);
ret._columnContainer._modifyNullCountWhileIndexing = false;
PrimitiveDataFrameColumn<T> ret = CreateNewColumn(Name);
long numberOfRows = 0;
while (rows.MoveNext() && numberOfRows < Length)
{
numberOfRows++;
long curRow = rows.Current;
T? value = _columnContainer[curRow];
ret[curRow] = value;
if (!value.HasValue)
ret._columnContainer.NullCount++;
var curRow = rows.Current;
var value = _columnContainer[curRow];
ret.Append(value);
}
ret._columnContainer._modifyNullCountWhileIndexing = true;
return ret;
}

public PrimitiveDataFrameColumn<T> Clone(IEnumerable<int> mapIndices)
{
return Clone(mapIndices.Select(x => (long)x));
}

internal BooleanDataFrameColumn CloneAsBooleanColumn()
{
PrimitiveColumnContainer<bool> newColumnContainer = _columnContainer.CloneAsBoolContainer();
Expand Down
Loading

0 comments on commit 7fe293d

Please sign in to comment.