diff --git a/csharp/src/Apache.Arrow/Arrays/Array.cs b/csharp/src/Apache.Arrow/Arrays/Array.cs index 0838134b19c6d..4abe63e05ad83 100644 --- a/csharp/src/Apache.Arrow/Arrays/Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Array.cs @@ -31,7 +31,7 @@ protected Array(ArrayData data) public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public ArrowBuffer NullBitmapBuffer => Data.Buffers[0]; diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs index 55d77f598c4e4..cdb6ed6b39418 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs @@ -15,7 +15,6 @@ using Apache.Arrow.Memory; using Apache.Arrow.Types; -using Google.FlatBuffers; using System; using System.Collections.Generic; using System.Linq; @@ -28,12 +27,30 @@ public sealed class ArrayData : IDisposable public readonly IArrowType DataType; public readonly int Length; - public readonly int NullCount; + + /// + /// The number of null values in the Array. May be -1 if the null count has not been computed. + /// + public int NullCount; + public readonly int Offset; public readonly ArrowBuffer[] Buffers; public readonly ArrayData[] Children; public readonly ArrayData Dictionary; // Only used for dictionary type + /// + /// Get the number of null values in the Array, computing the count if required. + /// + public int GetNullCount() + { + if (NullCount == RecalculateNullCount) + { + NullCount = ComputeNullCount(); + } + + return NullCount; + } + // This is left for compatibility with lower version binaries // before the dictionary type was supported. public ArrayData( @@ -111,7 +128,25 @@ public ArrayData Slice(int offset, int length) length = Math.Min(Length - offset, length); offset += Offset; - return new ArrayData(DataType, length, RecalculateNullCount, offset, Buffers, Children, Dictionary); + int nullCount; + if (NullCount == 0) + { + nullCount = 0; + } + else if (NullCount == Length) + { + nullCount = length; + } + else if (offset == Offset && length == Length) + { + nullCount = NullCount; + } + else + { + nullCount = RecalculateNullCount; + } + + return new ArrayData(DataType, length, nullCount, offset, Buffers, Children, Dictionary); } public ArrayData Clone(MemoryAllocator allocator = default) @@ -125,5 +160,24 @@ public ArrayData Clone(MemoryAllocator allocator = default) Children?.Select(b => b.Clone(allocator))?.ToArray(), Dictionary?.Clone(allocator)); } + + private int ComputeNullCount() + { + if (DataType.TypeId == ArrowTypeId.Union) + { + return UnionArray.ComputeNullCount(this); + } + + if (Buffers == null || Buffers.Length == 0 || Buffers[0].IsEmpty) + { + return 0; + } + + // Note: Dictionary arrays may be logically null if there is a null in the dictionary values, + // but this isn't accounted for by the IArrowArray.IsNull implementation, + // so we maintain consistency with that behaviour here. + + return Length - BitUtility.CountBits(Buffers[0].Span, Offset, Length); + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 698d74e4bac84..84658a5fab812 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -71,7 +71,7 @@ public ArrayDataConcatenationVisitor(IReadOnlyList arrayDataList, Mem foreach (ArrayData arrayData in _arrayDataList) { _totalLength += arrayData.Length; - _totalNullCount += arrayData.NullCount; + _totalNullCount += arrayData.GetNullCount(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs index b6b61c560e482..459a30e22115f 100644 --- a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs @@ -53,5 +53,28 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index) { return fieldArray.IsValid(ValueOffsets[index]); } + + internal new static int ComputeNullCount(ArrayData data) + { + var offset = data.Offset; + var length = data.Length; + var typeIds = data.Buffers[0].Span.Slice(offset, length); + var valueOffsets = data.Buffers[1].Span.CastTo().Slice(offset, length); + var childArrays = new IArrowArray[data.Children.Length]; + for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx) + { + childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]); + } + + var nullCount = 0; + for (var i = 0; i < length; ++i) + { + var typeId = typeIds[i]; + var valueOffset = valueOffsets[i]; + nullCount += childArrays[typeId].IsNull(valueOffset) ? 1 : 0; + } + + return nullCount; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/NullArray.cs b/csharp/src/Apache.Arrow/Arrays/NullArray.cs index 762540065c929..7f3e183829243 100644 --- a/csharp/src/Apache.Arrow/Arrays/NullArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/NullArray.cs @@ -95,7 +95,7 @@ public NullArray(int length) public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public void Dispose() { } public bool IsNull(int index) => true; diff --git a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs index 07d36e25cfc23..ef55786f01a4a 100644 --- a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs @@ -47,5 +47,26 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index) { return fieldArray.IsValid(index); } + + internal new static int ComputeNullCount(ArrayData data) + { + var offset = data.Offset; + var length = data.Length; + var typeIds = data.Buffers[0].Span.Slice(offset, length); + var childArrays = new IArrowArray[data.Children.Length]; + for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx) + { + childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]); + } + + var nullCount = 0; + for (var i = 0; i < data.Length; ++i) + { + var typeId = typeIds[i]; + nullCount += childArrays[typeId].IsNull(offset + i) ? 1 : 0; + } + + return nullCount; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs index 5fcb276655162..f96f527135351 100644 --- a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs @@ -41,7 +41,7 @@ public abstract class UnionArray : IArrowArray public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public bool IsValid(int index) => NullCount == 0 || FieldIsValid(Fields[TypeIds[index]], index); @@ -91,6 +91,16 @@ protected static void ValidateMode(UnionMode expected, UnionMode actual) } } + internal static int ComputeNullCount(ArrayData data) + { + return ((UnionType)data.DataType).Mode switch + { + UnionMode.Sparse => SparseUnionArray.ComputeNullCount(data), + UnionMode.Dense => DenseUnionArray.ComputeNullCount(data), + _ => throw new InvalidOperationException("unknown union mode in null count computation") + }; + } + private IReadOnlyList InitializeFields() { IArrowArray[] result = new IArrowArray[Data.Children.Length]; diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 03059eaf5d4df..b241fdfea3bda 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -115,7 +115,7 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr { cArray->length = array.Length; cArray->offset = array.Offset; - cArray->null_count = array.NullCount; + cArray->null_count = array.NullCount; // The C Data interface allows the null count to be -1 cArray->release = ReleaseArrayPtr; cArray->private_data = MakePrivateData(sharedOwner); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index b002f8c8b1578..7b319b03d790c 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -376,7 +376,7 @@ private void CreateSelfAndChildrenFieldNodes(ArrayData data) CreateSelfAndChildrenFieldNodes(data.Children[i]); } } - Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.NullCount); + Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.GetNullCount()); } private static int CountAllNodes(IReadOnlyList fields) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index a0e90cbbc7c61..682ebec323dc0 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -185,6 +185,7 @@ public void SlicePrimitiveArrayWithNulls() TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new DateTime(2019, 1, 3))); TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); + TestSlice(x => x.AppendNull().AppendNull().AppendNull()); // All nulls static void TestNumberSlice() where T : struct, INumber @@ -314,6 +315,8 @@ private void ValidateArrays(PrimitiveArray slicedArray) .SequenceEqual(slicedArray.Values)); Assert.Equal(baseArray.GetValue(slicedArray.Offset), slicedArray.GetValue(0)); + + ValidateNullCount(slicedArray); } private void ValidateArrays(BooleanArray slicedArray) @@ -333,6 +336,8 @@ private void ValidateArrays(BooleanArray slicedArray) #pragma warning disable CS0618 Assert.Equal(baseArray.GetBoolean(slicedArray.Offset), slicedArray.GetBoolean(0)); #pragma warning restore CS0618 + + ValidateNullCount(slicedArray); } private void ValidateArrays(BinaryArray slicedArray) @@ -347,6 +352,16 @@ private void ValidateArrays(BinaryArray slicedArray) .SequenceEqual(slicedArray.ValueOffsets)); Assert.True(baseArray.GetBytes(slicedArray.Offset).SequenceEqual(slicedArray.GetBytes(0))); + + ValidateNullCount(slicedArray); + } + + private static void ValidateNullCount(IArrowArray slicedArray) + { + var expectedNullCount = Enumerable.Range(0, slicedArray.Length) + .Select(i => slicedArray.IsNull(i) ? 1 : 0) + .Sum(); + Assert.Equal(expectedNullCount, slicedArray.NullCount); } } } diff --git a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs index 1fb5cf2415c68..45fed722a745c 100644 --- a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs @@ -25,6 +25,46 @@ public class UnionArrayTests [InlineData(UnionMode.Sparse)] [InlineData(UnionMode.Dense)] public void UnionArray_IsNull(UnionMode mode) + { + var (array, expectedNull) = BuildUnionArray(mode, 100); + + for (var i = 0; i < array.Length; ++i) + { + Assert.Equal(expectedNull[i], array.IsNull(i)); + Assert.Equal(!expectedNull[i], array.IsValid(i)); + } + } + + [Theory] + [InlineData(UnionMode.Sparse)] + [InlineData(UnionMode.Dense)] + public void UnionArray_Slice(UnionMode mode) + { + var (array, expectedNull) = BuildUnionArray(mode, 10); + + for (var offset = 0; offset < array.Length; ++offset) + { + for (var length = 0; length < array.Length - offset; ++length) + { + var slicedArray = ArrowArrayFactory.Slice(array, offset, length); + + var nullCount = 0; + for (var i = 0; i < slicedArray.Length; ++i) + { + // TODO: Shouldn't need to add offset in IsNull/IsValid calls, + // see https://github.com/apache/arrow/issues/41140 + Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(offset + i)); + Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(offset + i)); + nullCount += expectedNull[offset + i] ? 1 : 0; + } + + Assert.True(nullCount == slicedArray.NullCount, $"offset = {offset}, length = {length}"); + Assert.Equal(nullCount, slicedArray.NullCount); + } + } + } + + private static (UnionArray array, bool[] isNull) BuildUnionArray(UnionMode mode, int length) { var fields = new Field[] { @@ -34,7 +74,6 @@ public void UnionArray_IsNull(UnionMode mode) var typeIds = fields.Select(f => (int) f.DataType.TypeId).ToArray(); var type = new UnionType(fields, typeIds, mode); - const int length = 100; var nullCount = 0; var field0Builder = new Int32Array.Builder(); var field1Builder = new FloatArray.Builder(); @@ -44,7 +83,7 @@ public void UnionArray_IsNull(UnionMode mode) for (var i = 0; i < length; ++i) { - var isNull = i % 5 == 0; + var isNull = i % 3 == 0; expectedNull[i] = isNull; nullCount += isNull ? 1 : 0; @@ -104,10 +143,6 @@ public void UnionArray_IsNull(UnionMode mode) ? new DenseUnionArray(type, length, children, typeIdsBuffer, valuesOffsetBuffer, nullCount) : new SparseUnionArray(type, length, children, typeIdsBuffer, nullCount); - for (var i = 0; i < length; ++i) - { - Assert.Equal(expectedNull[i], array.IsNull(i)); - Assert.Equal(!expectedNull[i], array.IsValid(i)); - } + return (array, expectedNull); } }