Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensuring delta encoding footer blocks are complete And Handle Overflow #387

Merged
merged 2 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions src/Parquet.Test/Encodings/DeltaBinaryPackedEncodingTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,40 @@ public void EncodeAndDecodeInt64_1_100000() {

Assert.Equal(input, des);
}

[Fact]
public void EncodeAndDecodeInt32_Random_Overflow() {
    // A fixed seed keeps the generated values identical from run to run.
    var rng = new Random(0);
    const int valueCount = 1000;

    // Values are drawn from [int.MinValue, int.MaxValue); judging by the test name this is
    // meant to provoke overflow when neighbouring values are subtracted.
    int[] expected = new int[valueCount];
    for(int idx = 0; idx < valueCount; idx++) {
        expected[idx] = rng.Next(int.MinValue, int.MaxValue);
    }

    // Round-trip: encode into an in-memory stream, then decode from its bytes.
    using var encoded = new MemoryStream();
    DeltaBinaryPackedEncoder.Encode(expected, 0, expected.Length, encoded);

    int[] actual = new int[expected.Length];
    int decodeResult = DeltaBinaryPackedEncoder.Decode(encoded.ToArray(), actual, 0, expected.Length, out int consumed);

    Assert.Equal(expected, actual);
}

[Fact]
public void EncodeAndDecodeInt64_Random_Overflow() {
    // A fixed seed keeps the generated values identical from run to run.
    var rng = new Random(0);
    const int valueCount = 1000;

    // Each value is built from 8 random bytes, so any 64-bit pattern can occur;
    // judging by the test name this is meant to provoke overflow between neighbours.
    long[] expected = new long[valueCount];
    byte[] scratch = new byte[8];
    for(int idx = 0; idx < valueCount; idx++) {
        rng.NextBytes(scratch);
        expected[idx] = BitConverter.ToInt64(scratch, 0);
    }

    // Round-trip: encode into an in-memory stream, then decode from its bytes.
    using var encoded = new MemoryStream();
    DeltaBinaryPackedEncoder.Encode(expected, 0, expected.Length, encoded);

    long[] actual = new long[expected.Length];
    int decodeResult = DeltaBinaryPackedEncoder.Decode(encoded.ToArray(), actual, 0, expected.Length, out int consumed);

    Assert.Equal(expected, actual);
}
}
}
14 changes: 10 additions & 4 deletions src/Parquet/Encodings/DeltaBinaryPackedEncoder.Variations.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ private static void FlushIntBlock(Span<int> block, int minDelta,
if(count < 0)
break;

int max = block.Slice(offset, count).Max();
bitWidths[bwi] = (byte)max.GetBitWidth();
int bitwidth = block.Slice(offset, count).CalculateBitWidth();
bitWidths[bwi] = (byte)bitwidth;
}

// write bit widths
Expand Down Expand Up @@ -86,6 +86,9 @@ private static void EncodeInt(ReadOnlySpan<int> data, Stream destination,
}

if(blockCount > 0) {
while(blockCount < blockSize) {
block[blockCount++] = minDelta;
}
FlushIntBlock(block.Slice(0, blockCount), minDelta, destination, miniblockCount, miniblockSize);
}
}
Expand Down Expand Up @@ -179,8 +182,8 @@ private static void FlushLongBlock(Span<long> block, long minDelta,
if(count < 0)
break;

long max = block.Slice(offset, count).Max();
bitWidths[bwi] = (byte)max.GetBitWidth();
int bitwidth = block.Slice(offset, count).CalculateBitWidth();
bitWidths[bwi] = (byte)bitwidth;
}

// write bit widths
Expand Down Expand Up @@ -240,6 +243,9 @@ private static void EncodeLong(ReadOnlySpan<long> data, Stream destination,
}

if(blockCount > 0) {
while(blockCount < blockSize) {
block[blockCount++] = minDelta;
}
FlushLongBlock(block.Slice(0, blockCount), minDelta, destination, miniblockCount, miniblockSize);
}
}
Expand Down
7 changes: 5 additions & 2 deletions src/Parquet/Encodings/DeltaBinaryPackedEncoder.Variations.tt
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ namespace Parquet.Encodings {
if(count < 0)
break;

<#=nt#> max = block.Slice(offset, count).Max();
bitWidths[bwi] = (byte)max.GetBitWidth();
int bitwidth = block.Slice(offset, count).CalculateBitWidth();
bitWidths[bwi] = (byte)bitwidth;
}

// write bit widths
Expand Down Expand Up @@ -95,6 +95,9 @@ namespace Parquet.Encodings {
}

if(blockCount > 0) {
while(blockCount < blockSize) {
block[blockCount++] = minDelta;
}
Flush<#=ntCap#>Block(block.Slice(0, blockCount), minDelta, destination, miniblockCount, miniblockSize);
}
}
Expand Down
19 changes: 19 additions & 0 deletions src/Parquet/Encodings/DeltaBinaryPackedEncoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,24 @@ public static int Decode(Span<byte> s, Array dest, int destOffset, int valueCoun

throw new NotSupportedException($"element type {elementType} is not supported");
}


/// <summary>
/// Returns the 1-based position of the highest set bit found in any element of
/// <paramref name="span"/>: all elements are OR-ed together and the leading zeros of the
/// combined value are subtracted from 32.
/// </summary>
/// <param name="span">Values to inspect.</param>
/// <returns>32 minus the leading-zero count of the OR of all elements.</returns>
static int CalculateBitWidth(this Span<int> span) {
    int combined = 0;
    foreach(int value in span) {
        combined |= value;
    }
    int leadingZeros = combined.NumberOfLeadingZerosInt();
    return 32 - leadingZeros;
}

/// <summary>
/// Returns the 1-based position of the highest set bit found in any element of
/// <paramref name="span"/>: all elements are OR-ed together and the leading zeros of the
/// combined value are subtracted from 64.
/// </summary>
/// <param name="span">Values to inspect.</param>
/// <returns>64 minus the leading-zero count of the OR of all elements.</returns>
static int CalculateBitWidth(this Span<long> span) {
    long combined = 0;
    foreach(long value in span) {
        combined |= value;
    }
    int leadingZeros = combined.NumberOfLeadingZerosLong();
    return 64 - leadingZeros;
}
}
}
19 changes: 19 additions & 0 deletions src/Parquet/Extensions/EncodingExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,5 +97,24 @@ public static void WriteULEB128(this Stream destination, ulong value) {

#endregion

#region Leading Zeros
/// <summary>
/// Counts the zero bits that precede the highest set bit in the 32-bit pattern of
/// <paramref name="num"/>.
/// </summary>
/// <param name="num">Value to inspect.</param>
/// <returns>32 for zero, 0 for any negative value, otherwise a count between 1 and 31.</returns>
public static int NumberOfLeadingZerosInt(this int num) {
    if(num == 0) {
        return 32;
    }
    if(num < 0) {
        // The top bit is set, so nothing precedes it.
        return 0;
    }

    // Binary search for the highest set bit: test the upper half of the remaining
    // window and, when it is populated, shift it down and shrink the count.
    int zeros = 31;
    int remaining = num;
    for(int shift = 16; shift >= 2; shift >>= 1) {
        if(remaining >= (1 << shift)) {
            zeros -= shift;
            remaining >>>= shift;
        }
    }

    // remaining is now 1, 2 or 3; its second bit decides the final position.
    return zeros - (remaining >>> 1);
}

/// <summary>
/// Counts the zero bits that precede the highest set bit in the 64-bit pattern of
/// <paramref name="num"/>, by delegating to the 32-bit counter on the relevant half.
/// </summary>
/// <param name="num">Value to inspect.</param>
/// <returns>
/// The count for the upper 32 bits when any of them is set; otherwise 32 plus the
/// count for the lower 32 bits.
/// </returns>
public static int NumberOfLeadingZerosLong(this long num) {
    int high = (int)(num >>> 32);
    if(high != 0) {
        return high.NumberOfLeadingZerosInt();
    }

    // Upper half is empty: all 32 of its bits are leading zeros.
    int low = (int)num;
    return 32 + low.NumberOfLeadingZerosInt();
}
#endregion

}
}