Skip to content

Commit

Permalink
Merge branch 'master' into fork/ErikApption/erik/nullable
Browse files Browse the repository at this point in the history
# Conflicts:
#	src/Parquet/Parquet.csproj
  • Loading branch information
aloneguid committed Sep 6, 2024
2 parents 6c45706 + 546db08 commit bab2d5c
Show file tree
Hide file tree
Showing 19 changed files with 352 additions and 35 deletions.
2 changes: 2 additions & 0 deletions docs/release-history.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
### Improvements
- File merger utility has `Stream` overload for non file-based operations.
- File merger utility has extra overload to choose compression codec and specify custom metadata, by @dxdjgl in #519.
- Timestamp logical type is supported, by @cliedeman in #521.
- More data types support encoding using Dictionary encoding, by @EamonHetherton in #531.

## 4.24.0

Expand Down
20 changes: 10 additions & 10 deletions src/Parquet.Floor/Parquet.Floor.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,20 @@

<ItemGroup>
<!--Condition below is needed to remove Avalonia.Diagnostics package from build output in Release configuration.-->
<PackageReference Condition="'$(Configuration)' == 'Debug'" Include="Avalonia.Diagnostics" Version="11.0.10" />
<PackageReference Condition="'$(Configuration)' == 'Debug'" Include="Avalonia.Diagnostics" Version="11.1.3" />

<PackageReference Include="ActiproSoftware.Controls.Avalonia" Version="24.1.0" />
<PackageReference Include="ActiproSoftware.Controls.Avalonia.Themes.DataGrid" Version="24.1.0" />
<PackageReference Include="Avalonia.Desktop" Version="11.0.10" />
<PackageReference Include="Avalonia" Version="11.0.10" />
<PackageReference Include="Avalonia.Controls.DataGrid" Version="11.0.10" />
<PackageReference Include="ActiproSoftware.Controls.Avalonia" Version="24.2.0" />
<PackageReference Include="ActiproSoftware.Controls.Avalonia.Themes.DataGrid" Version="24.2.0" />
<PackageReference Include="Avalonia.Desktop" Version="11.1.3" />
<PackageReference Include="Avalonia" Version="11.1.3" />
<PackageReference Include="Avalonia.Controls.DataGrid" Version="11.1.3" />
<PackageReference Include="Avalonia.Controls.TreeDataGrid" Version="11.0.10" />
<PackageReference Include="Avalonia.Themes.Fluent" Version="11.0.10" />
<PackageReference Include="Avalonia.Themes.Fluent" Version="11.1.3" />
<PackageReference Include="CommunityToolkit.Mvvm" Version="8.2.2" />
<PackageReference Include="Config.Net" Version="5.2.0" />
<PackageReference Include="CsvHelper" Version="32.0.3" />
<PackageReference Include="Projektanker.Icons.Avalonia" Version="9.3.0" />
<PackageReference Include="Projektanker.Icons.Avalonia.FontAwesome" Version="9.3.0" />
<PackageReference Include="CsvHelper" Version="33.0.1" />
<PackageReference Include="Projektanker.Icons.Avalonia" Version="9.4.0" />
<PackageReference Include="Projektanker.Icons.Avalonia.FontAwesome" Version="9.4.0" />
<PackageReference Include="Stowage" Version="2.0.1" />

</ItemGroup>
Expand Down
4 changes: 2 additions & 2 deletions src/Parquet.PerfRunner/Parquet.PerfRunner.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
<PackageReference Include="ParquetSharp" Version="15.0.2.1" />
<PackageReference Include="BenchmarkDotNet" Version="0.14.0" />
<PackageReference Include="ParquetSharp" Version="16.1.0" />
</ItemGroup>

<ItemGroup>
Expand Down
6 changes: 3 additions & 3 deletions src/Parquet.Test/Parquet.Test.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="System.Linq.Async" Version="6.0.1" />
<PackageReference Include="System.ValueTuple" Version="4.5.0" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.10.0" />
<PackageReference Include="xunit" Version="2.8.1" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.8.1">
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.11.1" />
<PackageReference Include="xunit" Version="2.9.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.8.2">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
Expand Down
2 changes: 1 addition & 1 deletion src/Parquet.Test/Schema/SchemaTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ public async Task Column_called_root() {

[Fact]
public async Task ReadSchemaActuallyEqualToWriteSchema() {
var field = new DateTimeDataField("Date", DateTimeFormat.DateAndTime, true);
var field = new DateTimeDataField("Date", DateTimeFormat.DateAndTime, isNullable: true);
var schema = new ParquetSchema(field);

using var memoryStream = new MemoryStream();
Expand Down
12 changes: 12 additions & 0 deletions src/Parquet.Test/Serialisation/SchemaReflectorTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,9 @@ class DatesPoco {
[ParquetTimestamp]
public DateTime TimestampDate { get; set; }

[ParquetTimestamp(useLogicalTimestamp: true)]
public DateTime LogicalTimestampDate { get; set; }

[ParquetTimestamp]
public DateTime? NullableTimestampDate { get; set; }

Expand Down Expand Up @@ -383,6 +386,15 @@ public void Type_DateTime_Timestamp() {
Assert.Equal(DateTimeFormat.DateAndTime, ((DateTimeDataField)df).DateTimeFormat);
}

[Fact]
public void Type_DateTime_LogicalTimestamp() {
ParquetSchema s = typeof(DatesPoco).GetParquetSchema(true);

DataField df = s.FindDataField(nameof(DatesPoco.LogicalTimestampDate));
Assert.True(df is DateTimeDataField);
Assert.Equal(DateTimeFormat.Timestamp, ((DateTimeDataField)df).DateTimeFormat);
}

[Fact]
public void Type_DateTime_TimestampNullable() {
ParquetSchema s = typeof(DatesPoco).GetParquetSchema(true);
Expand Down
11 changes: 10 additions & 1 deletion src/Parquet.Test/Types/EndToEndTypeTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ public class EndToEndTypeTest : TestBase {
["dateTime local kind"] = (new DataField<DateTime>("dateTime unknown kind"), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)),
["impala date local kind"] = (new DateTimeDataField("dateImpala unknown kind", DateTimeFormat.Impala), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)),
["dateDateAndTime local kind"] = (new DateTimeDataField("dateDateAndTime unknown kind", DateTimeFormat.DateAndTime), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)),
["timestamp utc kind"] = (new DateTimeDataField("timestamp utc kind", DateTimeFormat.Timestamp, true), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Utc)),
["timestamp local kind"] = (new DateTimeDataField("timestamp local kind", DateTimeFormat.Timestamp, false), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)),
// don't want any excess info in the offset INT32 doesn't contain or care about this data
["dateDate"] = (new DateTimeDataField("dateDate", DateTimeFormat.Date), DateTime.UtcNow.RoundToDay()),
#if !NETCOREAPP3_1
Expand Down Expand Up @@ -84,7 +86,7 @@ public class EndToEndTypeTest : TestBase {
["unsigned long max value"] = (new DataField<ulong>("ulong"), ulong.MaxValue),

["nullable decimal"] = (new DecimalDataField("decimal?", 4, 1, true, true), null),
["nullable DateTime"] = (new DateTimeDataField("DateTime?", DateTimeFormat.DateAndTime, true), null),
["nullable DateTime"] = (new DateTimeDataField("DateTime?", DateTimeFormat.DateAndTime, isNullable: true), null),

["bool"] = (new DataField<bool>("bool"), true),
["nullable bool"] = (new DataField<bool?>("bool?"), new bool?(true)),
Expand Down Expand Up @@ -124,6 +126,8 @@ public class EndToEndTypeTest : TestBase {
[InlineData("dateTime local kind")]
[InlineData("impala date local kind")]
[InlineData("dateDateAndTime local kind")]
[InlineData("timestamp utc kind")]
[InlineData("timestamp local kind")]
[InlineData("dateDate")]
#if !NETCOREAPP3_1
[InlineData("dateOnly")]
Expand Down Expand Up @@ -174,6 +178,11 @@ public async Task Type_writes_and_reads_end_to_end(string name) {
equal = true;
else if(actual.GetType().IsArrayOf<byte>() && input.expectedValue != null) {
equal = ((byte[])actual).SequenceEqual((byte[])input.expectedValue);
} else if(input.field is DateTimeDataField { DateTimeFormat: DateTimeFormat.Timestamp }) {
var dtActual = (DateTime)actual;
var dtExpected = (DateTime)input.expectedValue!;
Assert.Equal(dtExpected.Kind, dtActual.Kind);
equal = dtActual.Equals(dtExpected);
} else if(actual.GetType() == typeof(DateTime)) {
var dtActual = (DateTime)actual;
Assert.Equal(DateTimeKind.Utc, dtActual.Kind);
Expand Down
4 changes: 2 additions & 2 deletions src/Parquet/Data/DataColumnStatistics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ public DataColumnStatistics(long? nullCount, long? distinctCount, object? minVal
internal Statistics ToThriftStatistics(SchemaElement tse) {

if(!ParquetPlainEncoder.TryEncode(MinValue, tse, out byte[]? min)) {
throw new ArgumentException($"cound not encode {MinValue}", nameof(MinValue));
throw new ArgumentException($"could not encode {MinValue}", nameof(MinValue));
}

if(!ParquetPlainEncoder.TryEncode(MaxValue, tse, out byte[]? max)) {
throw new ArgumentException($"cound not encode {MinValue}", nameof(MinValue));
throw new ArgumentException($"could not encode {MinValue}", nameof(MinValue));
}

return new Statistics {
Expand Down
116 changes: 111 additions & 5 deletions src/Parquet/Encodings/ParquetDictionaryEncoder.cs
Original file line number Diff line number Diff line change
@@ -1,22 +1,69 @@
using System;
using System.Buffers;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;

namespace Parquet.Encodings {
static class ParquetDictionaryEncoder {

public static bool TryExtractDictionary(Type elementType,
Array data, int offset, int count,
out Array? dictionaryArray,
out int[]? rentedIndexes,
[NotNullWhen(true)] out Array? dictionaryArray,
[NotNullWhen(true)] out int[]? rentedIndexes,
double threshold = 0.8) {

dictionaryArray = null;
rentedIndexes = null;

if(elementType != typeof(string) || count == 0)
if(count == 0) {
return false;
}
if(elementType == typeof(string)) {
//Initially at least we will leave the existing string dictionary code path intact as there are some
//string specific optimizations in place.
return EncodeStrings(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(DateTime)) {
return Encode<DateTime>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(decimal)) {
return Encode<decimal>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(byte)) {
return Encode<byte>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(short)) {
return Encode<short>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(ushort)) {
return Encode<ushort>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(int)) {
return Encode<int>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(uint)) {
return Encode<uint>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(long)) {
return Encode<long>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(ulong)) {
return Encode<ulong>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(float)) {
return Encode<float>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
if(elementType == typeof(double)) {
return Encode<double>(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold);
}
return false;
}
private static bool EncodeStrings(Array data,
int offset,
int count,
[NotNullWhen(true)] ref Array? dictionaryArray,
[NotNullWhen(true)] ref int[]? rentedIndexes,
double threshold) {

string[] src = (string[])data;
HashSet<string> distinctSet = Distinct(src, offset, count);
Expand All @@ -37,7 +84,6 @@ public static bool TryExtractDictionary(Type elementType,

return true;
}

private static HashSet<string> Distinct(string[] strings, int offset, int count) {

/*
Expand All @@ -58,5 +104,65 @@ private static HashSet<string> Distinct(string[] strings, int offset, int count)

return hs;
}

private static bool Encode<T>(Array data,
int offset,
int count,
[NotNullWhen(true)] ref Array? dictionaryArray,
[NotNullWhen(true)] ref int[]? rentedIndexes,
double threshold) where T : notnull {
var src = (T[])data;

//TODO: calculate some more statistics beyond uniquness like run lengths, index size and index bitwidth to determine if there is value
//in dictionary encoding this data vs PLAIN encoding
//e.g. Dictionary encoding for byte values could be worse than plain even with 50% uniqueness depending on run lengths and value spread
Dictionary<T, (int Count, int MaxRunLength, int Index)> distinctSet = Distinct(src, offset, count, EqualityComparer<T>.Default);
double uniquenessFactor = distinctSet.Count / (double)count;
if(uniquenessFactor > threshold)
return false;

T[] dictionary = distinctSet.Keys.ToArray();
dictionaryArray = dictionary;

rentedIndexes = ArrayPool<int>.Shared.Rent(count);
for(int isrc = offset, itgt = 0; isrc < offset + count; isrc++, itgt++)
rentedIndexes[itgt] = distinctSet[src[isrc]].Index;

return true;
}

private static Dictionary<T, (int Count, int MaxRunLength, int Index)> Distinct<T>(T[] values, int offset, int count, EqualityComparer<T> equalityComparer)
where T : notnull {
if(values.Length == 0) {
return new(0);
}
var dict = new Dictionary<T, (int Count, int MaxRunLength, int Index)>(values.Length);
T previous = values[offset];
int runLength = 1;
int index = 0;
dict[previous] = (1, 1, index++);
for(int i = offset + 1; i < offset + count; i++) {
T key = values[i];
if(equalityComparer.Equals(key, previous)) {
(int Count, int MaxRunLength, int Index) previousData = dict[key];
dict[key] = (previousData.Count + 1, previousData.MaxRunLength, previousData.Index);
} else {
(int Count, int MaxRunLength, int Index) previousData = dict[previous];
if(previousData.MaxRunLength < runLength) {
dict[previous] = (previousData.Count, runLength, previousData.Index);
}
if(dict.TryGetValue(key, out (int Count, int MaxRunLength, int Index) value)) {
dict[key] = (value.Count + 1, value.MaxRunLength, value.Index);
} else {
dict[key] = (1, 1, index++);
}

runLength = 1;
previous = key;
}

}
return dict;
}
}
}
Loading

0 comments on commit bab2d5c

Please sign in to comment.