From 873eb737bfd1d6f007e5caff3802285a6da45739 Mon Sep 17 00:00:00 2001 From: IG Date: Fri, 6 Sep 2024 13:47:08 +0100 Subject: [PATCH 1/6] upgrade packages --- src/Parquet.Floor/Parquet.Floor.csproj | 20 +++++++++---------- .../Parquet.PerfRunner.csproj | 4 ++-- src/Parquet.Test/Parquet.Test.csproj | 6 +++--- src/Parquet/Parquet.csproj | 6 +++--- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/Parquet.Floor/Parquet.Floor.csproj b/src/Parquet.Floor/Parquet.Floor.csproj index 28c60658..6a23f7e5 100644 --- a/src/Parquet.Floor/Parquet.Floor.csproj +++ b/src/Parquet.Floor/Parquet.Floor.csproj @@ -34,20 +34,20 @@ - + - - - - - + + + + + - + - - - + + + diff --git a/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj b/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj index 7e487c1e..e0ca1d1f 100644 --- a/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj +++ b/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj @@ -11,8 +11,8 @@ - - + + diff --git a/src/Parquet.Test/Parquet.Test.csproj b/src/Parquet.Test/Parquet.Test.csproj index 3f9fdf1b..e0c9e937 100644 --- a/src/Parquet.Test/Parquet.Test.csproj +++ b/src/Parquet.Test/Parquet.Test.csproj @@ -20,9 +20,9 @@ - - - + + + runtime; build; native; contentfiles; analyzers; buildtransitive all diff --git a/src/Parquet/Parquet.csproj b/src/Parquet/Parquet.csproj index 033cc58e..6f46c8f2 100644 --- a/src/Parquet/Parquet.csproj +++ b/src/Parquet/Parquet.csproj @@ -53,17 +53,17 @@ - + - + - + From 2435b8d628073525923e57fb5b33892387d4f717 Mon Sep 17 00:00:00 2001 From: Ciaran Liedeman Date: Sun, 16 Jun 2024 00:06:45 +0200 Subject: [PATCH 2/6] feat: Logical Timestamp --- src/Parquet.Test/Schema/SchemaTest.cs | 2 +- .../Serialisation/SchemaReflectorTest.cs | 12 +++++ src/Parquet.Test/Types/EndToEndTypeTest.cs | 11 ++++- src/Parquet/Encodings/ParquetPlainEncoder.cs | 49 ++++++++++++++++++- src/Parquet/Encodings/SchemaEncoder.cs | 26 +++++++++- src/Parquet/Extensions/OtherExtensions.cs | 9 ++++ src/Parquet/Schema/DateTimeDataField.cs | 28 ++++++++++- src/Parquet/Schema/DateTimeFormat.cs | 7 ++- src/Parquet/Schema/DateTimeTimeUnit.cs | 19 +++++++ .../Attributes/ParquetTimestampAttribute.cs | 17 ++++++- src/Parquet/Serialization/TypeExtensions.cs | 44 ++++++++++++++++- 11 files changed, 215 insertions(+), 9 deletions(-) create mode 100644 src/Parquet/Schema/DateTimeTimeUnit.cs diff --git a/src/Parquet.Test/Schema/SchemaTest.cs b/src/Parquet.Test/Schema/SchemaTest.cs index 3aba6991..de9b87a3 100644 --- a/src/Parquet.Test/Schema/SchemaTest.cs +++ b/src/Parquet.Test/Schema/SchemaTest.cs @@ -293,7 +293,7 @@ public async Task Column_called_root() { [Fact] public async Task ReadSchemaActuallyEqualToWriteSchema() { - var field = new DateTimeDataField("Date", DateTimeFormat.DateAndTime, true); + var field = new DateTimeDataField("Date", DateTimeFormat.DateAndTime, isNullable: true); var schema = new ParquetSchema(field); using var memoryStream = new MemoryStream(); diff --git a/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs b/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs index 3f803374..8eb002d7 100644 --- a/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs +++ b/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs @@ -323,6 +323,9 @@ class DatesPoco { [ParquetTimestamp] public DateTime TimestampDate { get; set; } + [ParquetTimestamp(useLogicalTimestamp: true)] + public DateTime LogicalTimestampDate { get; set; } + [ParquetTimestamp] public DateTime? 
NullableTimestampDate { get; set; } @@ -383,6 +386,15 @@ public void Type_DateTime_Timestamp() { Assert.Equal(DateTimeFormat.DateAndTime, ((DateTimeDataField)df).DateTimeFormat); } + [Fact] + public void Type_DateTime_LogicalTimestamp() { + ParquetSchema s = typeof(DatesPoco).GetParquetSchema(true); + + DataField df = s.FindDataField(nameof(DatesPoco.LogicalTimestampDate)); + Assert.True(df is DateTimeDataField); + Assert.Equal(DateTimeFormat.Timestamp, ((DateTimeDataField)df).DateTimeFormat); + } + [Fact] public void Type_DateTime_TimestampNullable() { ParquetSchema s = typeof(DatesPoco).GetParquetSchema(true); diff --git a/src/Parquet.Test/Types/EndToEndTypeTest.cs b/src/Parquet.Test/Types/EndToEndTypeTest.cs index 8403ef9a..6f77f48f 100644 --- a/src/Parquet.Test/Types/EndToEndTypeTest.cs +++ b/src/Parquet.Test/Types/EndToEndTypeTest.cs @@ -49,6 +49,8 @@ public class EndToEndTypeTest : TestBase { ["dateTime local kind"] = (new DataField("dateTime unknown kind"), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)), ["impala date local kind"] = (new DateTimeDataField("dateImpala unknown kind", DateTimeFormat.Impala), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)), ["dateDateAndTime local kind"] = (new DateTimeDataField("dateDateAndTime unknown kind", DateTimeFormat.DateAndTime), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)), + ["timestamp utc kind"] = (new DateTimeDataField("timestamp utc kind", DateTimeFormat.Timestamp, true), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Utc)), + ["timestamp local kind"] = (new DateTimeDataField("timestamp local kind", DateTimeFormat.Timestamp, false), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)), // don't want any excess info in the offset INT32 doesn't contain or care about this data ["dateDate"] = (new DateTimeDataField("dateDate", DateTimeFormat.Date), DateTime.UtcNow.RoundToDay()), #if !NETCOREAPP3_1 @@ -84,7 +86,7 @@ public class EndToEndTypeTest : TestBase { ["unsigned long max value"] = (new DataField("ulong"), ulong.MaxValue), ["nullable decimal"] = (new DecimalDataField("decimal?", 4, 1, true, true), null), - ["nullable DateTime"] = (new DateTimeDataField("DateTime?", DateTimeFormat.DateAndTime, true), null), + ["nullable DateTime"] = (new DateTimeDataField("DateTime?", DateTimeFormat.DateAndTime, isNullable: true), null), ["bool"] = (new DataField("bool"), true), ["nullable bool"] = (new DataField("bool?"), new bool?(true)), @@ -124,6 +126,8 @@ public class EndToEndTypeTest : TestBase { [InlineData("dateTime local kind")] [InlineData("impala date local kind")] [InlineData("dateDateAndTime local kind")] + [InlineData("timestamp utc kind")] + [InlineData("timestamp local kind")] [InlineData("dateDate")] #if !NETCOREAPP3_1 [InlineData("dateOnly")] @@ -174,6 +178,11 @@ public async Task Type_writes_and_reads_end_to_end(string name) { equal = true; else if(actual.GetType().IsArrayOf() && input.expectedValue != null) { equal = ((byte[])actual).SequenceEqual((byte[])input.expectedValue); + } else if(input.field is DateTimeDataField { DateTimeFormat: DateTimeFormat.Timestamp }) { + var dtActual = (DateTime)actual; + var dtExpected = (DateTime)input.expectedValue!; + Assert.Equal(dtExpected.Kind, dtActual.Kind); + equal = dtActual.Equals(dtExpected); } else if(actual.GetType() == typeof(DateTime)) { var dtActual = (DateTime)actual; Assert.Equal(DateTimeKind.Utc, dtActual.Kind); diff --git a/src/Parquet/Encodings/ParquetPlainEncoder.cs b/src/Parquet/Encodings/ParquetPlainEncoder.cs index 
1658c22a..e04620bb 100644 --- a/src/Parquet/Encodings/ParquetPlainEncoder.cs +++ b/src/Parquet/Encodings/ParquetPlainEncoder.cs @@ -1,5 +1,6 @@ using System; using System.Buffers; +using System.Diagnostics; using System.IO; using System.Linq; using System.Numerics; @@ -822,7 +823,27 @@ public static void Encode(ReadOnlySpan data, Stream destination, Schem } break; case TType.INT64: - if(tse.ConvertedType == ConvertedType.TIMESTAMP_MILLIS) { + if(tse.LogicalType?.TIMESTAMP is not null) { + foreach(DateTime element in data) { + if(tse.LogicalType.TIMESTAMP.Unit.MILLIS is not null) { + long unixTime = element.ToUnixMilliseconds(); + byte[] raw = BitConverter.GetBytes(unixTime); + destination.Write(raw, 0, raw.Length); +#if NET7_0_OR_GREATER + } else if (tse.LogicalType.TIMESTAMP.Unit.MICROS is not null) { + long unixTime = element.ToUtc().ToUnixMicroseconds(); + byte[] raw = BitConverter.GetBytes(unixTime); + destination.Write(raw, 0, raw.Length); + } else if (tse.LogicalType.TIMESTAMP.Unit.NANOS is not null) { + long unixTime = element.ToUtc().ToUnixNanoseconds(); + byte[] raw = BitConverter.GetBytes(unixTime); + destination.Write(raw, 0, raw.Length); +#endif + } else { + throw new ParquetException($"Unexpected TimeUnit: {tse.LogicalType.TIMESTAMP.Unit}"); + } + } + } else if(tse.ConvertedType == ConvertedType.TIMESTAMP_MILLIS) { foreach(DateTime element in data) { long unixTime = element.ToUtc().ToUnixMilliseconds(); byte[] raw = BitConverter.GetBytes(unixTime); @@ -896,7 +917,31 @@ public static int Decode(Span source, Span data, SchemaElement t long[] longs = ArrayPool.Shared.Rent(data.Length); try { int longsRead = Decode(source, longs.AsSpan(0, data.Length)); - if(tse.ConvertedType == ConvertedType.TIMESTAMP_MICROS) { + if(tse.LogicalType?.TIMESTAMP is not null) { + for(int i = 0; i < longsRead; i++) { + if(tse.LogicalType.TIMESTAMP.Unit.MILLIS is not null) { + DateTime dt = longs[i].AsUnixMillisecondsInDateTime(); + dt = DateTime.SpecifyKind(dt, tse.LogicalType.TIMESTAMP.IsAdjustedToUTC ? DateTimeKind.Utc : DateTimeKind.Local); + data[i] = dt; + } else if(tse.LogicalType.TIMESTAMP.Unit.MICROS is not null) { + long lv = longs[i]; + long microseconds = lv % 1000; + lv /= 1000; + DateTime dt = lv.AsUnixMillisecondsInDateTime().AddTicks(microseconds * 10); + dt = DateTime.SpecifyKind(dt, tse.LogicalType.TIMESTAMP.IsAdjustedToUTC ? DateTimeKind.Utc : DateTimeKind.Local); + data[i] = dt; + } else if(tse.LogicalType.TIMESTAMP.Unit.NANOS is not null) { + long lv = longs[i]; + long nanoseconds = lv % 1000000; + lv /= 1000000; + DateTime dt = lv.AsUnixMillisecondsInDateTime().AddTicks(nanoseconds / 100); // 1 tick = 100 nanoseconds + dt = DateTime.SpecifyKind(dt, tse.LogicalType.TIMESTAMP.IsAdjustedToUTC ? 
DateTimeKind.Utc : DateTimeKind.Local); + data[i] = dt; + } else { + throw new ParquetException($"Unexpected TimeUnit: {tse.LogicalType.TIMESTAMP.Unit}"); + } + } + } else if(tse.ConvertedType == ConvertedType.TIMESTAMP_MICROS) { for(int i = 0; i < longsRead; i++) { long lv = longs[i]; long microseconds = lv % 1000; diff --git a/src/Parquet/Encodings/SchemaEncoder.cs b/src/Parquet/Encodings/SchemaEncoder.cs index 34c8cb84..866ab8e2 100644 --- a/src/Parquet/Encodings/SchemaEncoder.cs +++ b/src/Parquet/Encodings/SchemaEncoder.cs @@ -6,6 +6,7 @@ using Parquet.File.Values.Primitives; using Parquet.Meta; using Parquet.Schema; +using Parquet.Serialization; using SType = System.Type; using Type = Parquet.Meta.Type; @@ -201,7 +202,7 @@ private static bool TryBuildDataField(SchemaElement se, ParquetOptions options, _ => typeof(int) }, Type.INT32 => typeof(int), - + Type.INT64 when se.LogicalType?.TIMESTAMP != null => typeof(DateTime), Type.INT64 when se.ConvertedType != null => se.ConvertedType switch { ConvertedType.INT_64 => typeof(long), ConvertedType.UINT_64 => typeof(ulong), @@ -267,6 +268,10 @@ private static DataField GetDecimalDataField(SchemaElement se) => se.Scale.GetValueOrDefault(DecimalFormatDefaults.DefaultScale)); private static DataField GetDateTimeDataField(SchemaElement se) { + if(se.LogicalType is not null) + if(se.LogicalType.TIMESTAMP is not null) + return new DateTimeDataField(se.Name, DateTimeFormat.Timestamp, isAdjustedToUTC: se.LogicalType.TIMESTAMP.IsAdjustedToUTC, unit: se.LogicalType.TIMESTAMP.Unit.Convert()); + switch(se.ConvertedType) { case ConvertedType.TIMESTAMP_MILLIS: if(se.Type == Type.INT64) @@ -465,6 +470,25 @@ public static SchemaElement Encode(DataField field) { tse.Type = Type.INT32; tse.ConvertedType = ConvertedType.DATE; break; + case DateTimeFormat.Timestamp: + tse.Type = Type.INT64; + tse.LogicalType = new LogicalType { TIMESTAMP = new TimestampType { + IsAdjustedToUTC = dfDateTime.IsAdjustedToUTC, + Unit = dfDateTime.Unit switch { + DateTimeTimeUnit.Millis => new TimeUnit { + MILLIS = new MilliSeconds(), + }, + DateTimeTimeUnit.Micros => new TimeUnit { + MICROS = new MicroSeconds(), + }, + DateTimeTimeUnit.Nanos => new TimeUnit { + NANOS = new NanoSeconds(), + }, + _ => throw new ParquetException($"Unexpected TimeUnit: {dfDateTime.Unit}") + } + } + }; + break; default: tse.Type = Type.INT96; break; diff --git a/src/Parquet/Extensions/OtherExtensions.cs b/src/Parquet/Extensions/OtherExtensions.cs index db41de5c..f2e4c1a8 100644 --- a/src/Parquet/Extensions/OtherExtensions.cs +++ b/src/Parquet/Extensions/OtherExtensions.cs @@ -9,6 +9,10 @@ static class OtherExtensions { private static readonly DateTime UnixEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc); private const long UnixEpochMilliseconds = 62_135_596_800_000L; private const long UnixEpochMicroseconds = 62_135_596_800_000_000L; + +#if NET7_0_OR_GREATER + private static long UnixEpochNanoseconds = UnixEpoch.Ticks * TimeSpan.NanosecondsPerTick; +#endif public static DateTimeOffset FromUnixMilliseconds(this long unixMilliseconds) { return UnixEpoch.AddMilliseconds(unixMilliseconds); @@ -28,6 +32,11 @@ public static long ToUnixMicroseconds(this DateTime dto) { long microseconds = dto.Ticks / TimeSpan.TicksPerMicrosecond; return microseconds - UnixEpochMicroseconds; } + + public static long ToUnixNanoseconds(this DateTime dto) { + long nanoseconds = dto.Ticks * TimeSpan.NanosecondsPerTick; + return nanoseconds - UnixEpochNanoseconds; + } #endif public static DateTime AsUnixDaysInDateTime(this 
int unixDays) { diff --git a/src/Parquet/Schema/DateTimeDataField.cs b/src/Parquet/Schema/DateTimeDataField.cs index 1b8dab2a..1747d762 100644 --- a/src/Parquet/Schema/DateTimeDataField.cs +++ b/src/Parquet/Schema/DateTimeDataField.cs @@ -9,18 +9,44 @@ public class DateTimeDataField : DataField { /// Desired data format, Parquet specific /// public DateTimeFormat DateTimeFormat { get; } + + /// + /// IsAdjustedToUTC + /// + public bool IsAdjustedToUTC { get; } + + /// + /// TimeUnit + /// + public DateTimeTimeUnit Unit { get; } /// /// Initializes a new instance of the class. /// /// The name. /// The format. + /// + /// /// /// /// When set, uses this property to get the field's data. When not set, uses the property that matches the name parameter. - public DateTimeDataField(string name, DateTimeFormat format, bool? isNullable = null, bool? isArray = null, string? propertyName = null) + public DateTimeDataField(string name, DateTimeFormat format, bool isAdjustedToUTC = true, DateTimeTimeUnit? unit = null, bool? isNullable = null, bool? isArray = null, string? propertyName = null) : base(name, typeof(DateTime), isNullable, isArray, propertyName) { DateTimeFormat = format; + IsAdjustedToUTC = isAdjustedToUTC; + + // Override the unit for legacy types + if(format == DateTimeFormat.DateAndTime) { + Unit = DateTimeTimeUnit.Millis; + } +#if NET7_0_OR_GREATER + else if(format == DateTimeFormat.DateAndTimeMicros) { + Unit = DateTimeTimeUnit.Micros; + } +#endif + else { + Unit = unit ?? DateTimeTimeUnit.Millis; + } } } } \ No newline at end of file diff --git a/src/Parquet/Schema/DateTimeFormat.cs b/src/Parquet/Schema/DateTimeFormat.cs index d888b768..354fc358 100644 --- a/src/Parquet/Schema/DateTimeFormat.cs +++ b/src/Parquet/Schema/DateTimeFormat.cs @@ -25,6 +25,11 @@ public enum DateTimeFormat { /// /// Only stores a date. Time portion is truncated. Internally stored as INT32 /// - Date + Date, + + /// + /// Logical Type Timestamp. 
+ /// + Timestamp, } } diff --git a/src/Parquet/Schema/DateTimeTimeUnit.cs b/src/Parquet/Schema/DateTimeTimeUnit.cs new file mode 100644 index 00000000..0688d135 --- /dev/null +++ b/src/Parquet/Schema/DateTimeTimeUnit.cs @@ -0,0 +1,19 @@ +namespace Parquet.Schema { + /// + /// Support Time/Timestamp Units + /// + public enum DateTimeTimeUnit { + /// + /// Millisecond Precision + /// + Millis, + /// + /// Microsecond Precision + /// + Micros, + /// + /// Nanosecond Precision, note dotnet does not support full Nano second precision for DateTime + /// + Nanos, + } +} \ No newline at end of file diff --git a/src/Parquet/Serialization/Attributes/ParquetTimestampAttribute.cs b/src/Parquet/Serialization/Attributes/ParquetTimestampAttribute.cs index 16752191..18c6be28 100644 --- a/src/Parquet/Serialization/Attributes/ParquetTimestampAttribute.cs +++ b/src/Parquet/Serialization/Attributes/ParquetTimestampAttribute.cs @@ -34,16 +34,31 @@ public class ParquetTimestampAttribute : Attribute { /// Creates an instance of the attribute /// /// - public ParquetTimestampAttribute(ParquetTimestampResolution resolution = ParquetTimestampResolution.Milliseconds) { + /// + public ParquetTimestampAttribute(ParquetTimestampResolution resolution = ParquetTimestampResolution.Milliseconds, bool useLogicalTimestamp = false) { Resolution = resolution; + UseLogicalTimestamp = useLogicalTimestamp; } /// /// Resolution of Parquet timestamp /// public ParquetTimestampResolution Resolution { get; private set; } + + /// + /// Resolution of Parquet timestamp + /// + public bool UseLogicalTimestamp { get; private set; } + + /// + /// IsAdjustedToUTC + /// + public bool IsAdjustedToUTC { get; private set; } internal DateTimeFormat GetDateTimeFormat() { + if(UseLogicalTimestamp) + return DateTimeFormat.Timestamp; + return Resolution switch { ParquetTimestampResolution.Milliseconds => DateTimeFormat.DateAndTime, #if NET7_0_OR_GREATER diff --git a/src/Parquet/Serialization/TypeExtensions.cs b/src/Parquet/Serialization/TypeExtensions.cs index 80e08dec..eeff08f5 100644 --- a/src/Parquet/Serialization/TypeExtensions.cs +++ b/src/Parquet/Serialization/TypeExtensions.cs @@ -139,7 +139,9 @@ private static Field ConstructDataField(string name, string propertyName, Type t ParquetTimestampAttribute? tsa = member?.TimestampAttribute; r = new DateTimeDataField(name, tsa == null ? DateTimeFormat.Impala : tsa.GetDateTimeFormat(), - t == typeof(DateTime?), null, propertyName); + isAdjustedToUTC: tsa == null ? 
true : tsa.IsAdjustedToUTC, + unit: tsa?.Resolution.Convert(), + isNullable: t == typeof(DateTime?), null, propertyName); } else if(t == typeof(TimeSpan) || t == typeof(TimeSpan?)) { r = new TimeSpanDataField(name, member?.MicroSecondsTimeAttribute == null @@ -272,5 +274,45 @@ private static ParquetSchema CreateSchema(Type t, bool forWriting) { return new ParquetSchema(fields); } + + /// + /// Convert Resolution to TimeUnit + /// + /// + /// + public static DateTimeTimeUnit Convert(this ParquetTimestampResolution resolution) { + switch(resolution) { + case ParquetTimestampResolution.Milliseconds: + return DateTimeTimeUnit.Millis; +#if NET7_0_OR_GREATER + case ParquetTimestampResolution.Microseconds: + return DateTimeTimeUnit.Micros; +#endif + default: + throw new ParquetException($"Unexpected Resolution: {resolution}"); + // nanoseconds to be added + } + } + + /// + /// Convert Parquet TimeUnit to TimeUnit + /// + /// + /// + public static DateTimeTimeUnit Convert(this Parquet.Meta.TimeUnit unit) { + if(unit.MILLIS is not null) { + return DateTimeTimeUnit.Millis; + } + + if(unit.MICROS is not null) { + return DateTimeTimeUnit.Micros; + } + + if(unit.NANOS is not null) { + return DateTimeTimeUnit.Nanos; + } + + throw new ParquetException($"Unexpected TimeUnit: {unit}"); + } } } \ No newline at end of file From 940984cef120b403f04be0f90ebe335bfa2d30a4 Mon Sep 17 00:00:00 2001 From: IG Date: Fri, 6 Sep 2024 14:22:24 +0100 Subject: [PATCH 3/6] update release history --- docs/release-history.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-history.md b/docs/release-history.md index 5dcaedce..c94755fd 100644 --- a/docs/release-history.md +++ b/docs/release-history.md @@ -3,6 +3,7 @@ ### Improvements - File merger utility has `Stream` overload for non file-based operations. - File merger utility has extra overload to choose compression codec and specify custom metadata, by @dxdjgl in #519. +- Timestamp logical type is supported, by @cliedeman in #521. ## 4.24.0 From 8535bcfa36b1059e64e8919cc7f63c8f9281536b Mon Sep 17 00:00:00 2001 From: Ciaran Liedeman <3578740+cliedeman@users.noreply.github.com> Date: Wed, 19 Jun 2024 12:45:38 +0200 Subject: [PATCH 4/6] fix: Fixed Typo --- src/Parquet/Data/DataColumnStatistics.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Parquet/Data/DataColumnStatistics.cs b/src/Parquet/Data/DataColumnStatistics.cs index 312a1d6b..c6228912 100644 --- a/src/Parquet/Data/DataColumnStatistics.cs +++ b/src/Parquet/Data/DataColumnStatistics.cs @@ -47,11 +47,11 @@ public DataColumnStatistics(long? nullCount, long? distinctCount, object? minVal internal Statistics ToThriftStatistics(SchemaElement tse) { if(!ParquetPlainEncoder.TryEncode(MinValue, tse, out byte[]? min)) { - throw new ArgumentException($"cound not encode {MinValue}", nameof(MinValue)); + throw new ArgumentException($"could not encode {MinValue}", nameof(MinValue)); } if(!ParquetPlainEncoder.TryEncode(MaxValue, tse, out byte[]? 
max)) { - throw new ArgumentException($"cound not encode {MinValue}", nameof(MinValue)); + throw new ArgumentException($"could not encode {MinValue}", nameof(MinValue)); } return new Statistics { From 62ca0f04104bfed8f95cda606f538226e99f5db6 Mon Sep 17 00:00:00 2001 From: Eamon Hetherton Date: Fri, 6 Sep 2024 23:31:52 +1000 Subject: [PATCH 5/6] Merge pull request #534 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update vunerable System.Text.Json package and add Nullable attribute … * Add support for dictionary encoding for more data types beyond strings * Merge branch 'master' into fork/EamonHetherton/Issue_531 --- .../Encodings/ParquetDictionaryEncoder.cs | 116 +++++++++++++++++- src/Parquet/Parquet.csproj | 1 + src/Parquet/ParquetOptions.cs | 8 +- 3 files changed, 117 insertions(+), 8 deletions(-) diff --git a/src/Parquet/Encodings/ParquetDictionaryEncoder.cs b/src/Parquet/Encodings/ParquetDictionaryEncoder.cs index 23c01c2a..95fec97a 100644 --- a/src/Parquet/Encodings/ParquetDictionaryEncoder.cs +++ b/src/Parquet/Encodings/ParquetDictionaryEncoder.cs @@ -1,6 +1,7 @@ using System; using System.Buffers; using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; using System.Linq; namespace Parquet.Encodings { @@ -8,15 +9,61 @@ static class ParquetDictionaryEncoder { public static bool TryExtractDictionary(Type elementType, Array data, int offset, int count, - out Array? dictionaryArray, - out int[]? rentedIndexes, + [NotNullWhen(true)] out Array? dictionaryArray, + [NotNullWhen(true)] out int[]? rentedIndexes, double threshold = 0.8) { - dictionaryArray = null; rentedIndexes = null; - if(elementType != typeof(string) || count == 0) + if(count == 0) { return false; + } + if(elementType == typeof(string)) { + //Initially at least we will leave the existing string dictionary code path intact as there are some + //string specific optimizations in place. 
+ return EncodeStrings(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(DateTime)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(decimal)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(byte)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(short)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(ushort)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(int)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(uint)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(long)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(ulong)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(float)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + if(elementType == typeof(double)) { + return Encode(data, offset, count, ref dictionaryArray, ref rentedIndexes, threshold); + } + return false; + } + private static bool EncodeStrings(Array data, + int offset, + int count, + [NotNullWhen(true)] ref Array? dictionaryArray, + [NotNullWhen(true)] ref int[]? rentedIndexes, + double threshold) { string[] src = (string[])data; HashSet distinctSet = Distinct(src, offset, count); @@ -37,7 +84,6 @@ public static bool TryExtractDictionary(Type elementType, return true; } - private static HashSet Distinct(string[] strings, int offset, int count) { /* @@ -58,5 +104,65 @@ private static HashSet Distinct(string[] strings, int offset, int count) return hs; } + + private static bool Encode(Array data, + int offset, + int count, + [NotNullWhen(true)] ref Array? dictionaryArray, + [NotNullWhen(true)] ref int[]? rentedIndexes, + double threshold) where T : notnull { + var src = (T[])data; + + //TODO: calculate some more statistics beyond uniquness like run lengths, index size and index bitwidth to determine if there is value + //in dictionary encoding this data vs PLAIN encoding + //e.g. 
Dictionary encoding for byte values could be worse than plain even with 50% uniqueness depending on run lengths and value spread + Dictionary distinctSet = Distinct(src, offset, count, EqualityComparer.Default); + double uniquenessFactor = distinctSet.Count / (double)count; + if(uniquenessFactor > threshold) + return false; + + T[] dictionary = distinctSet.Keys.ToArray(); + dictionaryArray = dictionary; + + rentedIndexes = ArrayPool.Shared.Rent(count); + for(int isrc = offset, itgt = 0; isrc < offset + count; isrc++, itgt++) + rentedIndexes[itgt] = distinctSet[src[isrc]].Index; + + return true; + } + + private static Dictionary Distinct(T[] values, int offset, int count, EqualityComparer equalityComparer) + where T : notnull { + if(values.Length == 0) { + return new(0); + } + var dict = new Dictionary(values.Length); + T previous = values[offset]; + int runLength = 1; + int index = 0; + dict[previous] = (1, 1, index++); + for(int i = offset + 1; i < offset + count; i++) { + T key = values[i]; + if(equalityComparer.Equals(key, previous)) { + (int Count, int MaxRunLength, int Index) previousData = dict[key]; + dict[key] = (previousData.Count + 1, previousData.MaxRunLength, previousData.Index); + } else { + (int Count, int MaxRunLength, int Index) previousData = dict[previous]; + if(previousData.MaxRunLength < runLength) { + dict[previous] = (previousData.Count, runLength, previousData.Index); + } + if(dict.TryGetValue(key, out (int Count, int MaxRunLength, int Index) value)) { + dict[key] = (value.Count + 1, value.MaxRunLength, value.Index); + } else { + dict[key] = (1, 1, index++); + } + + runLength = 1; + previous = key; + } + + } + return dict; + } } } diff --git a/src/Parquet/Parquet.csproj b/src/Parquet/Parquet.csproj index 6f46c8f2..72486a3c 100644 --- a/src/Parquet/Parquet.csproj +++ b/src/Parquet/Parquet.csproj @@ -60,6 +60,7 @@ + diff --git a/src/Parquet/ParquetOptions.cs b/src/Parquet/ParquetOptions.cs index 9a6bb7f1..0efe99a3 100644 --- a/src/Parquet/ParquetOptions.cs +++ b/src/Parquet/ParquetOptions.cs @@ -38,13 +38,15 @@ public class ParquetOptions { #endif /// - /// Whether to use dictionary encoding for string columns. Other column types are not supported. + /// Whether to use dictionary encoding for columns if data meets + /// The following CLR types are currently supported: + /// , , , , , , , , , , , "/> /// public bool UseDictionaryEncoding { get; set; } = true; /// - /// String dictionary uniqueness threshold, which is a value from 0 (no unique values) - /// to 1 (all values are unique) indicating when string dictionary encoding is applied. + /// Dictionary uniqueness threshold, which is a value from 0 (no unique values) + /// to 1 (all values are unique) indicating when dictionary encoding is applied. /// Uniqueness factor needs to be less or equal than this threshold. /// public double DictionaryEncodingThreshold { get; set; } = 0.8; From 546db08d3aa8a4fd8032f14adcb111110cb06aee Mon Sep 17 00:00:00 2001 From: IG Date: Fri, 6 Sep 2024 14:33:00 +0100 Subject: [PATCH 6/6] update release history --- docs/release-history.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-history.md b/docs/release-history.md index c94755fd..e9e3e1ee 100644 --- a/docs/release-history.md +++ b/docs/release-history.md @@ -4,6 +4,7 @@ - File merger utility has `Stream` overload for non file-based operations. - File merger utility has extra overload to choose compression codec and specify custom metadata, by @dxdjgl in #519. 
 - Timestamp logical type is supported, by @cliedeman in #521.
+- More data types support encoding using Dictionary encoding, by @EamonHetherton in #531.
 
 ## 4.24.0
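
For reference, a minimal usage sketch of the logical TIMESTAMP support introduced in PATCH 2/6. The `DateTimeDataField` parameters, `DateTimeFormat.Timestamp`, `DateTimeTimeUnit` and the `useLogicalTimestamp` attribute flag come directly from the diffs above; the surrounding writer calls are assumed from the existing Parquet.Net 4.x API, and the `TimestampExample`/`Event`/`event_time` names are illustrative only, so treat this as a sketch rather than a definitive example.

    using System;
    using System.IO;
    using System.Threading.Tasks;
    using Parquet;
    using Parquet.Data;
    using Parquet.Schema;
    using Parquet.Serialization.Attributes;

    // Low-level API: declare an INT64 column with the logical TIMESTAMP annotation and write one value.
    static class TimestampExample {
        public static async Task WriteAsync(Stream destination) {
            var schema = new ParquetSchema(
                new DateTimeDataField("event_time",
                    DateTimeFormat.Timestamp,        // logical TIMESTAMP (INT64) rather than INT96 or a ConvertedType
                    isAdjustedToUTC: true,           // values are read back with DateTimeKind.Utc
                    unit: DateTimeTimeUnit.Millis)); // Micros and Nanos also exist; their encode paths need .NET 7+

            var column = new DataColumn(schema.DataFields[0], new[] { DateTime.UtcNow });

            using ParquetWriter writer = await ParquetWriter.CreateAsync(schema, destination);
            using ParquetRowGroupWriter rowGroup = writer.CreateRowGroup();
            await rowGroup.WriteColumnAsync(column);
        }
    }

    // Class-serialization route: opt in per property via the extended attribute.
    class Event {
        [ParquetTimestamp(useLogicalTimestamp: true)]
        public DateTime EventTime { get; set; }
    }

On read, the decoder added in PATCH 2/6 applies `DateTime.SpecifyKind` based on `IsAdjustedToUTC`, so values round-trip with `DateTimeKind.Utc` (or `Local` when the flag is false) instead of always being coerced to UTC; a class such as `Event` above is written through the usual class-serialization entry points.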
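PATCH 5/6 extends dictionary encoding from strings to most primitive column types. A minimal sketch of how that is controlled: the option names come from the ParquetOptions.cs diff above, while the `ParquetWriter.CreateAsync` overload and the `schema`/`stream` variables are assumed from the library's existing 4.x surface.

    // Dictionary encoding is on by default; the threshold is the maximum allowed
    // uniqueness factor (distinct values / total values) for a column to be dictionary-encoded.
    var options = new ParquetOptions {
        UseDictionaryEncoding = true,       // now also considered for numeric, decimal and DateTime columns
        DictionaryEncodingThreshold = 0.8   // fall back to PLAIN when more than 80% of values are distinct
    };

    using ParquetWriter writer = await ParquetWriter.CreateAsync(schema, stream, options);

Internally, `TryExtractDictionary` builds the distinct set for the supported element types and emits dictionary indexes only when the uniqueness factor is at or below the threshold; the TODO in the diff notes that run-length and bit-width statistics could refine that decision later.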