diff --git a/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs b/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs index 5919c948..d8d8b267 100644 --- a/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs +++ b/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs @@ -891,6 +891,35 @@ public async Task Interface_Serialize() { Assert.Equivalent(data, data2); } + + [Fact] + public async Task DateTimeOffset_Serialize() { + var ms = new MemoryStream(); + var value = new DateTimeOffset(2020, 1, 1, 0, 0, 0, TimeSpan.Zero); + + var expected = new DateTimeOffsetClass { + DateTimeOffset = value, + }; + + //write + await ParquetSerializer.SerializeAsync([expected], ms); + + //read back + ms.Position = 0; + IList data = await ParquetSerializer.DeserializeAsync(ms); + DateTimeOffsetClass actual = Assert.Single(data); + Assert.Equal(expected.DateTimeOffset, actual.DateTimeOffset); + + //write append + ms.Position = 0; + await ParquetSerializer.SerializeAsync([expected], ms, new ParquetSerializerOptions { + Append = true, + }); + } + + private class DateTimeOffsetClass { + public DateTimeOffset DateTimeOffset { get; set; } + } [Fact] public async Task InterfaceProperty_Serialize() { diff --git a/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs b/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs index a0fccb86..66f9e990 100644 --- a/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs +++ b/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs @@ -326,6 +326,9 @@ class DatesPoco { [ParquetTimestamp(useLogicalTimestamp: true)] public DateTime LogicalTimestampDate { get; set; } + [ParquetTimestamp(useLogicalTimestamp: true, isAdjustedToUtc: true)] + public DateTimeOffset DateTimeOffset { get; set; } + [ParquetTimestamp] public DateTime? NullableTimestampDate { get; set; } @@ -394,6 +397,15 @@ public void Type_DateTime_LogicalTimestamp() { Assert.True(df is DateTimeDataField); Assert.Equal(DateTimeFormat.Timestamp, ((DateTimeDataField)df).DateTimeFormat); } + + [Fact] + public void Type_DateTimeOffset() { + ParquetSchema s = typeof(DatesPoco).GetParquetSchema(true); + + DataField df = s.FindDataField(nameof(DatesPoco.DateTimeOffset)); + Assert.True(df is DateTimeOffsetDataField); + Assert.Equal(DateTimeTimeUnit.Millis, ((DateTimeOffsetDataField)df).Unit); + } [Fact] public void Type_DateTime_TimestampNullable() { diff --git a/src/Parquet.Test/Types/EndToEndTypeTest.cs b/src/Parquet.Test/Types/EndToEndTypeTest.cs index 6f77f48f..f6c7c6c7 100644 --- a/src/Parquet.Test/Types/EndToEndTypeTest.cs +++ b/src/Parquet.Test/Types/EndToEndTypeTest.cs @@ -51,6 +51,7 @@ public class EndToEndTypeTest : TestBase { ["dateDateAndTime local kind"] = (new DateTimeDataField("dateDateAndTime unknown kind", DateTimeFormat.DateAndTime), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)), ["timestamp utc kind"] = (new DateTimeDataField("timestamp utc kind", DateTimeFormat.Timestamp, true), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Utc)), ["timestamp local kind"] = (new DateTimeDataField("timestamp local kind", DateTimeFormat.Timestamp, false), new DateTime(2020, 06, 10, 11, 12, 13, DateTimeKind.Local)), + ["datetimeoffset utc kind"] = (new DateTimeOffsetDataField("datetimeoffset utc kind", DateTimeTimeUnit.Millis), new DateTimeOffset(2020, 06, 10, 11, 12, 13, TimeSpan.Zero)), // don't want any excess info in the offset INT32 doesn't contain or care about this data ["dateDate"] = (new DateTimeDataField("dateDate", DateTimeFormat.Date), DateTime.UtcNow.RoundToDay()), #if !NETCOREAPP3_1 @@ -128,6 +129,7 @@ public class EndToEndTypeTest : TestBase { [InlineData("dateDateAndTime local kind")] [InlineData("timestamp utc kind")] [InlineData("timestamp local kind")] + [InlineData("datetimeoffset utc kind")] [InlineData("dateDate")] #if !NETCOREAPP3_1 [InlineData("dateOnly")] @@ -191,6 +193,10 @@ public async Task Type_writes_and_reads_end_to_end(string name) { ? DateTime.SpecifyKind(dtExpected, DateTimeKind.Utc) // assumes value is UTC : dtExpected.ToUniversalTime(); equal = dtActual.Equals(dtExpected); + } else if(actual.GetType() == typeof(DateTimeOffset)) { + var dtActual = (DateTimeOffset)actual; + var dtExpected = (DateTimeOffset)input.expectedValue!; + equal = dtActual.Equals(dtExpected); } else { equal = actual.Equals(input.expectedValue); } diff --git a/src/Parquet/Encodings/ParquetPlainEncoder.cs b/src/Parquet/Encodings/ParquetPlainEncoder.cs index ca5e2a6b..b9a71d5e 100644 --- a/src/Parquet/Encodings/ParquetPlainEncoder.cs +++ b/src/Parquet/Encodings/ParquetPlainEncoder.cs @@ -97,6 +97,11 @@ public static void Encode( } else if(t == typeof(byte[][])) { Span span = ((byte[][])data).AsSpan(offset, count); Encode(span, destination); + } else if(t == typeof(DateTimeOffset[])) { + Span span = ((DateTimeOffset[])data).AsSpan(offset, count); + Encode(span, destination, tse); + if(stats != null) + FillStats(span, stats); } else if(t == typeof(DateTime[])) { Span span = ((DateTime[])data).AsSpan(offset, count); Encode(span, destination, tse); @@ -185,6 +190,9 @@ public static void Decode( } else if(t == typeof(byte[][])) { Span span = ((byte[][])data).AsSpan(offset, count); elementsRead = Decode(source, span, tse); + } else if(t == typeof(DateTimeOffset[])) { + Span span = ((DateTimeOffset[])data).AsSpan(offset, count); + elementsRead = Decode(source, span, tse); } else if(t == typeof(DateTime[])) { Span span = ((DateTime[])data).AsSpan(offset, count); elementsRead = Decode(source, span, tse); @@ -264,6 +272,8 @@ public static bool TryEncode(object? value, SchemaElement tse, out byte[]? resul } else if(t == typeof(byte[])) { result = (byte[])value; return true; + } else if(t == typeof(DateTimeOffset)) { + return TryEncode((DateTimeOffset)value, tse, out result); } else if(t == typeof(DateTime)) return TryEncode((DateTime)value, tse, out result); #if NET6_0_OR_GREATER @@ -353,6 +363,18 @@ private static decimal TryDecodeDecimal(byte[] value, SchemaElement tse) { } } + private static bool TryEncode(DateTimeOffset value, SchemaElement tse, out byte[] result) { + switch(tse.Type) { + case TType.INT64: + long unixTime = value.ToUniversalTime().ToUnixTimeMilliseconds(); + result = BitConverter.GetBytes(unixTime); + return true; + default: + throw new InvalidDataException($"data type '{tse.Type}' does not represent any date types"); + + } + } + private static bool TryEncode(DateTime value, SchemaElement tse, out byte[] result) { switch(tse.Type) { case TType.INT32: @@ -812,6 +834,40 @@ public static int Decode(Span source, Span data, SchemaElement tse return read; } + public static void Encode(ReadOnlySpan data, Stream destination, SchemaElement tse) { + switch(tse.Type) { + case TType.INT64: + if(tse.LogicalType?.TIMESTAMP is not null) { + foreach(DateTimeOffset element in data) { + if(tse.LogicalType.TIMESTAMP.Unit.MILLIS is not null) { + // Always convert to UTC + long unixTime = element.ToUniversalTime().DateTime.ToUnixMilliseconds(); + byte[] raw = BitConverter.GetBytes(unixTime); + destination.Write(raw, 0, raw.Length); +#if NET7_0_OR_GREATER + } else if (tse.LogicalType.TIMESTAMP.Unit.MICROS is not null) { + long unixTime = element.ToUniversalTime().DateTime.ToUnixMicroseconds(); + byte[] raw = BitConverter.GetBytes(unixTime); + destination.Write(raw, 0, raw.Length); + } else if (tse.LogicalType.TIMESTAMP.Unit.NANOS is not null) { + long unixTime = element.ToUniversalTime().DateTime.ToUnixNanoseconds(); + byte[] raw = BitConverter.GetBytes(unixTime); + destination.Write(raw, 0, raw.Length); +#endif + } else { + throw new ParquetException($"Unexpected TimeUnit: {tse.LogicalType.TIMESTAMP.Unit}"); + } + } + } else { + throw new ArgumentException($"invalid converted type: {tse.ConvertedType}"); + } + break; + default: + throw new InvalidDataException($"data type '{tse.Type}' does not represent any date types"); + + } + } + public static void Encode(ReadOnlySpan data, Stream destination, SchemaElement tse) { switch(tse.Type) { @@ -900,6 +956,45 @@ public static void Encode(ReadOnlySpan data, Stream destination, Schem } } #endif + public static int Decode(Span source, Span data, SchemaElement tse) { + switch(tse.Type) { + case TType.INT64: + long[] longs = ArrayPool.Shared.Rent(data.Length); + try { + int longsRead = Decode(source, longs.AsSpan(0, data.Length)); + if(tse.LogicalType?.TIMESTAMP is not null) { + if(!tse.LogicalType.TIMESTAMP.IsAdjustedToUTC) { + throw new ParquetException("Cannot decode Timestamp to DateTimeOffset with IsAdjustedToUTC set to false"); + } + + for(int i = 0; i < longsRead; i++) { + if(tse.LogicalType.TIMESTAMP.Unit.MILLIS is not null) { + data[i] = DateTimeOffset.FromUnixTimeMilliseconds(longs[i]); + } else if(tse.LogicalType.TIMESTAMP.Unit.MICROS is not null) { + long lv = longs[i]; + long microseconds = lv % 1000; + lv /= 1000; + data[i] = DateTimeOffset.FromUnixTimeMilliseconds(lv).AddTicks(microseconds * 10); + } else if(tse.LogicalType.TIMESTAMP.Unit.NANOS is not null) { + long lv = longs[i]; + long nanoseconds = lv % 1000000; + lv /= 1000000;; + data[i] = DateTimeOffset.FromUnixTimeMilliseconds(lv).AddTicks(nanoseconds / 100); // 1 tick = 100 nanoseconds; + } else { + throw new ParquetException($"Unexpected TimeUnit: {tse.LogicalType.TIMESTAMP.Unit}"); + } + } + } else { + throw new ArgumentException(""); + } + return longsRead; + } finally { + ArrayPool.Shared.Return(longs); + } + default: + throw new NotSupportedException(); + } + } public static int Decode(Span source, Span data, SchemaElement tse) { switch(tse.Type) { @@ -1300,6 +1395,12 @@ public static void FillStats(ReadOnlySpan data, DataColumnStatistics stat stats.MaxValue = max; } + public static void FillStats(ReadOnlySpan data, DataColumnStatistics stats) { + data.MinMax(out DateTimeOffset min, out DateTimeOffset max); + stats.MinValue = min; + stats.MaxValue = max; + } + public static void FillStats(ReadOnlySpan data, DataColumnStatistics stats) { data.MinMax(out DateTime min, out DateTime max); stats.MinValue = min; diff --git a/src/Parquet/Encodings/SchemaEncoder.cs b/src/Parquet/Encodings/SchemaEncoder.cs index 866ab8e2..ba640318 100644 --- a/src/Parquet/Encodings/SchemaEncoder.cs +++ b/src/Parquet/Encodings/SchemaEncoder.cs @@ -24,6 +24,7 @@ static class SchemaEncoder { typeof(decimal), typeof(BigInteger), typeof(DateTime), + typeof(DateTimeOffset), #if NET6_0_OR_GREATER typeof(DateOnly), typeof(TimeOnly), @@ -472,7 +473,8 @@ public static SchemaElement Encode(DataField field) { break; case DateTimeFormat.Timestamp: tse.Type = Type.INT64; - tse.LogicalType = new LogicalType { TIMESTAMP = new TimestampType { + tse.LogicalType = new LogicalType { + TIMESTAMP = new TimestampType { IsAdjustedToUTC = dfDateTime.IsAdjustedToUTC, Unit = dfDateTime.Unit switch { DateTimeTimeUnit.Millis => new TimeUnit { @@ -493,9 +495,34 @@ public static SchemaElement Encode(DataField field) { tse.Type = Type.INT96; break; } + } else if(field is DateTimeOffsetDataField) { + throw new ParquetException($"Unexpected DataField: {field.GetType()} should be DateTimeDataField"); } else { tse.Type = Type.INT96; } + } else if(st == typeof(DateTimeOffset)) { + if(field is DateTimeOffsetDataField dtDateTimeOffset) { + tse.Type = Type.INT64; + tse.LogicalType = new LogicalType { + TIMESTAMP = new TimestampType { + IsAdjustedToUTC = dtDateTimeOffset.IsAdjustedToUTC, + Unit = dtDateTimeOffset.Unit switch { + DateTimeTimeUnit.Millis => new TimeUnit { + MILLIS = new MilliSeconds(), + }, + DateTimeTimeUnit.Micros => new TimeUnit { + MICROS = new MicroSeconds(), + }, + DateTimeTimeUnit.Nanos => new TimeUnit { + NANOS = new NanoSeconds(), + }, + _ => throw new ParquetException($"Unexpected TimeUnit: {dtDateTimeOffset.Unit}") + } + } + }; + } else { + throw new ParquetException($"Unexpected DataField: {field.GetType()} should be DateTimeOffsetDataField"); + } #if NET6_0_OR_GREATER } else if(st == typeof(DateOnly)) { // DateOnly diff --git a/src/Parquet/Extensions/SpanExtensions.cs b/src/Parquet/Extensions/SpanExtensions.cs index 36ae9f8a..4817bce9 100644 --- a/src/Parquet/Extensions/SpanExtensions.cs +++ b/src/Parquet/Extensions/SpanExtensions.cs @@ -175,6 +175,17 @@ public static void MinMax(this ReadOnlySpan span, out float min, out floa } } + public static void MinMax(this ReadOnlySpan span, out DateTimeOffset min, out DateTimeOffset max) { + min = span.IsEmpty ? default(DateTimeOffset) : span[0]; + max = min; + foreach(DateTimeOffset i in span) { + if(i < min) + min = i; + if(i > max) + max = i; + } + } + public static void MinMax(this ReadOnlySpan span, out DateTime min, out DateTime max) { min = span.IsEmpty ? default(DateTime) : span[0]; max = min; diff --git a/src/Parquet/Schema/DataField.cs b/src/Parquet/Schema/DataField.cs index 7910948c..31685a81 100644 --- a/src/Parquet/Schema/DataField.cs +++ b/src/Parquet/Schema/DataField.cs @@ -63,13 +63,8 @@ public DataField(string name, Type clrType, bool? isNullable = null, bool? isArr Discover(clrType, isCompiledWithNullable ?? true, out Type baseType, out bool discIsArray, out bool discIsNullable); ClrType = baseType; - if(!SchemaEncoder.IsSupported(ClrType)) { - if(baseType == typeof(DateTimeOffset)) { - throw new NotSupportedException($"{nameof(DateTimeOffset)} support was dropped due to numerous ambiguity issues, please use {nameof(DateTime)} from now on."); - } - else { - throw new NotSupportedException($"type {clrType} is not supported"); - } + if(!SchemaEncoder.IsSupported(ClrType)){ + throw new NotSupportedException($"type {clrType} is not supported"); } IsNullable = isNullable ?? discIsNullable; @@ -158,13 +153,20 @@ public override bool Equals(object? obj) { return false; return base.Equals(obj) && - BaseClrType == other.BaseClrType && + BaseClrTypeCompatible(other) && IsNullable == other.IsNullable && IsArray == other.IsArray; } /// public override int GetHashCode() => base.GetHashCode(); + + /// + /// + /// + /// + /// + protected virtual bool BaseClrTypeCompatible(DataField other) => this.BaseClrType == other.BaseClrType; #region [ Type Resolution ] diff --git a/src/Parquet/Schema/DateTimeOffsetDataField.cs b/src/Parquet/Schema/DateTimeOffsetDataField.cs new file mode 100644 index 00000000..3d5baa28 --- /dev/null +++ b/src/Parquet/Schema/DateTimeOffsetDataField.cs @@ -0,0 +1,45 @@ +using System; + +namespace Parquet.Schema { + /// + /// Schema element for which allows to specify precision + /// + public class DateTimeOffsetDataField : DataField { + /// + /// IsAdjustedToUTC + /// + public bool IsAdjustedToUTC => true; + + /// + /// TimeUnit + /// + public DateTimeTimeUnit Unit { get; } + + + /// + /// Initializes a new instance of the class. + /// + /// The name. + /// + /// + /// + /// When set, uses this property to get the field's data. When not set, uses the property that matches the name parameter. + public DateTimeOffsetDataField(string name, DateTimeTimeUnit? unit = null, bool? isNullable = null, bool? isArray = null, string? propertyName = null) + : base(name, typeof(DateTimeOffset), isNullable, isArray, propertyName) { + Unit = unit ?? DateTimeTimeUnit.Millis; + } + + /// + /// + /// + /// + /// + protected override bool BaseClrTypeCompatible(DataField other) { + if(other is DateTimeDataField) { + return true; + } + + return base.BaseClrTypeCompatible(other); + } + } +} \ No newline at end of file diff --git a/src/Parquet/Serialization/Attributes/ParquetTimestampAttribute.cs b/src/Parquet/Serialization/Attributes/ParquetTimestampAttribute.cs index 18c6be28..6847c4ba 100644 --- a/src/Parquet/Serialization/Attributes/ParquetTimestampAttribute.cs +++ b/src/Parquet/Serialization/Attributes/ParquetTimestampAttribute.cs @@ -35,9 +35,11 @@ public class ParquetTimestampAttribute : Attribute { /// /// /// - public ParquetTimestampAttribute(ParquetTimestampResolution resolution = ParquetTimestampResolution.Milliseconds, bool useLogicalTimestamp = false) { + /// + public ParquetTimestampAttribute(ParquetTimestampResolution resolution = ParquetTimestampResolution.Milliseconds, bool useLogicalTimestamp = false, bool isAdjustedToUtc = false) { Resolution = resolution; UseLogicalTimestamp = useLogicalTimestamp; + IsAdjustedToUTC = isAdjustedToUtc; } /// diff --git a/src/Parquet/Serialization/TypeExtensions.cs b/src/Parquet/Serialization/TypeExtensions.cs index 35e321c9..973b11c8 100644 --- a/src/Parquet/Serialization/TypeExtensions.cs +++ b/src/Parquet/Serialization/TypeExtensions.cs @@ -194,6 +194,11 @@ private static Field ConstructDataField(string name, string propertyName, Type t isAdjustedToUTC: tsa == null ? true : tsa.IsAdjustedToUTC, unit: tsa?.Resolution.Convert(), isNullable: t == typeof(DateTime?), null, propertyName); + } else if (t == typeof(DateTimeOffset) || t == typeof(DateTimeOffset?)) { + ParquetTimestampAttribute? tsa = member?.TimestampAttribute; + r = new DateTimeOffsetDataField(name, + unit: tsa?.Resolution.Convert(), + isNullable: t == typeof(DateTimeOffset?), null, propertyName); } else if(t == typeof(TimeSpan) || t == typeof(TimeSpan?)) { r = new TimeSpanDataField(name, member?.MicroSecondsTimeAttribute == null