Skip to content

Commit

Permalink
Merge pull request #6 from CommunityHiQ/ISSUE_5
Browse files Browse the repository at this point in the history
Feature: Set timezone for datetime values
  • Loading branch information
Svenskapojkarna authored Apr 21, 2023
2 parents 8332699 + a7cb683 commit 2b46728
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 13 deletions.
65 changes: 55 additions & 10 deletions Frends.Community.Apache.Tests/UnitTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,12 @@ public void WriteParquetFile()
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: '045c6a617a1d37e0a1b464ccdeea2979'";
Assert.IsTrue(hash == "045c6a617a1d37e0a1b464ccdeea2979", errMessage);
Assert.AreEqual(hash, "045c6a617a1d37e0a1b464ccdeea2979", errMessage);
}
else
{
var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: '3d6f72d1664b6a4040d2f12457264060'";
Assert.IsTrue(hash == "3d6f72d1664b6a4040d2f12457264060", errMessage);
Assert.AreEqual(hash, "3d6f72d1664b6a4040d2f12457264060", errMessage);
}
}

Expand Down Expand Up @@ -217,12 +217,12 @@ public void WriteParquetFileNoNulls()
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: '6c80e7c86c8adf39b8544f7bc90724c8'";
Assert.IsTrue(hash == "6c80e7c86c8adf39b8544f7bc90724c8", errMessage);
Assert.AreEqual(hash, "6c80e7c86c8adf39b8544f7bc90724c8", errMessage);
}
else
{
var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: 'd5dcfc43ecd64da5f5013dab3095b777'";
Assert.IsTrue(hash == "d5dcfc43ecd64da5f5013dab3095b777", errMessage);
Assert.AreEqual(hash, "d5dcfc43ecd64da5f5013dab3095b777", errMessage);
}
}

Expand Down Expand Up @@ -263,12 +263,12 @@ public void WriteParquetFileQuotes()
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: '86692194196efecc823d48384bd2a5a5'";
Assert.IsTrue(hash == "86692194196efecc823d48384bd2a5a5", errMessage);
Assert.AreEqual(hash, "86692194196efecc823d48384bd2a5a5", errMessage);
}
else
{
var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: '94e1bfe7bf71d94d5bd52f2de2af658b'";
Assert.IsTrue(hash == "94e1bfe7bf71d94d5bd52f2de2af658b", errMessage);
Assert.AreEqual(hash, "94e1bfe7bf71d94d5bd52f2de2af658b", errMessage);
}
}

Expand Down Expand Up @@ -480,15 +480,60 @@ public void WriteParquetFileCountRows()
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: '045c6a617a1d37e0a1b464ccdeea2979'";
Assert.IsTrue(hash == "045c6a617a1d37e0a1b464ccdeea2979", errMessage);
Assert.AreEqual(hash, "045c6a617a1d37e0a1b464ccdeea2979", errMessage);
}
else
{
var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: '3d6f72d1664b6a4040d2f12457264060'";
Assert.IsTrue(hash == "3d6f72d1664b6a4040d2f12457264060", errMessage);
Assert.AreEqual(hash, "3d6f72d1664b6a4040d2f12457264060", errMessage);
}
}

/// <summary>
/// Test case for selecting timezone for datetime.
/// Converts the shared input CSV to Parquet with <see cref="Timezone.FLEStandardTime"/>
/// selected and verifies the MD5 checksum of the generated file.
/// </summary>
/// <remarks>
/// NOTE(review): the expected checksums below are the same values that
/// WriteParquetFile and WriteParquetFileCountRows assert on. Confirm that the
/// input fixture actually contains datetime values whose Parquet output changes
/// with the time zone; otherwise this test does not exercise the new option.
/// </remarks>
[Test]
public void WriteParquetFileDatetime()
{
    // Arrange: start from a clean slate so a stale output file cannot satisfy the check.
    TestTools.RemoveOutputFile(_outputFileName);

    var options = new WriteCSVOptions()
    {
        CsvDelimiter = ";",
        FileEncoding = FileEncoding.UTF8,
        EnableBom = false,
        EncodingInString = ""
    };

    var poptions = new WriteParquetOptions()
    {
        ParquetRowGroupSize = 5000,
        ParquetCompressionMethod = CompressionType.Gzip,
        Timezone = Timezone.FLEStandardTime
    };

    var input = new WriteInput()
    {
        CsvFileName = _inputCsvFileName,
        OuputFileName = _outputFileName,
        ThrowExceptionOnErrorResponse = true,
        Schema = _commonSchema
    };

    // Act
    ApacheTasks.ConvertCsvToParquet(input, options, poptions, CancellationToken.None);

    // Assert: the generated file differs per platform, so the expected checksum does too.
    var hash = TestTools.MD5Hash(_outputFileName);
    var expected = RuntimeInformation.IsOSPlatform(OSPlatform.Linux)
        ? "045c6a617a1d37e0a1b464ccdeea2979"
        : "3d6f72d1664b6a4040d2f12457264060";
    var errMessage = $"File checksum doesn't match. Generated checksum: '{hash}' differs the expected checksum: '{expected}'";

    // NUnit's signature is AreEqual(expected, actual, message); keeping this order
    // makes the framework's own "Expected/But was" output read correctly.
    Assert.AreEqual(expected, hash, errMessage);
}

/// <summary>
/// Simple csv -> parquet test case with large group size.
Expand Down Expand Up @@ -528,12 +573,12 @@ public void WriteParquetFileMaxMemory()
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
var errMessage = $"File checksum didn't match. Generated checksum: '{hash}' differs the expected checksum: '936a383c5b5f5665114f48c804f52bd3'";
Assert.IsTrue(hash == "936a383c5b5f5665114f48c804f52bd3", errMessage);
Assert.AreEqual(hash, "936a383c5b5f5665114f48c804f52bd3", errMessage);
}
else
{
var errMessage = $"File checksum didn't match. Generated checksum: '{hash}' differs the expected checksum: 'd214884cf6a6596cbc69a174bdd60805'";
Assert.IsTrue(hash == "d214884cf6a6596cbc69a174bdd60805", errMessage);
Assert.AreEqual(hash, "d214884cf6a6596cbc69a174bdd60805", errMessage);
}
}

Expand Down
61 changes: 59 additions & 2 deletions Frends.Community.Apache/ApacheTasks.cs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,37 @@ public static WriteResult ConvertCsvToParquet([PropertyTab] WriteInput input, [P
((bool?[])csvColumns[i])[dataIndex] = Writer.GetBooleanValueNullable(csv.GetField(i));
break;
case DataType.DateTimeOffset:
((DateTimeOffset?[])csvColumns[i])[dataIndex] = Writer.GetDateTimeOffsetValueNullable(csv.GetField(i), config.GetConfigValue(dataFields[i].Name));
var date = Writer.GetDateTimeOffsetValueNullable(csv.GetField(i), config.GetConfigValue(dataFields[i].Name));
if (date == null) ((DateTimeOffset?[])csvColumns[i])[dataIndex] = date;
else
{
string timezone = "";

switch (parquetOptions.Timezone)
{
case Timezone.FLEStandardTime:
timezone = "FLE Standard Time";
break;
case Timezone.CentralEuropeStandardTime:
timezone = "Central Europe Standard Time";
break;
case Timezone.Other:
if (string.IsNullOrEmpty(parquetOptions.OtherTimezone))
{
timezone = "GMT Standard Time";
}
else
{
timezone = parquetOptions.OtherTimezone;
}
break;
default:
timezone = "GMT Standard Time";
break;
}
TimeZoneInfo cetInfo = TimeZoneInfo.FindSystemTimeZoneById(timezone);
((DateTimeOffset?[])csvColumns[i])[dataIndex] = TimeZoneInfo.ConvertTime((DateTimeOffset)date, cetInfo);
}
break;
case DataType.Decimal:
((decimal?[])csvColumns[i])[dataIndex] = Writer.GetDecimalValueNullable(csv.GetField(i), config.GetConfigValue(dataFields[i].Name));
Expand Down Expand Up @@ -187,7 +217,34 @@ public static WriteResult ConvertCsvToParquet([PropertyTab] WriteInput input, [P
((bool[])csvColumns[i])[dataIndex] = Writer.GetBooleanValue(csv.GetField(i));
break;
case DataType.DateTimeOffset:
((DateTimeOffset[])csvColumns[i])[dataIndex] = Writer.GetDateTimeOffsetValue(csv.GetField(i), config.GetConfigValue(dataFields[i].Name));
string timezone = "";

switch (parquetOptions.Timezone)
{
case Timezone.FLEStandardTime:
timezone = "FLE Standard Time";
break;
case Timezone.CentralEuropeStandardTime:
timezone = "Central Europe Standard Time";
break;
case Timezone.Other:
if (string.IsNullOrEmpty(parquetOptions.OtherTimezone))
{
timezone = "GMT Standard Time";
}
else
{
timezone = parquetOptions.OtherTimezone;
}
break;
default:
timezone = "GMT Standard Time";
break;
}
TimeZoneInfo cetInfo = TimeZoneInfo.FindSystemTimeZoneById(timezone);

var date = Writer.GetDateTimeOffsetValue(csv.GetField(i), config.GetConfigValue(dataFields[i].Name));
((DateTimeOffset[])csvColumns[i])[dataIndex] = TimeZoneInfo.ConvertTime((DateTimeOffset)date, cetInfo);
break;
case DataType.Decimal:
((decimal[])csvColumns[i])[dataIndex] = decimal.Parse(csv.GetField(i), Writer.GetCultureInfo(config.GetConfigValue(dataFields[i].Name)));
Expand Down
14 changes: 14 additions & 0 deletions Frends.Community.Apache/Definitions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ public enum FileEncoding { UTF8, ANSI, ASCII, Unicode, Other }

public enum CompressionType { Gzip, Snappy, None }

/// <summary>
/// Time zone choices for datetime values written by the CSV to Parquet conversion.
/// </summary>
public enum Timezone
{
    /// <summary>Windows time zone ID "GMT Standard Time".</summary>
    GMTStandardTime,

    /// <summary>Windows time zone ID "Central Europe Standard Time".</summary>
    CentralEuropeStandardTime,

    /// <summary>Windows time zone ID "FLE Standard Time".</summary>
    FLEStandardTime,

    /// <summary>Use the time zone ID supplied separately in the options.</summary>
    Other
}

public class WriteParquetOptions
{
/// <summary>
Expand All @@ -66,6 +68,18 @@ public class WriteParquetOptions
/// </summary>
[DefaultValue(false)]
public bool CountRowsBeforeProcessing { get; set; } = false;

/// <summary>
/// Time zone to which datetime (DateTimeOffset) column values are converted
/// during the CSV to Parquet conversion. Defaults to GMT Standard Time.
/// </summary>
[DefaultValue(Timezone.GMTStandardTime)]
public Timezone Timezone { get; set; } = Timezone.GMTStandardTime;

/// <summary>
/// Time zone ID used when <see cref="Timezone"/> is set to Other. The value is looked up with
/// TimeZoneInfo.FindSystemTimeZoneById; when it is null or empty, "GMT Standard Time" is used instead.
/// Full list on: https://learn.microsoft.com/en-us/windows-hardware/manufacture/desktop/default-time-zones?view=windows-11
/// </summary>
[UIHint(nameof(Timezone), "", Timezone.Other)]
public string OtherTimezone { get; set; }
}

public class WriteCSVOptions
Expand Down
2 changes: 1 addition & 1 deletion Frends.Community.Apache/Frends.Community.Apache.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<IncludeSource>true</IncludeSource>
<PackageTags>Frends</PackageTags>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<Version>1.0.3</Version>
<Version>1.1.0</Version>
<LangVersion>latest</LangVersion>
</PropertyGroup>

Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ Note: Decimals, floats and doubles have "fi-FI" default culture. Decimal separat
| Parquet row group size | number | Parquet file's row group size. Batch size should be large enough for good performance later. | 5000 |
| Parquet compression method | Enum | Parquet's compression level. GZip (smallest filesize) / Snappy / None | Gzip |
| Count rows before processing | bool | Count CSV file rows before processing. If the row count is smaller than the Parquet row group size, the group size is decreased. Because this operation reads the CSV file before processing, the CSV file is processed twice. | false |
| Timezone | Enum | Set timezone for datetime values. GMTStandardTime / CentralEuropeStandardTime / FLEStandardTime / Other | GMTStandardTime |
| Other Timezone | string | Time zone ID to use when Timezone is set to Other. If empty, GMT Standard Time is used. | Greenwich Standard Time |

### Returns

Expand Down Expand Up @@ -105,3 +107,4 @@ NOTE: Be sure to merge the latest from "upstream" before making a pull request!
| 1.0.1 | Multitarget conversion and CI |
| 1.0.2 | Fixed issue #2: Typos/errors are difficult to find in configuration |
| 1.0.3 | Dependency update: CsvHelper 27.1.1 to 30.0.1, Parquet.Net 3.9.0 to 3.10.0 |
| 1.1.0 | Option for setting timezone for datetime values |

0 comments on commit 2b46728

Please sign in to comment.