Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataframe csv datetime #5834

Merged
merged 6 commits into from
Jun 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ private static Type GuessKind(int col, List<string[]> read)
++nbline;
continue;
}
bool dateParse = DateTime.TryParse(val, out DateTime dateResult);
if (dateParse)
{
res = DetermineType(nbline == 0, typeof(DateTime), res);
++nbline;
continue;
}

res = DetermineType(nbline == 0, typeof(string), res);
++nbline;
Expand All @@ -71,6 +78,8 @@ private static Type MaxKind(Type a, Type b)
return typeof(float);
if (a == typeof(bool) || b == typeof(bool))
return typeof(bool);
if (a == typeof(DateTime) || b == typeof(DateTime))
return typeof(DateTime);
return typeof(string);
}

Expand Down Expand Up @@ -165,6 +174,10 @@ private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int
{
ret = new UInt16DataFrameColumn(GetColumnName(columnNames, columnIndex));
}
else if (kind == typeof(DateTime))
{
ret = new PrimitiveDataFrameColumn<DateTime>(GetColumnName(columnNames, columnIndex));
}
else
{
throw new NotSupportedException(nameof(kind));
Expand Down
1 change: 1 addition & 0 deletions src/Microsoft.Data.Analysis/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,7 @@ public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
if (value != null)
{
value = Convert.ChangeType(value, column.DataType);

if (value is null)
{
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
Expand Down
314 changes: 314 additions & 0 deletions src/Microsoft.Data.Analysis/DateTimeComputation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Text;

namespace Microsoft.Data.Analysis
{
internal class DateTimeComputation : IPrimitiveColumnComputation<DateTime>
{
/// <summary>
/// Not supported for <see cref="DateTime"/> columns.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void Abs(PrimitiveColumnContainer<DateTime> column)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void All(PrimitiveColumnContainer<DateTime> column, out bool ret)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void Any(PrimitiveColumnContainer<DateTime> column, out bool ret)
    => throw new NotSupportedException();

public void CumulativeMax(PrimitiveColumnContainer<DateTime> column)
{
var ret = column.Buffers[0].ReadOnlySpan[0];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if it is empty?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had the same thought when I first saw the PR, so I looked at what the other columns are doing. None of them check for empty here. It's not high priority IMO, so I'm thinking we can fix that for all the columns in a separate PR?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you log an issue for this? So we remember to do it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for (int b = 0; b < column.Buffers.Count; b++)
{
var buffer = column.Buffers[b];
var mutableBuffer = DataFrameBuffer<DateTime>.GetMutableBuffer(buffer);
var mutableSpan = mutableBuffer.Span;
var readOnlySpan = buffer.ReadOnlySpan;
for (int i = 0; i < readOnlySpan.Length; i++)
{
var val = readOnlySpan[i];

if (val > ret)
{
ret = val;
}

mutableSpan[i] = ret;
}
column.Buffers[b] = mutableBuffer;
}
}

/// <summary>
/// Overwrites each visited row with the running maximum of the rows visited so far,
/// in the order in which <paramref name="rows"/> yields them.
/// </summary>
/// <param name="column">Container whose buffers are read and overwritten in place.</param>
/// <param name="rows">
/// Row indices to visit. An index is mapped to a buffer by integer division by
/// <c>ReadOnlyDataFrameBuffer&lt;DateTime&gt;.MaxCapacity</c>.
/// </param>
public void CumulativeMax(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows)
{
    long maxCapacity = ReadOnlyDataFrameBuffer<DateTime>.MaxCapacity;
    long minRange = 0;
    long maxRange = maxCapacity;

    var mutableBuffer = DataFrameBuffer<DateTime>.GetMutableBuffer(column.Buffers[0]);
    // Store the mutable buffer back into the column, as the whole-column overload does,
    // so the writes below are not lost if GetMutableBuffer handed back a different
    // instance than the one stored in the column.
    // NOTE(review): confirm GetMutableBuffer's copy behaviour.
    column.Buffers[0] = mutableBuffer;
    var span = mutableBuffer.Span;

    var ret = default(DateTime);
    // The running maximum is seeded from the first visited row rather than from default.
    bool seeded = false;

    // foreach disposes the enumerator, which the hand-rolled MoveNext loop did not.
    foreach (long rowIndex in rows)
    {
        long row = rowIndex;
        if (row < minRange || row >= maxRange)
        {
            // The row lives in a different buffer: switch to it and recompute its range.
            int bufferIndex = (int)(row / maxCapacity);
            mutableBuffer = DataFrameBuffer<DateTime>.GetMutableBuffer(column.Buffers[bufferIndex]);
            column.Buffers[bufferIndex] = mutableBuffer;
            span = mutableBuffer.Span;
            minRange = checked(bufferIndex * maxCapacity);
            maxRange = checked((bufferIndex + 1) * maxCapacity);
        }
        row -= minRange;

        var val = span[(int)row];
        if (!seeded || val > ret)
        {
            ret = val;
            seeded = true;
        }

        span[(int)row] = ret;
    }
}

/// <summary>
/// Overwrites every element of the column with the running minimum of all elements
/// up to and including it, walking the buffers in order.
/// </summary>
/// <param name="column">Container whose buffers are read and overwritten in place.</param>
public void CumulativeMin(PrimitiveColumnContainer<DateTime> column)
{
    var ret = default(DateTime);
    // Seed lazily from the first element actually encountered. Reading
    // column.Buffers[0].ReadOnlySpan[0] up front throws when the column has no
    // buffers or its first buffer is empty; this way an empty column is a no-op.
    bool seeded = false;

    for (int b = 0; b < column.Buffers.Count; b++)
    {
        var buffer = column.Buffers[b];
        var mutableBuffer = DataFrameBuffer<DateTime>.GetMutableBuffer(buffer);
        var mutableSpan = mutableBuffer.Span;
        var readOnlySpan = buffer.ReadOnlySpan;
        for (int i = 0; i < readOnlySpan.Length; i++)
        {
            var val = readOnlySpan[i];

            if (!seeded || val < ret)
            {
                ret = val;
                seeded = true;
            }

            mutableSpan[i] = ret;
        }
        // Put the buffer that received the writes back into the column.
        column.Buffers[b] = mutableBuffer;
    }
}

/// <summary>
/// Overwrites each visited row with the running minimum of the rows visited so far,
/// in the order in which <paramref name="rows"/> yields them.
/// </summary>
/// <param name="column">Container whose buffers are read and overwritten in place.</param>
/// <param name="rows">
/// Row indices to visit. An index is mapped to a buffer by integer division by
/// <c>ReadOnlyDataFrameBuffer&lt;DateTime&gt;.MaxCapacity</c>.
/// </param>
public void CumulativeMin(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows)
{
    long maxCapacity = ReadOnlyDataFrameBuffer<DateTime>.MaxCapacity;
    long minRange = 0;
    long maxRange = maxCapacity;

    var mutableBuffer = DataFrameBuffer<DateTime>.GetMutableBuffer(column.Buffers[0]);
    // Store the mutable buffer back into the column, as the whole-column overload does,
    // so the writes below are not lost if GetMutableBuffer handed back a different
    // instance than the one stored in the column.
    // NOTE(review): confirm GetMutableBuffer's copy behaviour.
    column.Buffers[0] = mutableBuffer;
    var span = mutableBuffer.Span;

    var ret = default(DateTime);
    // The running minimum is seeded from the first visited row rather than from default.
    bool seeded = false;

    // foreach disposes the enumerator, which the hand-rolled MoveNext loop did not.
    foreach (long rowIndex in rows)
    {
        long row = rowIndex;
        if (row < minRange || row >= maxRange)
        {
            // The row lives in a different buffer: switch to it and recompute its range.
            int bufferIndex = (int)(row / maxCapacity);
            mutableBuffer = DataFrameBuffer<DateTime>.GetMutableBuffer(column.Buffers[bufferIndex]);
            column.Buffers[bufferIndex] = mutableBuffer;
            span = mutableBuffer.Span;
            minRange = checked(bufferIndex * maxCapacity);
            maxRange = checked((bufferIndex + 1) * maxCapacity);
        }
        row -= minRange;

        var val = span[(int)row];
        if (!seeded || val < ret)
        {
            ret = val;
            seeded = true;
        }

        span[(int)row] = ret;
    }
}

/// <summary>
/// Not supported for <see cref="DateTime"/> columns.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void CumulativeProduct(PrimitiveColumnContainer<DateTime> column)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns, with or without a row selection.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void CumulativeProduct(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void CumulativeSum(PrimitiveColumnContainer<DateTime> column)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns, with or without a row selection.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void CumulativeSum(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows)
    => throw new NotSupportedException();

/// <summary>
/// Finds the largest value across all buffers of the column.
/// </summary>
/// <param name="column">Container whose buffers are scanned; it is not modified.</param>
/// <param name="ret">Receives the largest value found.</param>
public void Max(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
{
    // Seed with the very first element; this indexing throws if there is no such element.
    DateTime latest = column.Buffers[0].ReadOnlySpan[0];

    for (int bufferIndex = 0; bufferIndex < column.Buffers.Count; bufferIndex++)
    {
        var values = column.Buffers[bufferIndex].ReadOnlySpan;
        foreach (DateTime candidate in values)
        {
            if (latest < candidate)
            {
                latest = candidate;
            }
        }
    }

    ret = latest;
}

/// <summary>
/// Finds the largest value among the rows selected by <paramref name="rows"/>.
/// </summary>
/// <param name="column">Container whose buffers are read; it is not modified.</param>
/// <param name="rows">
/// Row indices to inspect. An index is mapped to a buffer by integer division by
/// <c>ReadOnlyDataFrameBuffer&lt;DateTime&gt;.MaxCapacity</c>.
/// </param>
/// <param name="ret">
/// Receives the largest selected value, or <c>default(DateTime)</c> when
/// <paramref name="rows"/> yields nothing.
/// </param>
public void Max(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
{
    // default(DateTime) is DateTime.MinValue, so no real value can compare below the seed.
    ret = default;
    var readOnlySpan = column.Buffers[0].ReadOnlySpan;
    long maxCapacity = ReadOnlyDataFrameBuffer<DateTime>.MaxCapacity;
    long minRange = 0;
    long maxRange = maxCapacity;

    // foreach disposes the enumerator, which the hand-rolled MoveNext loop did not.
    foreach (long rowIndex in rows)
    {
        long row = rowIndex;
        if (row < minRange || row >= maxRange)
        {
            // The row lives in a different buffer: switch to it and recompute its range.
            int bufferIndex = (int)(row / maxCapacity);
            readOnlySpan = column.Buffers[bufferIndex].ReadOnlySpan;
            minRange = checked(bufferIndex * maxCapacity);
            maxRange = checked((bufferIndex + 1) * maxCapacity);
        }
        row -= minRange;

        var val = readOnlySpan[(int)row];

        if (val > ret)
        {
            ret = val;
        }
    }
}

/// <summary>
/// Finds the smallest value across all buffers of the column.
/// </summary>
/// <param name="column">Container whose buffers are scanned; it is not modified.</param>
/// <param name="ret">Receives the smallest value found.</param>
public void Min(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
{
    // Seed with the very first element; this indexing throws if there is no such element.
    DateTime earliest = column.Buffers[0].ReadOnlySpan[0];

    for (int bufferIndex = 0; bufferIndex < column.Buffers.Count; bufferIndex++)
    {
        var values = column.Buffers[bufferIndex].ReadOnlySpan;
        foreach (DateTime candidate in values)
        {
            if (candidate < earliest)
            {
                earliest = candidate;
            }
        }
    }

    ret = earliest;
}

/// <summary>
/// Finds the smallest value among the rows selected by <paramref name="rows"/>.
/// </summary>
/// <param name="column">Container whose buffers are read; it is not modified.</param>
/// <param name="rows">
/// Row indices to inspect. An index is mapped to a buffer by integer division by
/// <c>ReadOnlyDataFrameBuffer&lt;DateTime&gt;.MaxCapacity</c>.
/// </param>
/// <param name="ret">
/// Receives the smallest selected value, or <c>default(DateTime)</c> when
/// <paramref name="rows"/> yields nothing.
/// </param>
public void Min(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
{
    ret = default;
    var readOnlySpan = column.Buffers[0].ReadOnlySpan;
    long maxCapacity = ReadOnlyDataFrameBuffer<DateTime>.MaxCapacity;
    long minRange = 0;
    long maxRange = maxCapacity;

    // default(DateTime) is DateTime.MinValue, so comparing "val < ret" against that seed
    // can never succeed. Track whether a real value has been taken and seed from the
    // first visited row instead.
    bool seeded = false;

    // foreach disposes the enumerator, which the hand-rolled MoveNext loop did not.
    foreach (long rowIndex in rows)
    {
        long row = rowIndex;
        if (row < minRange || row >= maxRange)
        {
            // The row lives in a different buffer: switch to it and recompute its range.
            int bufferIndex = (int)(row / maxCapacity);
            readOnlySpan = column.Buffers[bufferIndex].ReadOnlySpan;
            minRange = checked(bufferIndex * maxCapacity);
            maxRange = checked((bufferIndex + 1) * maxCapacity);
        }
        row -= minRange;

        var val = readOnlySpan[(int)row];

        if (!seeded || val < ret)
        {
            ret = val;
            seeded = true;
        }
    }
}

/// <summary>
/// Not supported for <see cref="DateTime"/> columns.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void Product(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns, with or without a row selection.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void Product(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void Sum(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns, with or without a row selection.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void Sum(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
    => throw new NotSupportedException();

/// <summary>
/// Not supported for <see cref="DateTime"/> columns.
/// </summary>
/// <exception cref="NotSupportedException">Always thrown.</exception>
public void Round(PrimitiveColumnContainer<DateTime> column)
    => throw new NotSupportedException();

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ public static IPrimitiveColumnComputation<T> GetComputation<T>()
{
return (IPrimitiveColumnComputation<T>)new UShortComputation();
}
else if (typeof(T) == typeof(DateTime))
{
return (IPrimitiveColumnComputation<T>)new DateTimeComputation();
}

throw new NotSupportedException();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ namespace Microsoft.Data.Analysis
return (IPrimitiveColumnComputation<T>)new <#=type.ClassPrefix#>Computation();
}
<# } #>
else if (typeof(T) == typeof(DateTime))
{
return (IPrimitiveColumnComputation<T>)new DateTimeComputation();
}

throw new NotSupportedException();
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.Data.Analysis/Strings.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading