Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor/apis [POC] #70

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 109 additions & 3 deletions RecordParser.Benchmark/VariableLengthReaderBenchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
using FlatFiles;
using FlatFiles.TypeMapping;
using RecordParser.Builders.Reader;
using RecordParser.Engines.Reader;
using RecordParser.Extensions.FileReader;
using System;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using TinyCsvParser;
Expand Down Expand Up @@ -79,7 +81,7 @@ public async Task Read_VariableLength_RecordParser()
await ProcessCSVFile(parser.Parse);
}

[Benchmark]
// [Benchmark]
[Arguments(false, true)]
[Arguments(true, true)]
public void Read_VariableLength_FullQuoted_RecordParser_Parallel(bool parallel, bool quoted)
Expand Down Expand Up @@ -120,7 +122,7 @@ public void Read_VariableLength_FullQuoted_RecordParser_Parallel(bool parallel,
throw new Exception($"read {i} records but expected {LimitRecord}");
}

[Benchmark]
// [Benchmark]
[Arguments(true, true)]
[Arguments(true, false)]
[Arguments(false, true)]
Expand Down Expand Up @@ -163,7 +165,7 @@ public void Read_VariableLength_RecordParser_Parallel(bool parallel, bool quoted
throw new Exception($"read {i} records but expected {LimitRecord}");
}

[Benchmark]
// [Benchmark]
[Arguments(true, true)]
[Arguments(true, false)]
[Arguments(false, true)]
Expand Down Expand Up @@ -210,6 +212,110 @@ Person PersonFactory(Func<int, string> getColumnValue)
}
}

/// <summary>
/// Benchmarks <c>GetRecordsFast</c>: each record is handed to <c>PersonFactory</c> as a
/// <see cref="TextFindHelper"/> and the fields are parsed straight from spans.
/// </summary>
/// <param name="parallel">Forwarded to <c>ParallelOptions.Enabled</c>.</param>
/// <param name="quoted">Forwarded to <c>ContainsQuotedFields</c>.</param>
/// <exception cref="Exception">The file ended before <c>LimitRecord</c> records were read.</exception>
[Benchmark]
[Arguments(true, true)]
[Arguments(true, false)]
[Arguments(false, true)]
[Arguments(false, false)]
public void Read_VariableLength_RecordParser_Fast(bool parallel, bool quoted)
{
    using var fileStream = File.OpenRead(PathSampleDataCSV);
    using var streamReader = new StreamReader(fileStream, Encoding.UTF8, true, BufferSize);

    var readOptions = new VariableLengthReaderOptions
    {
        HasHeader = false,
        ParallelOptions = new() { Enabled = parallel },
        ContainsQuotedFields = quoted,
    };

    var items = streamReader.GetRecordsFast(readOptions, ",", PersonFactory);

    var i = 0;
    foreach (var person in items)
    {
        if (i++ == LimitRecord) return;
    }

    if (i != LimitRecord)
        throw new Exception($"read {i} records but expected {LimitRecord}");

    // 'finder' is deliberately used as the by-value parameter it arrives as. Calling the
    // non-readonly GetField through an 'in' parameter makes the compiler take a defensive
    // copy per call, so the parse position would never advance and every field lookup
    // would rescan the line from the first field.
    static Person PersonFactory(TextFindHelper finder, int index)
    {
        return new Person()
        {
            id = Guid.Parse(finder.GetField(0)),
            name = finder.GetField(1).Trim().ToString(),
            age = int.Parse(finder.GetField(2)),
            birthday = DateTime.Parse(finder.GetField(3), CultureInfo.InvariantCulture),
            gender = Enum.Parse<Gender>(finder.GetField(4)),
            email = finder.GetField(5).Trim().ToString(),
            children = bool.Parse(finder.GetField(7))
        };
    }
}

/// <summary>
/// Benchmarks <c>GetRecordsFast2</c>: every column is materialized as a string first and
/// <c>PersonFactory</c> reads the columns through an index-based accessor.
/// </summary>
/// <param name="parallel">Forwarded to <c>ParallelOptions.Enabled</c>.</param>
/// <param name="quoted">Forwarded to <c>ContainsQuotedFields</c>.</param>
[Benchmark]
[Arguments(true, true)]
[Arguments(true, false)]
[Arguments(false, true)]
[Arguments(false, false)]
public void Read_VariableLength_RecordParser_Fast_2(bool parallel, bool quoted)
{
    using var fileStream = File.OpenRead(PathSampleDataCSV);
    using var streamReader = new StreamReader(fileStream, Encoding.UTF8, true, BufferSize);

    var readOptions = new VariableLengthReaderRawOptions
    {
        HasHeader = false,
        ParallelOptions = new() { Enabled = parallel },
        ContainsQuotedFields = quoted,
        ColumnCount = 8
    };

    var people = streamReader.GetRecordsFast2(readOptions, ",", PersonFactory);

    var count = 0;
    foreach (var _ in people)
    {
        // Stop as soon as the expected number of records has been consumed.
        if (count == LimitRecord)
        {
            return;
        }

        count++;
    }

    if (count != LimitRecord)
    {
        throw new Exception($"read {count} records but expected {LimitRecord}");
    }

    // Column 6 is intentionally not read; only the columns mapped below are used.
    Person PersonFactory(Func<int, string> getField) => new Person()
    {
        id = Guid.Parse(getField(0)),
        name = getField(1),
        age = int.Parse(getField(2)),
        birthday = DateTime.Parse(getField(3), CultureInfo.InvariantCulture),
        gender = Enum.Parse<Gender>(getField(4)),
        email = getField(5),
        children = bool.Parse(getField(7))
    };
}

#if TEST_ALL
[Benchmark]
#endif
Expand Down
32 changes: 32 additions & 0 deletions RecordParser.Test/TextFindHelperTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
using FluentAssertions;
using RecordParser.Engines.Reader;
using Xunit;

namespace RecordParser.Test
{
    public class TextFindHelperTest
    {
        /// <summary>
        /// Requests the fields in descending index order and checks that each one still
        /// maps to its own untrimmed slice of the record.
        /// </summary>
        [Fact]
        public void TextFindHelper_GetField_Unordered()
        {
            // Arrange

            const string record = "foo bar baz ; 2020.05.23 ; 0123.45; LightBlue";
            var finder = new TextFindHelper(record, ";", ('"', "\""));

            // Act

            var fourth = finder.GetField(3).ToString();
            var third = finder.GetField(2).ToString();
            var second = finder.GetField(1).ToString();
            var first = finder.GetField(0).ToString();

            // Assert

            first.Should().Be("foo bar baz ");
            second.Should().Be(" 2020.05.23 ");
            third.Should().Be(" 0123.45");
            fourth.Should().Be(" LightBlue");
        }
    }
}
27 changes: 24 additions & 3 deletions RecordParser/Engines/Reader/TextFindHelper.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
using System;
using RecordParser.Extensions.FileReader;
using System;
using System.Buffers;

namespace RecordParser.Engines.Reader
{
internal ref struct TextFindHelper
public ref struct TextFindHelper
{
private readonly ReadOnlySpan<char> line;
private readonly string delimiter;
Expand All @@ -16,6 +17,8 @@ internal ref struct TextFindHelper

private char[] buffer;

private Span<(int start, int count, bool quoted)> fields;

public TextFindHelper(ReadOnlySpan<char> source, string delimiter, (char ch, string str) quote)
{
this.line = source;
Expand All @@ -27,6 +30,7 @@ public TextFindHelper(ReadOnlySpan<char> source, string delimiter, (char ch, str
currentIndex = -1;
currentValue = default;
buffer = null;
fields = new (int start, int count, bool quoted)[50];
}

public void Dispose()
Expand All @@ -38,6 +42,20 @@ public void Dispose()
}
}

/// <summary>
/// Returns the raw text of the field at <paramref name="index"/>. The line is parsed
/// forward only as far as needed; positions of fields parsed so far are kept in
/// <c>fields</c>, so earlier fields can be requested again in any order.
/// </summary>
/// <param name="index">Zero-based position of the field within the line.</param>
/// <returns>A slice of the original line covering the field, without trimming.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is negative.</exception>
/// <exception cref="NotSupportedException">The requested field was parsed as a quoted field.</exception>
public ReadOnlySpan<char> GetField(int index)
{
    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), index, "Field index must be non-negative.");
    }

    // Advance one field at a time until the requested field has been parsed.
    while (currentIndex < index)
    {
        GetValue(currentIndex + 1);
    }

    var (start, count, quoted) = fields[index];
    if (quoted)
    {
        // Quoted fields are not yet served from the recorded positions.
        throw new NotSupportedException("Random access to quoted fields is not supported yet.");
    }

    return line.Slice(start, count);
}

public ReadOnlySpan<char> GetValue(int index)
{
if (index <= currentIndex)
Expand Down Expand Up @@ -71,7 +89,9 @@ private ReadOnlySpan<char> ParseChunk(bool match)

if (isQuotedField)
{
return ParseQuotedChuck(match);
var value = ParseQuotedChuck(match);
fields[currentIndex] = (scanned, position, true);
return value;
}

position = unlook.IndexOf(delimiter);
Expand All @@ -80,6 +100,7 @@ private ReadOnlySpan<char> ParseChunk(bool match)
position = line.Length - scanned;
}

fields[currentIndex] = (scanned, position, false);
return line.Slice(scanned, position);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
using RecordParser.Extensions.FileReader.RowReaders;
using RecordParser.Engines.Reader;
using RecordParser.Extensions.FileReader.RowReaders;
using RecordParser.Parsers;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using static RecordParser.Extensions.FileReader.ReaderCommon;

namespace RecordParser.Extensions.FileReader
Expand All @@ -17,6 +19,8 @@ public class VariableLengthReaderOptions

public static class VariableLengthReaderExtensions
{
// TODO: consider removing the 'reader' parameter and moving what we need from it
// into the 'options' parameter.
public static IEnumerable<T> GetRecords<T>(this TextReader stream, IVariableLengthReader<T> reader, VariableLengthReaderOptions options)
{
Func<IFL> func = options.ContainsQuotedFields
Expand All @@ -30,5 +34,89 @@ public static IEnumerable<T> GetRecords<T>(this TextReader stream, IVariableLeng
? GetRecordsParallel(parser, func, options.HasHeader, parallelOptions)
: GetRecordsSequential(parser, func, options.HasHeader);
}

// new overload that returns record itself
/// <summary>
/// Reads records from <paramref name="stream"/> and returns each one as-is, without
/// splitting it into fields.
/// </summary>
/// <param name="stream">Text source to read from.</param>
/// <param name="options">Header, quoting and parallelism settings.</param>
/// <param name="separator">Field separator; only passed to the row reader used when quoted fields are enabled.</param>
/// <returns>The records as memory blocks, produced by the parallel or sequential reader.</returns>
public static IEnumerable<ReadOnlyMemory<char>> GetRecords(this TextReader stream, VariableLengthReaderOptions options, string separator)
{
    Func<IFL> rowReaderFactory;
    if (options.ContainsQuotedFields)
    {
        rowReaderFactory = () => new RowByQuote(stream, Length, separator);
    }
    else
    {
        rowReaderFactory = () => new RowByLine(stream, Length);
    }

    // Identity projection: the record itself is the result.
    Func<ReadOnlyMemory<char>, int, ReadOnlyMemory<char>> passThrough = (record, _) => record;
    var parallelism = options.ParallelOptions ?? new();

    if (parallelism.Enabled)
    {
        return GetRecordsParallel(passThrough, rowReaderFactory, options.HasHeader, parallelism);
    }

    return GetRecordsSequential(passThrough, rowReaderFactory, options.HasHeader);
}


// New overload that hands the user something like 'finder.GetField' so they can do whatever they want with it.
// What about wrapping TextFindHelper in another struct that only exposes 'finder.GetField'?
/// <summary>
/// Reads records from <paramref name="stream"/> and lets <paramref name="getValue"/> build
/// each result from a <see cref="TextFindHelper"/> positioned over the record.
/// </summary>
/// <param name="stream">Text source to read from.</param>
/// <param name="options">Header, quoting and parallelism settings.</param>
/// <param name="separator">Field separator, used by the field finder and by the quoted row reader.</param>
/// <param name="getValue">Callback that receives the field finder and the index supplied by the reader.</param>
/// <returns>The values produced by <paramref name="getValue"/>, one per record.</returns>
public static IEnumerable<T> GetRecordsFast<T>(this TextReader stream, VariableLengthReaderOptions options, string separator, Parse<T> getValue)
{
    Func<IFL> rowReaderFactory;
    if (options.ContainsQuotedFields)
    {
        rowReaderFactory = () => new RowByQuote(stream, Length, separator);
    }
    else
    {
        rowReaderFactory = () => new RowByLine(stream, Length);
    }

    // The quote character is fixed to a double quote here.
    Func<ReadOnlyMemory<char>, int, T> project = (record, recordIndex) =>
        getValue(new TextFindHelper(record.Span, separator, ('"', "\"")), recordIndex);

    var parallelism = options.ParallelOptions ?? new();

    if (parallelism.Enabled)
    {
        return GetRecordsParallel(project, rowReaderFactory, options.HasHeader, parallelism);
    }

    return GetRecordsSequential(project, rowReaderFactory, options.HasHeader);
}

// New overload that hands the user something like 'finder.GetField' so they can do whatever they want with it.
/// <summary>
/// Reads records from <paramref name="stream"/>, materializes the first
/// <c>options.ColumnCount</c> fields of each record as strings, and lets
/// <paramref name="getValue"/> build the result through an index-based accessor.
/// </summary>
/// <param name="stream">Text source to read from.</param>
/// <param name="options">Header, quoting, parallelism, column count and optional string pool factory.</param>
/// <param name="separator">Field separator, used by the field finder and by the quoted row reader.</param>
/// <param name="getValue">
/// Callback that receives a column accessor. The accessor reads from a buffer that is
/// reused for later records, so it must not be kept after the callback returns.
/// </param>
/// <returns>The values produced by <paramref name="getValue"/>, one per record.</returns>
public static IEnumerable<T> GetRecordsFast2<T>(this TextReader stream, VariableLengthReaderRawOptions options, string separator, Func<Func<int, string>, T> getValue)
{
    // Number of reusable buffer slots shared between records (record index modulo this value).
    const int slotCount = 10;

    Func<IFL> func = options.ContainsQuotedFields
        ? () => new RowByQuote(stream, Length, separator)
        : () => new RowByLine(stream, Length);

    var caches = Enumerable.Range(0, slotCount).Select(_ =>
    {
        var buffer = new string[options.ColumnCount];
        return new
        {
            cache = options.StringPoolFactory?.Invoke(),
            buffer = buffer,
            func = new Func<int, string>(i => buffer[i])
        };
    }).ToArray();

    var parser = (ReadOnlyMemory<char> memory, int i) =>
    {
        // memento here?
        var fi = new TextFindHelper(memory.Span, separator, ('"', "\""));
        try
        {
            var cache = caches[i % caches.Length];

            // Records mapped to the same slot share one buffer, so filling it and
            // consuming it must happen under the same lock.
            lock (cache)
            {
                var pool = cache.cache;
                var columns = cache.buffer;

                for (int j = 0; j < columns.Length; j++)
                {
                    if (pool is null)
                        columns[j] = fi.GetValue(j).ToString();
                    else
                        columns[j] = pool(fi.GetValue(j));
                }

                return getValue(cache.func);
            }
        }
        finally
        {
            // NOTE(review): Dispose presumably returns the helper's pooled buffer - confirm.
            // All fields are already copied to strings, so nothing references it afterwards.
            fi.Dispose();
        }
    };

    var parallelOptions = options.ParallelOptions ?? new();

    return parallelOptions.Enabled
        ? GetRecordsParallel(parser, func, options.HasHeader, parallelOptions)
        : GetRecordsSequential(parser, func, options.HasHeader);
}
}

/// <summary>
/// Callback that builds a <typeparamref name="T"/> from one record.
/// </summary>
/// <typeparam name="T">Type produced for each record.</typeparam>
/// <param name="index">Field finder over the current record (despite its name, this is not a position).</param>
/// <param name="i">Integer forwarded unchanged from the reader's parser callback; presumably the record's ordinal - TODO confirm.</param>
/// <returns>The value built from the record.</returns>
public delegate T Parse<T>(TextFindHelper index, int i);
}