Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ParquetReader.RowGroups property #509

Merged
merged 7 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/Parquet.Test/ParquetReaderTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,18 @@ public async Task Metadata_file() {
Assert.True(reader.CustomMetadata.ContainsKey("geo"));
Assert.True(reader.CustomMetadata.ContainsKey("ARROW:schema"));
}

[Theory]
[InlineData("multi.page.parquet")]
[InlineData("multi.page.v2.parquet")]
public async Task RowGroups_EnumeratesRowGroups(string parquetFile) {
using(ParquetReader reader = await ParquetReader.CreateAsync(OpenTestFile(parquetFile), leaveStreamOpen: false)) {

Assert.Single(reader.RowGroups);
IParquetRowGroupReader rowGroup = reader.RowGroups.Single();
Assert.Equal(927861, rowGroup.RowCount);
}
}

class ReadableNonSeekableStream : DelegatedStream {
public ReadableNonSeekableStream(Stream master) : base(master) {
Expand Down
5 changes: 5 additions & 0 deletions src/Parquet/ParquetReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,11 @@ public ParquetRowGroupReader OpenRowGroupReader(int index) {
return _groupReaders[index];
}

/// <summary>
/// Collection of row group readers, fast random access and enumeration
/// </summary>
public IReadOnlyList<IParquetRowGroupReader> RowGroups => _groupReaders;

/// <summary>
/// Reads entire row group's data columns in one go.
/// </summary>
Expand Down
55 changes: 48 additions & 7 deletions src/Parquet/ParquetRowGroupReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,55 @@
using Parquet.Schema;

namespace Parquet {
/// <summary>
/// Operations available on a row group reader, omitting Dispose, which is
/// exposed on the implementing class for backward compatibility only.
/// </summary>
public interface IParquetRowGroupReader {
/// <summary>
/// Exposes raw metadata about this row group
/// </summary>
RowGroup RowGroup { get; }

/// <summary>
/// Gets the number of rows in this row group
/// </summary>
long RowCount { get; }

/// <summary>
/// Checks if this field exists in source schema
/// </summary>
bool ColumnExists(DataField field);

/// <summary>
/// Reads a column from this row group. Unlike writing, columns can be read in any order.
/// If the column is missing, an exception will be thrown.
/// </summary>
Task<DataColumn> ReadColumnAsync(DataField field, CancellationToken cancellationToken = default);

/// <summary>
/// Gets raw column chunk metadata for this field
/// </summary>
ColumnChunk? GetMetadata(DataField field);

/// <summary>
/// Get custom key-value metadata for a data field
/// </summary>
Dictionary<string, string> GetCustomMetadata(DataField field);

/// <summary>
/// Returns data column statistics for a particular data field
/// </summary>
/// <param name="field"></param>
/// <returns></returns>
/// <exception cref="ParquetException"></exception>
DataColumnStatistics? GetStatistics(DataField field);
}

/// <summary>
/// Reader for Parquet row groups
/// </summary>
public class ParquetRowGroupReader : IDisposable {
public class ParquetRowGroupReader : IDisposable, IParquetRowGroupReader {
private readonly RowGroup _rowGroup;
private readonly ThriftFooter _footer;
private readonly Stream _stream;
Expand Down Expand Up @@ -118,12 +163,8 @@ public Dictionary<string, string> GetCustomMetadata(DataField field) {
}

/// <summary>
///
/// Dispose isn't required, retained for backward compatibility
/// </summary>
public void Dispose() {
//don't need to dispose anything here, but for clarity we implement IDisposable and client must use it as we may add something
//important in it later
}

public void Dispose() { }
}
}
Loading