diff --git a/cpp/src/parquet/file/metadata.cc b/cpp/src/parquet/file/metadata.cc index c1fd7679a236b..4e298a8f932d7 100644 --- a/cpp/src/parquet/file/metadata.cc +++ b/cpp/src/parquet/file/metadata.cc @@ -166,8 +166,9 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { public: - explicit RowGroupMetaDataImpl(const format::RowGroup* row_group) - : row_group_(row_group) {} + explicit RowGroupMetaDataImpl( + const format::RowGroup* row_group, const SchemaDescriptor* schema) + : row_group_(row_group), schema_(schema) {} ~RowGroupMetaDataImpl() {} inline int num_columns() const { return row_group_->columns.size(); } @@ -176,6 +177,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline int64_t total_byte_size() const { return row_group_->total_byte_size; } + inline const SchemaDescriptor* schema() const { return schema_; } + std::unique_ptr ColumnChunk(int i) { DCHECK(i < num_columns()) << "The file only has " << num_columns() << " columns, requested metadata for column: " << i; @@ -185,15 +188,18 @@ class RowGroupMetaData::RowGroupMetaDataImpl { private: const format::RowGroup* row_group_; + const SchemaDescriptor* schema_; }; -std::unique_ptr RowGroupMetaData::Make(const uint8_t* metadata) { - return std::unique_ptr(new RowGroupMetaData(metadata)); +std::unique_ptr RowGroupMetaData::Make( + const uint8_t* metadata, const SchemaDescriptor* schema) { + return std::unique_ptr(new RowGroupMetaData(metadata, schema)); } -RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata) +RowGroupMetaData::RowGroupMetaData( + const uint8_t* metadata, const SchemaDescriptor* schema) : impl_{std::unique_ptr(new RowGroupMetaDataImpl( - reinterpret_cast(metadata)))} {} + reinterpret_cast(metadata), schema))} {} RowGroupMetaData::~RowGroupMetaData() {} int RowGroupMetaData::num_columns() const { @@ -208,6 +214,10 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_size(); } +const SchemaDescriptor* RowGroupMetaData::schema() const { + return impl_->schema(); +} + std::unique_ptr RowGroupMetaData::ColumnChunk(int i) const { return impl_->ColumnChunk(i); } @@ -238,7 +248,7 @@ class FileMetaData::FileMetaDataImpl { << "The file only has " << num_row_groups() << " row groups, requested metadata for row group: " << i; return RowGroupMetaData::Make( - reinterpret_cast(&metadata_->row_groups[i])); + reinterpret_cast(&metadata_->row_groups[i]), &schema_); } const SchemaDescriptor* schema_descriptor() const { return &schema_; } diff --git a/cpp/src/parquet/file/metadata.h b/cpp/src/parquet/file/metadata.h index c35f82f48012e..78ea53bc133c3 100644 --- a/cpp/src/parquet/file/metadata.h +++ b/cpp/src/parquet/file/metadata.h @@ -75,7 +75,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { class PARQUET_EXPORT RowGroupMetaData { public: // API convenience to get a MetaData accessor - static std::unique_ptr Make(const uint8_t* metadata); + static std::unique_ptr Make( + const uint8_t* metadata, const SchemaDescriptor* schema); ~RowGroupMetaData(); @@ -83,10 +84,12 @@ class PARQUET_EXPORT RowGroupMetaData { int num_columns() const; int64_t num_rows() const; int64_t total_byte_size() const; + // Return const-pointer to make it clear that this object is not to be copied + const SchemaDescriptor* schema() const; std::unique_ptr ColumnChunk(int i) const; private: - explicit RowGroupMetaData(const uint8_t* metadata); + explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema); // PIMPL Idiom class RowGroupMetaDataImpl; std::unique_ptr impl_; diff --git a/cpp/src/parquet/file/reader-internal.cc b/cpp/src/parquet/file/reader-internal.cc index 5c1bc37cf270e..1cb91f0c305d1 100644 --- a/cpp/src/parquet/file/reader-internal.cc +++ b/cpp/src/parquet/file/reader-internal.cc @@ -145,6 +145,10 @@ const RowGroupMetaData* SerializedRowGroup::metadata() const { return row_group_metadata_.get(); } +const ReaderProperties* SerializedRowGroup::properties() const { + return &properties_; +} + std::unique_ptr SerializedRowGroup::GetColumnPageReader(int i) { // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i); @@ -195,8 +199,7 @@ std::shared_ptr SerializedFile::GetRowGroup(int i) { std::unique_ptr contents(new SerializedRowGroup( source_.get(), std::move(file_metadata_->RowGroup(i)), properties_)); - return std::make_shared( - file_metadata_->schema_descriptor(), std::move(contents), properties_.allocator()); + return std::make_shared(std::move(contents)); } const FileMetaData* SerializedFile::metadata() const { diff --git a/cpp/src/parquet/file/reader-internal.h b/cpp/src/parquet/file/reader-internal.h index 48e2daf8d7151..faedda084a49a 100644 --- a/cpp/src/parquet/file/reader-internal.h +++ b/cpp/src/parquet/file/reader-internal.h @@ -76,6 +76,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { virtual const RowGroupMetaData* metadata() const; + virtual const ReaderProperties* properties() const; + virtual std::unique_ptr GetColumnPageReader(int i); private: diff --git a/cpp/src/parquet/file/reader.cc b/cpp/src/parquet/file/reader.cc index 62481f73a7619..b593ea0cb266b 100644 --- a/cpp/src/parquet/file/reader.cc +++ b/cpp/src/parquet/file/reader.cc @@ -41,17 +41,18 @@ namespace parquet { // ---------------------------------------------------------------------- // RowGroupReader public API -RowGroupReader::RowGroupReader(const SchemaDescriptor* schema, - std::unique_ptr contents, MemoryAllocator* allocator) - : schema_(schema), contents_(std::move(contents)), allocator_(allocator) {} +RowGroupReader::RowGroupReader(std::unique_ptr contents) + : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < schema_->num_columns()) << "The RowGroup only has " << schema_->num_columns() - << "columns, requested column: " << i; - const ColumnDescriptor* descr = schema_->Column(i); + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; + const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); - return ColumnReader::Make(descr, std::move(page_reader), allocator_); + return ColumnReader::Make(descr, std::move(page_reader), + const_cast(contents_->properties())->allocator()); } // Returns the rowgroup metadata diff --git a/cpp/src/parquet/file/reader.h b/cpp/src/parquet/file/reader.h index baa3e30e9560b..c1ee9d4610137 100644 --- a/cpp/src/parquet/file/reader.h +++ b/cpp/src/parquet/file/reader.h @@ -38,15 +38,17 @@ class RandomAccessSource; class PARQUET_EXPORT RowGroupReader { public: - // Forward declare the PIMPL + // Forward declare a virtual class 'Contents' to aid dependency injection and more + // easily create test fixtures + // An implementation of the Contents class is defined in the .cc file struct Contents { virtual ~Contents() {} virtual std::unique_ptr GetColumnPageReader(int i) = 0; virtual const RowGroupMetaData* metadata() const = 0; + virtual const ReaderProperties* properties() const = 0; }; - RowGroupReader(const SchemaDescriptor* schema, std::unique_ptr contents, - MemoryAllocator* allocator); + explicit RowGroupReader(std::unique_ptr contents); // Returns the rowgroup metadata const RowGroupMetaData* metadata() const; @@ -56,17 +58,15 @@ class PARQUET_EXPORT RowGroupReader { std::shared_ptr Column(int i); private: - const SchemaDescriptor* schema_; - // PIMPL idiom - // This is declared in the .cc file so that we can hide compiled Thrift - // headers from the public API and also more easily create test fixtures. + // Holds a pointer to an instance of Contents implementation std::unique_ptr contents_; - MemoryAllocator* allocator_; }; class PARQUET_EXPORT ParquetFileReader { public: - // Forward declare the PIMPL + // Forward declare a virtual class 'Contents' to aid dependency injection and more + // easily create test fixtures + // An implementation of the Contents class is defined in the .cc file struct Contents { virtual ~Contents() {} // Perform any cleanup associated with the file contents @@ -99,9 +99,7 @@ class PARQUET_EXPORT ParquetFileReader { std::ostream& stream, std::list selected_columns, bool print_values = true); private: - // PIMPL idiom - // This is declared in the .cc file so that we can hide compiled Thrift - // headers from the public API and also more easily create test fixtures. + // Holds a pointer to an instance of Contents implementation std::unique_ptr contents_; }; diff --git a/cpp/src/parquet/file/writer.h b/cpp/src/parquet/file/writer.h index a5e6e99ffdfb7..0e649614602d6 100644 --- a/cpp/src/parquet/file/writer.h +++ b/cpp/src/parquet/file/writer.h @@ -35,6 +35,9 @@ class OutputStream; class PARQUET_EXPORT RowGroupWriter { public: + // Forward declare a virtual class 'Contents' to aid dependency injection and more + // easily create test fixtures + // An implementation of the Contents class is defined in the .cc file struct Contents { virtual int num_columns() const = 0; virtual int64_t num_rows() const = 0; @@ -75,8 +78,7 @@ class PARQUET_EXPORT RowGroupWriter { // Owned by the parent ParquetFileWriter const SchemaDescriptor* schema_; - // This is declared in the .cc file so that we can hide compiled Thrift - // headers from the public API and also more easily create test fixtures. + // Holds a pointer to an instance of Contents implementation std::unique_ptr contents_; MemoryAllocator* allocator_; @@ -84,6 +86,9 @@ class PARQUET_EXPORT RowGroupWriter { class PARQUET_EXPORT ParquetFileWriter { public: + // Forward declare a virtual class 'Contents' to aid dependency injection and more + // easily create test fixtures + // An implementation of the Contents class is defined in the .cc file struct Contents { virtual ~Contents() {} // Perform any cleanup associated with the file contents @@ -155,8 +160,7 @@ class PARQUET_EXPORT ParquetFileWriter { const ColumnDescriptor* column_schema(int i) const { return schema_->Column(i); } private: - // This is declared in the .cc file so that we can hide compiled Thrift - // headers from the public API and also more easily create test fixtures. + // Holds a pointer to an instance of Contents implementation std::unique_ptr contents_; // The SchemaDescriptor is provided by the Contents impl