diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index e029e13cdee6f..7094affaae16e 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5225,6 +5225,8 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) { } TEST(TestArrowReadWrite, WriteRecordBatchNotProduceEmptyRowGroup) { + // GH-39211: WriteRecordBatch should prevent from writing a empty row group + // in the end of the file. auto pool = ::arrow::default_memory_pool(); auto sink = CreateOutputStream(); // Limit the max number of rows in a row group to 2 @@ -5247,18 +5249,18 @@ TEST(TestArrowReadWrite, WriteRecordBatchNotProduceEmptyRowGroup) { &arrow_writer)); // NewBufferedRowGroup() is not called explicitly and it will be called // inside WriteRecordBatch(). - // Write 50 rows for two times + // Write 20 rows for two times for (int i = 0; i < 2; ++i) { auto record_batch = - gen.BatchOf({::arrow::field("a", ::arrow::int64())}, /*length=*/50); + gen.BatchOf({::arrow::field("a", ::arrow::int64())}, /*length=*/20); ASSERT_OK_NO_THROW(arrow_writer->WriteRecordBatch(*record_batch)); } ASSERT_OK_NO_THROW(arrow_writer->Close()); ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); auto fileMetadata = arrow_writer->metadata(); - EXPECT_EQ(50, fileMetadata->num_row_groups()); - for (int i = 0; i < 50; ++i) { + EXPECT_EQ(20, fileMetadata->num_row_groups()); + for (int i = 0; i < 20; ++i) { EXPECT_EQ(2, fileMetadata->RowGroup(i)->num_rows()); } } diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 1a3589ddcef9c..5238986c428d3 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -419,6 +419,7 @@ class FileWriterImpl : public FileWriter { // Max number of rows allowed in a row group. const int64_t max_row_group_length = this->properties().max_row_group_length(); + // Initialize a new buffered row group writer if necessary. if (row_group_writer_ == nullptr || !row_group_writer_->buffered() || row_group_writer_->num_rows() >= max_row_group_length) { RETURN_NOT_OK(NewBufferedRowGroup()); @@ -461,7 +462,7 @@ class FileWriterImpl : public FileWriter { RETURN_NOT_OK(WriteBatch(offset, batch_size)); offset += batch_size; - // Flush current row group if it is full. + // Flush current row group writer and create a new writer if it is full. if (row_group_writer_->num_rows() >= max_row_group_length && offset < batch.num_rows()) { RETURN_NOT_OK(NewBufferedRowGroup());