Skip to content

Commit 2bba103

Browse files
committed
GH-48004: [C++][Parquet] Fix hang in ColumnReader benchmark
The benchmark was instantiating the ColumnReader with fewer values than it would later attempt to read. The `ReadBatch` method would then prematurely report 0 values read, and because the read loop advances by the number of values read, it would never make progress.
1 parent 430ad81 commit 2bba103

File tree

1 file changed

+15
-11
lines changed

1 file changed

+15
-11
lines changed

cpp/src/parquet/column_io_benchmark.cc

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
#include "arrow/io/memory.h"
2222
#include "arrow/testing/random.h"
2323
#include "arrow/util/config.h"
24+
#include "arrow/util/logging.h"
2425

2526
#include "parquet/column_reader.h"
2627
#include "parquet/column_writer.h"
@@ -167,27 +168,29 @@ std::shared_ptr<Int64Reader> BuildReader(std::shared_ptr<Buffer>& buffer,
167168

168169
static void BM_ReadInt64Column(::benchmark::State& state, Repetition::type repetition,
169170
Compression::type codec, Encoding::type encoding) {
170-
format::ColumnChunk thrift_metadata;
171+
const auto kNumValues = state.range(0);
172+
const auto kBatchSize = state.range(1);
171173

172174
::arrow::random::RandomArrayGenerator rgen(1337);
173-
auto values = rgen.Int64(state.range(0), 0, 1000000, 0);
175+
auto values = rgen.Int64(kNumValues, 0, 1000000, 0);
174176
const auto& int64_values = static_cast<const ::arrow::Int64Array&>(*values);
175177

176-
std::vector<int16_t> definition_levels(state.range(0), 1);
177-
std::vector<int16_t> repetition_levels(state.range(0), 0);
178+
std::vector<int16_t> definition_levels(kNumValues, 1);
179+
std::vector<int16_t> repetition_levels(kNumValues, 0);
178180
std::shared_ptr<ColumnDescriptor> schema = Int64Schema(repetition);
179181
std::shared_ptr<WriterProperties> properties = WriterProperties::Builder()
180182
.compression(codec)
181183
->encoding(encoding)
182184
->disable_dictionary()
183185
->build();
184186

187+
format::ColumnChunk thrift_metadata;
185188
auto metadata = ColumnChunkMetaDataBuilder::Make(
186189
properties, schema.get(), reinterpret_cast<uint8_t*>(&thrift_metadata));
187190

188191
auto stream = CreateOutputStream();
189192
std::shared_ptr<Int64Writer> writer = BuildWriter(
190-
state.range(0), stream, metadata.get(), schema.get(), properties.get(), codec);
193+
kNumValues, stream, metadata.get(), schema.get(), properties.get(), codec);
191194
writer->WriteBatch(int64_values.length(), definition_levels.data(),
192195
repetition_levels.data(), int64_values.raw_values());
193196
writer->Close();
@@ -196,16 +199,17 @@ static void BM_ReadInt64Column(::benchmark::State& state, Repetition::type repet
196199
int64_t stream_size = src->size();
197200
int64_t data_size = int64_values.length() * sizeof(int64_t);
198201

199-
std::vector<int64_t> values_out(state.range(1));
200-
std::vector<int16_t> definition_levels_out(state.range(1));
201-
std::vector<int16_t> repetition_levels_out(state.range(1));
202+
std::vector<int64_t> values_out(kBatchSize);
203+
std::vector<int16_t> definition_levels_out(kBatchSize);
204+
std::vector<int16_t> repetition_levels_out(kBatchSize);
202205
while (state.KeepRunning()) {
203206
std::shared_ptr<Int64Reader> reader =
204-
BuildReader(src, state.range(1), codec, schema.get());
207+
BuildReader(src, kNumValues, codec, schema.get());
205208
int64_t values_read = 0;
206-
for (int64_t i = 0; i < int64_values.length(); i += values_read) {
207-
reader->ReadBatch(values_out.size(), definition_levels_out.data(),
209+
for (int64_t i = 0; i < kNumValues; i += values_read) {
210+
reader->ReadBatch(kBatchSize, definition_levels_out.data(),
208211
repetition_levels_out.data(), values_out.data(), &values_read);
212+
ARROW_CHECK_NE(values_read, 0) << "Unexpected end of column";
209213
}
210214
}
211215
SetBytesProcessed(state, repetition);

0 commit comments

Comments
 (0)