Skip to content

Commit 2619307

Browse files
fatemehp and pitrou authored
PARQUET-2179: [C++][Parquet] Add a test for skipping repeated fields (#14366)
Add a test for `TypedColumnReader::Skip` with repeated values to make it clear that we are skipping values and not records. Also, add some comments to the existing test for Skip of non-repeated values.

Lead-authored-by: Fatemah Panahi <panahi@google.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Co-authored-by: Fatemah Panahi <fatemehp@users.noreply.github.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent ebda85f commit 2619307

File tree

1 file changed

+64
-5
lines changed

1 file changed

+64
-5
lines changed

cpp/src/parquet/column_reader_test.cc

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,8 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRepeated) {
260260
ASSERT_NO_FATAL_FAILURE(ExecuteDict(num_pages, levels_per_page, &descr));
261261
}
262262

263-
TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
263+
// Tests skipping around page boundaries.
264+
TEST_F(TestPrimitiveReader, TestSkipAroundPageBoundries) {
264265
int levels_per_page = 100;
265266
int num_pages = 5;
266267
max_def_level_ = 0;
@@ -289,10 +290,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
289290
values_.begin() + static_cast<int>(2.5 * static_cast<double>(levels_per_page)));
290291
ASSERT_TRUE(vector_equal(sub_values, vresult));
291292

292-
// 2) skip_size == page_size (skip across two pages)
293+
// 2) skip_size == page_size (skip across two pages from page 2.5 to 3.5)
293294
levels_skipped = reader->Skip(levels_per_page);
294295
ASSERT_EQ(levels_per_page, levels_skipped);
295-
// Read half a page
296+
// Read half a page (page 3.5 to 4)
296297
reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
297298
&values_read);
298299
sub_values.clear();
@@ -303,10 +304,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
303304
ASSERT_TRUE(vector_equal(sub_values, vresult));
304305

305306
// 3) skip_size < page_size (skip limited to a single page)
306-
// Skip half a page
307+
// Skip half a page (page 4 to 4.5)
307308
levels_skipped = reader->Skip(levels_per_page / 2);
308309
ASSERT_EQ(0.5 * levels_per_page, levels_skipped);
309-
// Read half a page
310+
// Read half a page (page 4.5 to 5)
310311
reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
311312
&values_read);
312313
sub_values.clear();
@@ -316,13 +317,71 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
316317
values_.end());
317318
ASSERT_TRUE(vector_equal(sub_values, vresult));
318319

320+
// 4) skip_size = 0
321+
levels_skipped = reader->Skip(0);
322+
ASSERT_EQ(0, levels_skipped);
323+
324+
// 5) Skip past the end page. There are 5 pages and we have either skipped
325+
// or read all of them, so there is nothing left to skip.
326+
levels_skipped = reader->Skip(10);
327+
ASSERT_EQ(0, levels_skipped);
328+
319329
values_.clear();
320330
def_levels_.clear();
321331
rep_levels_.clear();
322332
pages_.clear();
323333
reader_.reset();
324334
}
325335

336+
// Skip with repeated field. This test makes it clear that we are skipping
337+
// values and not records.
338+
TEST_F(TestPrimitiveReader, TestSkipRepeatedField) {
339+
// Example schema: message M { repeated int32 b = 1 }
340+
max_def_level_ = 1;
341+
max_rep_level_ = 1;
342+
NodePtr type = schema::Int32("b", Repetition::REPEATED);
343+
const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);
344+
// Example rows: {}, {[10, 10]}, {[20, 20, 20]}
345+
std::vector<int32_t> values = {10, 10, 20, 20, 20};
346+
std::vector<int16_t> def_levels = {0, 1, 1, 1, 1, 1};
347+
std::vector<int16_t> rep_levels = {0, 0, 1, 0, 1, 1};
348+
num_values_ = static_cast<int>(def_levels.size());
349+
std::shared_ptr<DataPageV1> page = MakeDataPage<Int32Type>(
350+
&descr, values, num_values_, Encoding::PLAIN, /*indices=*/{},
351+
/*indices_size=*/0, def_levels, max_def_level_, rep_levels, max_rep_level_);
352+
353+
pages_.push_back(std::move(page));
354+
355+
InitReader(&descr);
356+
Int32Reader* reader = static_cast<Int32Reader*>(reader_.get());
357+
358+
// Vectors to hold read values, definition levels, and repetition levels.
359+
std::vector<int32_t> read_vals(4, -1);
360+
std::vector<int16_t> read_defs(4, -1);
361+
std::vector<int16_t> read_reps(4, -1);
362+
363+
// Skip two levels.
364+
int64_t levels_skipped = reader->Skip(2);
365+
ASSERT_EQ(2, levels_skipped);
366+
367+
int64_t num_read_values = 0;
368+
// Read the next set of values
369+
reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
370+
&num_read_values);
371+
ASSERT_EQ(num_read_values, 4);
372+
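// Note that reading resumes in the middle of the record {[10, 10]}: the skip
// consumed the empty record and the first 10, so it skipped levels, not records.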
373+
ASSERT_TRUE(vector_equal({10, 20, 20, 20}, read_vals));
374+
ASSERT_TRUE(vector_equal({1, 1, 1, 1}, read_defs));
375+
ASSERT_TRUE(vector_equal({1, 0, 1, 1}, read_reps));
376+
377+
// No values remain in data page
378+
levels_skipped = reader->Skip(2);
379+
ASSERT_EQ(0, levels_skipped);
380+
reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
381+
&num_read_values);
382+
ASSERT_EQ(num_read_values, 0);
383+
}
384+
326385
// Page claims to have two values but only 1 is present.
327386
TEST_F(TestPrimitiveReader, TestReadValuesMissing) {
328387
max_def_level_ = 1;

0 commit comments

Comments
 (0)