@@ -169,7 +169,24 @@ where
169
169
170
170
// If repetition levels present, we don't know how much more to read
171
171
// in order to read the requested number of records, therefore read at least
172
- // MIN_BATCH_SIZE, otherwise read exactly what was requested
172
+ // MIN_BATCH_SIZE, otherwise read **exactly** what was requested. This helps
173
+ // to avoid a degenerate case where the buffers are never fully drained.
174
+ //
175
+ // Consider the scenario where the user is requesting batches of MIN_BATCH_SIZE.
176
+ //
177
+ // When transitioning across a row group boundary, this will read some remainder `r`
178
+ // from the current row group, before reading MIN_BATCH_SIZE from the next row group,
179
+ // leaving `MIN_BATCH_SIZE + r` in the buffer.
180
+ //
181
+ // The client will then only split off the `MIN_BATCH_SIZE` they actually wanted,
182
+ // leaving behind `r`. This will continue indefinitely.
183
+ //
184
+ // Aside from wasting cycles splitting and shuffling buffers unnecessarily, this
185
+ // prevents dictionary preservation from functioning correctly as the buffer
186
+ // will never be emptied, which is needed to allow a new dictionary to be registered.
187
+ //
188
+ // This degenerate case can still occur for repeated fields, but
189
+ // it is avoided for the more common case of a non-repeated field.
173
190
let batch_size = match & self . rep_levels {
174
191
Some ( _) => max ( num_records - records_read, MIN_BATCH_SIZE ) ,
175
192
None => num_records - records_read,
0 commit comments