@@ -204,9 +204,9 @@ cdef enum ColumnTypes:
204
204
205
205
# type the page_data types
206
206
cdef int page_meta_type = const.page_meta_type
207
- cdef int page_mix_types_0 = const.page_mix_types[0 ]
208
- cdef int page_mix_types_1 = const.page_mix_types[1 ]
209
207
cdef int page_data_type = const.page_data_type
208
+ cdef int page_mix_type = const.page_mix_type
209
+ cdef int page_type_mask = const.page_type_mask
210
210
cdef int subheader_pointers_offset = const.subheader_pointers_offset
211
211
212
212
@@ -219,7 +219,7 @@ cdef class Parser(object):
219
219
int64_t[:] column_types
220
220
uint8_t[:, :] byte_chunk
221
221
object [:, :] string_chunk
222
- char * cached_page
222
+ uint8_t * cached_page
223
223
int current_row_on_page_index
224
224
int current_page_block_count
225
225
int current_page_data_subheader_pointers_len
@@ -231,6 +231,7 @@ cdef class Parser(object):
231
231
int bit_offset
232
232
int subheader_pointer_length
233
233
int current_page_type
234
+ int current_page_deleted_rows_bitmap_offset
234
235
bint is_little_endian
235
236
const uint8_t[:] (* decompress)(int result_length,
236
237
const uint8_t[:] inbuff)
@@ -253,6 +254,7 @@ cdef class Parser(object):
253
254
self .subheader_pointer_length = self .parser._subheader_pointer_length
254
255
self .is_little_endian = parser.byte_order == " <"
255
256
self .column_types = np.empty(self .column_count, dtype = ' int64' )
257
+ self .current_page_deleted_rows_bitmap_offset = - 1
256
258
257
259
# page indicators
258
260
self .update_next_page()
@@ -309,10 +311,55 @@ cdef class Parser(object):
309
311
self .update_next_page()
310
312
return done
311
313
314
cdef int calculate_deleted_rows_bitmap_offset(self):
    """Return the offset of the deleted-rows bitmap within the current page.

    The parser exposes ``_current_page_deleted_rows_bitmap_offset``, which
    is added to the position just past the row values on the page. That
    position depends on the (masked) page type:

    * data page: ``bit_offset + subheader_pointers_offset`` followed by
      ``current_page_block_count`` rows of ``row_length`` bytes;
    * mix page: ``bit_offset + subheader_pointers_offset``, then the
      subheader pointers, then an alignment correction (the start of the
      row area modulo 8), then ``min(_mix_page_row_count, row_count)``
      rows of ``row_length`` bytes.

    Returns
    -------
    int
        The computed offset, or -1 when the parser reports no bitmap
        offset (``None``) or the page is neither a data nor a mix page.
    """
    cdef:
        int bitmap_delta, masked_type
        int pointers_size, base, padding
        int rows_on_page

    # No bitmap information for this page: use the -1 sentinel.
    if self.parser._current_page_deleted_rows_bitmap_offset is None:
        return -1

    bitmap_delta = self.parser._current_page_deleted_rows_bitmap_offset
    masked_type = self.current_page_type & page_type_mask

    # Both supported page layouts start with this common prefix.
    base = self.bit_offset + subheader_pointers_offset

    if masked_type == page_data_type:
        return (base +
                self.row_length * self.current_page_block_count +
                bitmap_delta)

    if masked_type == page_mix_type:
        pointers_size = (self.subheader_pointer_length *
                         self.current_page_subheaders_count)
        base += pointers_size
        # Same correction term that is applied when locating mix-page rows.
        padding = base % 8
        rows_on_page = min(self.parser._mix_page_row_count,
                           self.parser.row_count)
        return (base +
                padding +
                self.row_length * rows_on_page +
                bitmap_delta)

    # Any other page type: no bitmap location can be derived.
    return -1
358
+
312
359
cdef update_next_page(self ):
313
360
# update data for the current page
314
361
315
- self .cached_page = < char * > self .parser._cached_page
362
+ self .cached_page = < uint8_t * > self .parser._cached_page
316
363
self .current_row_on_page_index = 0
317
364
self .current_page_type = self .parser._current_page_type
318
365
self .current_page_block_count = self .parser._current_page_block_count
@@ -321,11 +368,29 @@ cdef class Parser(object):
321
368
self .current_page_subheaders_count = \
322
369
self .parser._current_page_subheaders_count
323
370
371
+ self .current_page_deleted_rows_bitmap_offset = \
372
+ self .calculate_deleted_rows_bitmap_offset()
373
+
374
cdef bint is_row_deleted(self, int row_number):
    """Return whether ``row_number`` is flagged in the deleted-rows bitmap.

    Parameters
    ----------
    row_number : int
        Index of the row on the currently cached page.

    Returns
    -------
    bint
        False when the page has no deleted-rows bitmap
        (``current_page_deleted_rows_bitmap_offset == -1``); otherwise
        whether the bit selected for ``row_number`` is set in the bitmap
        byte read from ``cached_page``.
    """
    cdef:
        # Typed as a C int so the pointer index below stays in C instead
        # of round-tripping through a Python object on every row.
        int row_idx
        unsigned char byte, row_bit

    if self.current_page_deleted_rows_bitmap_offset == -1:
        return 0

    # The bit within the byte is selected most-significant-bit first.
    # NOTE(review): the byte index uses (row_number + 1) // 8 while the bit
    # index uses row_number % 8, so row 7 is looked up in byte 1 rather
    # than byte 0 -- confirm against the on-disk bitmap layout.
    row_idx = (row_number + 1) // 8
    row_bit = 1 << (7 - (row_number % 8))

    byte = self.cached_page[
        self.current_page_deleted_rows_bitmap_offset + row_idx]

    return (byte & row_bit) != 0
387
+
324
388
cdef readline(self ):
325
389
326
390
cdef:
327
391
int offset, bit_offset, align_correction
328
392
int subheader_pointer_length, mn
393
+ int block_count
329
394
bint done, flag
330
395
331
396
bit_offset = self .bit_offset
@@ -340,7 +405,7 @@ cdef class Parser(object):
340
405
341
406
# Loop until a data row is read
342
407
while True :
343
- if self .current_page_type == page_meta_type:
408
+ if self .current_page_type & page_type_mask == page_meta_type:
344
409
flag = self .current_row_on_page_index >= \
345
410
self .current_page_data_subheader_pointers_len
346
411
if flag:
@@ -355,8 +420,7 @@ cdef class Parser(object):
355
420
current_subheader_pointer.offset,
356
421
current_subheader_pointer.length)
357
422
return False
358
- elif (self .current_page_type == page_mix_types_0 or
359
- self .current_page_type == page_mix_types_1):
423
+ elif self .current_page_type & page_type_mask == page_mix_type:
360
424
align_correction = (bit_offset + subheader_pointers_offset +
361
425
self .current_page_subheaders_count *
362
426
subheader_pointer_length)
@@ -365,21 +429,35 @@ cdef class Parser(object):
365
429
offset += subheader_pointers_offset
366
430
offset += (self .current_page_subheaders_count *
367
431
subheader_pointer_length)
368
- offset += self .current_row_on_page_index * self .row_length
369
- self .process_byte_array_with_data(offset,
370
- self .row_length)
432
+
433
+ # Skip past rows marked as deleted
371
434
mn = min (self .parser.row_count,
372
435
self .parser._mix_page_row_count)
436
+ while (self .is_row_deleted(self .current_row_on_page_index) and
437
+ self .current_row_on_page_index < mn):
438
+ self .current_row_on_page_index += 1
439
+
440
+ if self .current_row_on_page_index < mn:
441
+ offset += self .current_row_on_page_index * self .row_length
442
+ self .process_byte_array_with_data(offset, self .row_length)
373
443
if self .current_row_on_page_index == mn:
374
444
done = self .read_next_page()
375
445
if done:
376
446
return True
377
447
return False
378
- elif self .current_page_type & page_data_type == page_data_type:
379
- self .process_byte_array_with_data(
380
- bit_offset + subheader_pointers_offset +
381
- self .current_row_on_page_index * self .row_length,
382
- self .row_length)
448
+ elif self .current_page_type & page_type_mask == page_data_type:
449
+ block_count = self .current_page_block_count
450
+
451
+ # Skip past rows marked as deleted
452
+ while (self .is_row_deleted(self .current_row_on_page_index) and
453
+ self .current_row_on_page_index != block_count):
454
+ self .current_row_on_page_index += 1
455
+
456
+ if self .current_row_on_page_index < block_count:
457
+ self .process_byte_array_with_data(
458
+ bit_offset + subheader_pointers_offset +
459
+ self .current_row_on_page_index * self .row_length,
460
+ self .row_length)
383
461
flag = (self .current_row_on_page_index ==
384
462
self .current_page_block_count)
385
463
if flag:
0 commit comments