Skip to content

Commit fcb1699

Browse files
committed
BUG: Filter out deleted rows from sas7bdat files (pandas-dev#15963)
Sas7bdat files may contain rows that have been deleted but are still physically present in the file. If the page type has bit 7 (value 128) set, a bitmap follows the normal row data, with one bit set for each row that has been deleted. Use that bitmap to exclude deleted rows from the resulting DataFrame.
1 parent f65fa75 commit fcb1699

File tree

9 files changed

+10137
-29
lines changed

9 files changed

+10137
-29
lines changed

doc/source/whatsnew/v0.24.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,8 @@ I/O
760760
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
761761
- :func:`read_sas()` will correctly parse sas7bdat files whose data page types also have bit 7 set (so the page type is 128 + 256 = 384) (:issue:`16615`)
762762
- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`)
763+
- :func:`read_sas()` will not include rows in sas7bdat files that have been marked as deleted by SAS but are still present in the file. (:issue:`15963`)
764+
763765

764766
Plotting
765767
^^^^^^^^

pandas/io/sas/sas.pyx

+93-15
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,9 @@ cdef enum ColumnTypes:
204204

205205
# type the page_data types
206206
cdef int page_meta_type = const.page_meta_type
207-
cdef int page_mix_types_0 = const.page_mix_types[0]
208-
cdef int page_mix_types_1 = const.page_mix_types[1]
209207
cdef int page_data_type = const.page_data_type
208+
cdef int page_mix_type = const.page_mix_type
209+
cdef int page_type_mask = const.page_type_mask
210210
cdef int subheader_pointers_offset = const.subheader_pointers_offset
211211

212212

@@ -219,7 +219,7 @@ cdef class Parser(object):
219219
int64_t[:] column_types
220220
uint8_t[:, :] byte_chunk
221221
object[:, :] string_chunk
222-
char *cached_page
222+
uint8_t *cached_page
223223
int current_row_on_page_index
224224
int current_page_block_count
225225
int current_page_data_subheader_pointers_len
@@ -231,6 +231,7 @@ cdef class Parser(object):
231231
int bit_offset
232232
int subheader_pointer_length
233233
int current_page_type
234+
int current_page_deleted_rows_bitmap_offset
234235
bint is_little_endian
235236
const uint8_t[:] (*decompress)(int result_length,
236237
const uint8_t[:] inbuff)
@@ -253,6 +254,7 @@ cdef class Parser(object):
253254
self.subheader_pointer_length = self.parser._subheader_pointer_length
254255
self.is_little_endian = parser.byte_order == "<"
255256
self.column_types = np.empty(self.column_count, dtype='int64')
257+
self.current_page_deleted_rows_bitmap_offset = -1
256258

257259
# page indicators
258260
self.update_next_page()
@@ -309,10 +311,55 @@ cdef class Parser(object):
309311
self.update_next_page()
310312
return done
311313

314+
cdef int calculate_deleted_rows_bitmap_offset(self):
315+
"""Calculate where the deleted rows bitmap is located
316+
in the page. It is _current_page_deleted_rows_bitmap_offset's
317+
bytes away from the end of the row values"""
318+
319+
cdef:
320+
int deleted_rows_bitmap_offset, page_type
321+
int subheader_pointers_length, align_correction
322+
int row_count
323+
324+
if self.parser._current_page_deleted_rows_bitmap_offset is None:
325+
return -1
326+
327+
deleted_rows_bitmap_offset = \
328+
self.parser._current_page_deleted_rows_bitmap_offset
329+
330+
page_type = self.current_page_type
331+
subheader_pointers_length = \
332+
self.subheader_pointer_length * self.current_page_subheaders_count
333+
334+
if page_type & page_type_mask == page_data_type:
335+
return (
336+
self.bit_offset +
337+
subheader_pointers_offset +
338+
self.row_length * self.current_page_block_count +
339+
deleted_rows_bitmap_offset)
340+
elif page_type & page_type_mask == page_mix_type:
341+
align_correction = (
342+
self.bit_offset +
343+
subheader_pointers_offset +
344+
subheader_pointers_length
345+
) % 8
346+
row_count = min(self.parser._mix_page_row_count,
347+
self.parser.row_count)
348+
return (
349+
self.bit_offset +
350+
subheader_pointers_offset +
351+
subheader_pointers_length +
352+
align_correction +
353+
self.row_length * row_count +
354+
deleted_rows_bitmap_offset)
355+
else:
356+
# I have never seen this case.
357+
return -1
358+
312359
cdef update_next_page(self):
313360
# update data for the current page
314361

315-
self.cached_page = <char *>self.parser._cached_page
362+
self.cached_page = <uint8_t * >self.parser._cached_page
316363
self.current_row_on_page_index = 0
317364
self.current_page_type = self.parser._current_page_type
318365
self.current_page_block_count = self.parser._current_page_block_count
@@ -321,11 +368,29 @@ cdef class Parser(object):
321368
self.current_page_subheaders_count =\
322369
self.parser._current_page_subheaders_count
323370

371+
self.current_page_deleted_rows_bitmap_offset =\
372+
self.calculate_deleted_rows_bitmap_offset()
373+
374+
cdef bint is_row_deleted(self, int row_number):
375+
cdef:
376+
int row_disk
377+
unsigned char byte, row_bit
378+
if self.current_page_deleted_rows_bitmap_offset == -1:
379+
return 0
380+
row_idx = (row_number + 1) // 8
381+
row_bit = 1 << (7 - (row_number % 8))
382+
383+
byte = self.cached_page[
384+
self.current_page_deleted_rows_bitmap_offset + row_idx]
385+
386+
return byte & row_bit
387+
324388
cdef readline(self):
325389

326390
cdef:
327391
int offset, bit_offset, align_correction
328392
int subheader_pointer_length, mn
393+
int block_count
329394
bint done, flag
330395

331396
bit_offset = self.bit_offset
@@ -340,7 +405,7 @@ cdef class Parser(object):
340405

341406
# Loop until a data row is read
342407
while True:
343-
if self.current_page_type == page_meta_type:
408+
if self.current_page_type & page_type_mask == page_meta_type:
344409
flag = self.current_row_on_page_index >=\
345410
self.current_page_data_subheader_pointers_len
346411
if flag:
@@ -355,8 +420,7 @@ cdef class Parser(object):
355420
current_subheader_pointer.offset,
356421
current_subheader_pointer.length)
357422
return False
358-
elif (self.current_page_type == page_mix_types_0 or
359-
self.current_page_type == page_mix_types_1):
423+
elif self.current_page_type & page_type_mask == page_mix_type:
360424
align_correction = (bit_offset + subheader_pointers_offset +
361425
self.current_page_subheaders_count *
362426
subheader_pointer_length)
@@ -365,21 +429,35 @@ cdef class Parser(object):
365429
offset += subheader_pointers_offset
366430
offset += (self.current_page_subheaders_count *
367431
subheader_pointer_length)
368-
offset += self.current_row_on_page_index * self.row_length
369-
self.process_byte_array_with_data(offset,
370-
self.row_length)
432+
433+
# Skip past rows marked as deleted
371434
mn = min(self.parser.row_count,
372435
self.parser._mix_page_row_count)
436+
while (self.is_row_deleted(self.current_row_on_page_index) and
437+
self.current_row_on_page_index < mn):
438+
self.current_row_on_page_index += 1
439+
440+
if self.current_row_on_page_index < mn:
441+
offset += self.current_row_on_page_index * self.row_length
442+
self.process_byte_array_with_data(offset, self.row_length)
373443
if self.current_row_on_page_index == mn:
374444
done = self.read_next_page()
375445
if done:
376446
return True
377447
return False
378-
elif self.current_page_type & page_data_type == page_data_type:
379-
self.process_byte_array_with_data(
380-
bit_offset + subheader_pointers_offset +
381-
self.current_row_on_page_index * self.row_length,
382-
self.row_length)
448+
elif self.current_page_type & page_type_mask == page_data_type:
449+
block_count = self.current_page_block_count
450+
451+
# Skip past rows marked as deleted
452+
while (self.is_row_deleted(self.current_row_on_page_index) and
453+
self.current_row_on_page_index != block_count):
454+
self.current_row_on_page_index += 1
455+
456+
if self.current_row_on_page_index < block_count:
457+
self.process_byte_array_with_data(
458+
bit_offset + subheader_pointers_offset +
459+
self.current_row_on_page_index * self.row_length,
460+
self.row_length)
383461
flag = (self.current_row_on_page_index ==
384462
self.current_page_block_count)
385463
if flag:

pandas/io/sas/sas7bdat.py

+21-12
Original file line numberDiff line numberDiff line change
@@ -298,12 +298,12 @@ def _parse_metadata(self):
298298

299299
def _process_page_meta(self):
300300
self._read_page_header()
301-
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
302-
if self._current_page_type in pt:
301+
pt = [const.page_meta_type, const.page_amd_type, const.page_mix_type]
302+
page_type = self._current_page_type
303+
if page_type & const.page_type_mask in pt:
303304
self._process_page_metadata()
304-
is_data_page = self._current_page_type & const.page_data_type
305-
is_mix_page = self._current_page_type in const.page_mix_types
306-
return (is_data_page or is_mix_page
305+
pt = [const.page_mix_type, const.page_data_type]
306+
return (page_type & const.page_type_mask in pt
307307
or self._current_page_data_subheader_pointers != [])
308308

309309
def _read_page_header(self):
@@ -313,6 +313,12 @@ def _read_page_header(self):
313313
tx = const.block_count_offset + bit_offset
314314
self._current_page_block_count = self._read_int(
315315
tx, const.block_count_length)
316+
if self._current_page_type & const.page_has_deleted_rows_bitmap:
317+
tx = const.page_deleted_rows_bitmap_offset * self._int_length
318+
self._current_page_deleted_rows_bitmap_offset = self._read_int(
319+
tx, self._int_length)
320+
else:
321+
self._current_page_deleted_rows_bitmap_offset = None
316322
tx = const.subheader_count_offset + bit_offset
317323
self._current_page_subheaders_count = (
318324
self._read_int(tx, const.subheader_count_length))
@@ -420,6 +426,9 @@ def _process_rowsize_subheader(self, offset, length):
420426
offset + const.row_length_offset_multiplier * int_len, int_len)
421427
self.row_count = self._read_int(
422428
offset + const.row_count_offset_multiplier * int_len, int_len)
429+
self.rows_deleted_count = self._read_int(
430+
offset + const.rows_deleted_count_offset_multiplier * int_len,
431+
int_len)
423432
self.col_count_p1 = self._read_int(
424433
offset + const.col_count_p1_multiplier * int_len, int_len)
425434
self.col_count_p2 = self._read_int(
@@ -601,19 +610,20 @@ def _process_format_subheader(self, offset, length):
601610

602611
def read(self, nrows=None):
603612

613+
row_count = self.row_count - self.rows_deleted_count
604614
if (nrows is None) and (self.chunksize is not None):
605615
nrows = self.chunksize
606616
elif nrows is None:
607-
nrows = self.row_count
617+
nrows = row_count
608618

609619
if len(self._column_types) == 0:
610620
self.close()
611621
raise EmptyDataError("No columns to parse from file")
612622

613-
if self._current_row_in_file_index >= self.row_count:
623+
if self._current_row_in_file_index >= row_count:
614624
return None
615625

616-
m = self.row_count - self._current_row_in_file_index
626+
m = row_count - self._current_row_in_file_index
617627
if nrows > m:
618628
nrows = m
619629

@@ -647,12 +657,11 @@ def _read_next_page(self):
647657

648658
self._read_page_header()
649659
page_type = self._current_page_type
650-
if page_type == const.page_meta_type:
660+
if page_type & const.page_type_mask == const.page_meta_type:
651661
self._process_page_metadata()
652662

653-
is_data_page = page_type & const.page_data_type
654-
pt = [const.page_meta_type] + const.page_mix_types
655-
if not is_data_page and self._current_page_type not in pt:
663+
pt = [const.page_meta_type, const.page_mix_type, const.page_data_type]
664+
if page_type & const.page_type_mask not in pt:
656665
return self._read_next_page()
657666

658667
return False

pandas/io/sas/sas_constants.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
os_name_length = 16
4444
page_bit_offset_x86 = 16
4545
page_bit_offset_x64 = 32
46+
page_deleted_rows_bitmap_offset = 3
4647
subheader_pointer_length_x86 = 12
4748
subheader_pointer_length_x64 = 24
4849
page_type_offset = 0
@@ -52,18 +53,23 @@
5253
subheader_count_offset = 4
5354
subheader_count_length = 2
5455
page_meta_type = 0
56+
# If page_type has bit 7 set there may be deleted rows.
57+
# These are marked in a bitmap following the row data.
58+
page_has_deleted_rows_bitmap = 128
5559
page_data_type = 256
5660
page_amd_type = 1024
5761
page_metc_type = 16384
5862
page_comp_type = -28672
59-
page_mix_types = [512, 640]
63+
page_mix_type = 512
64+
page_type_mask = (page_data_type | page_mix_type | page_amd_type)
6065
subheader_pointers_offset = 8
6166
truncated_subheader_id = 1
6267
compressed_subheader_id = 4
6368
compressed_subheader_type = 1
6469
text_block_size_length = 2
6570
row_length_offset_multiplier = 5
6671
row_count_offset_multiplier = 6
72+
rows_deleted_count_offset_multiplier = 7
6773
col_count_p1_multiplier = 9
6874
col_count_p2_multiplier = 10
6975
row_count_on_mix_page_offset_multiplier = 15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Date1,Date2,DateTime,DateTimeHi,Taiw
2+
1960-01-06,1960-01-04,1677-09-21 00:12:44,1677-09-21 00:12:43.145225525,1912-01-01
3+
1960-01-03,1960-01-05,2262-04-11 23:47:16,1960-01-01 00:00:00.000000000,1960-01-02
4+
1960-01-06,1960-01-04,1677-09-21 00:12:44,2262-04-11 23:47:16.854774475,1912-01-01
Binary file not shown.

0 commit comments

Comments
 (0)