BUG: Filter out deleted rows from sas7bdat files (pandas-dev#15963)
A sas7bdat file may contain rows that have actually been deleted.

If the page_type has bit 7 (value 128) set, a bitmap follows the
normal row data, with a bit set for each row that has been deleted.
Use that information to exclude deleted rows from the resulting
DataFrame.
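
For illustration, a minimal sketch of that bitmap test in plain Python
(assuming, as the parser below does, an MSB-first bitmap with one bit
per row, placed directly after the row data):

def is_deleted(bitmap: bytes, row: int) -> bool:
    # Bit 7 of byte 0 corresponds to row 0.
    return bool(bitmap[row // 8] & (1 << (7 - row % 8)))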
troels committed Sep 9, 2018
1 parent f05ebc8 commit b8696f9
Showing 9 changed files with 10,137 additions and 29 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -736,6 +736,8 @@ I/O
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
- :func:`read_sas` will correctly parse sas7bdat files with many columns (:issue:`22628`)
- :func:`read_sas` will correctly parse sas7bdat files with data page types that also have bit 7 set (so the page type is 128 + 256 = 384) (:issue:`16615`)
- :func:`read_sas` will not include rows in sas7bdat files that have been marked as deleted by SAS but are still present in the file (:issue:`15963`)


Plotting
^^^^^^^^
108 changes: 93 additions & 15 deletions pandas/io/sas/sas.pyx
@@ -204,9 +204,9 @@ cdef enum ColumnTypes:

# type the page_data types
cdef int page_meta_type = const.page_meta_type
cdef int page_mix_types_0 = const.page_mix_types[0]
cdef int page_mix_types_1 = const.page_mix_types[1]
cdef int page_data_type = const.page_data_type
cdef int page_mix_type = const.page_mix_type
cdef int page_type_mask = const.page_type_mask
cdef int subheader_pointers_offset = const.subheader_pointers_offset


@@ -219,7 +219,7 @@ cdef class Parser(object):
int64_t[:] column_types
uint8_t[:, :] byte_chunk
object[:, :] string_chunk
char *cached_page
uint8_t *cached_page
int current_row_on_page_index
int current_page_block_count
int current_page_data_subheader_pointers_len
@@ -231,6 +231,7 @@ cdef class Parser(object):
int bit_offset
int subheader_pointer_length
int current_page_type
int current_page_deleted_rows_bitmap_offset
bint is_little_endian
const uint8_t[:] (*decompress)(int result_length,
const uint8_t[:] inbuff)
@@ -253,6 +254,7 @@ cdef class Parser(object):
self.subheader_pointer_length = self.parser._subheader_pointer_length
self.is_little_endian = parser.byte_order == "<"
self.column_types = np.empty(self.column_count, dtype='int64')
self.current_page_deleted_rows_bitmap_offset = -1

# page indicators
self.update_next_page()
@@ -309,10 +311,55 @@ cdef class Parser(object):
self.update_next_page()
return done

cdef int calculate_deleted_rows_bitmap_offset(self):
"""Calculate where the deleted rows bitmap is located
in the page. It is _current_page_deleted_rows_bitmap_offset's
bytes away from the end of the row values"""
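# Worked example (hypothetical numbers): for an x64 file,
# bit_offset = 32 and subheader_pointers_offset = 8. On a pure
# data page holding block_count = 10 rows of row_length = 100
# bytes with a stored bitmap offset of 24, the bitmap therefore
# begins 32 + 8 + 10 * 100 + 24 = 1064 bytes into the page.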

cdef:
int deleted_rows_bitmap_offset, page_type
int subheader_pointers_length, align_correction
int row_count

if self.parser._current_page_deleted_rows_bitmap_offset is None:
return -1

deleted_rows_bitmap_offset = \
self.parser._current_page_deleted_rows_bitmap_offset

page_type = self.current_page_type
subheader_pointers_length = \
self.subheader_pointer_length * self.current_page_subheaders_count

if page_type & page_type_mask == page_data_type:
return (
self.bit_offset +
subheader_pointers_offset +
self.row_length * self.current_page_block_count +
deleted_rows_bitmap_offset)
elif page_type & page_type_mask == page_mix_type:
align_correction = (
self.bit_offset +
subheader_pointers_offset +
subheader_pointers_length
) % 8
row_count = min(self.parser._mix_page_row_count,
self.parser.row_count)
return (
self.bit_offset +
subheader_pointers_offset +
subheader_pointers_length +
align_correction +
self.row_length * row_count +
deleted_rows_bitmap_offset)
else:
# I have never seen this case.
return -1

cdef update_next_page(self):
# update data for the current page

self.cached_page = <char *>self.parser._cached_page
self.cached_page = <uint8_t *>self.parser._cached_page
self.current_row_on_page_index = 0
self.current_page_type = self.parser._current_page_type
self.current_page_block_count = self.parser._current_page_block_count
@@ -321,11 +368,29 @@ cdef class Parser(object):
self.current_page_subheaders_count =\
self.parser._current_page_subheaders_count

self.current_page_deleted_rows_bitmap_offset =\
self.calculate_deleted_rows_bitmap_offset()

cdef bint is_row_deleted(self, int row_number):
    cdef:
        int row_idx
        unsigned char byte, row_bit
    if self.current_page_deleted_rows_bitmap_offset == -1:
        return 0
    # The bitmap is packed MSB-first: bit 7 of byte 0 flags row 0.
    row_idx = row_number // 8
    row_bit = 1 << (7 - (row_number % 8))

    byte = self.cached_page[
        self.current_page_deleted_rows_bitmap_offset + row_idx]

    return byte & row_bit

cdef readline(self):

cdef:
int offset, bit_offset, align_correction
int subheader_pointer_length, mn
int block_count
bint done, flag

bit_offset = self.bit_offset
@@ -340,7 +405,7 @@

# Loop until a data row is read
while True:
if self.current_page_type == page_meta_type:
if self.current_page_type & page_type_mask == page_meta_type:
flag = self.current_row_on_page_index >=\
self.current_page_data_subheader_pointers_len
if flag:
@@ -355,8 +420,7 @@
current_subheader_pointer.offset,
current_subheader_pointer.length)
return False
elif (self.current_page_type == page_mix_types_0 or
self.current_page_type == page_mix_types_1):
elif self.current_page_type & page_type_mask == page_mix_type:
align_correction = (bit_offset + subheader_pointers_offset +
self.current_page_subheaders_count *
subheader_pointer_length)
@@ -365,21 +429,35 @@
offset += subheader_pointers_offset
offset += (self.current_page_subheaders_count *
subheader_pointer_length)
offset += self.current_row_on_page_index * self.row_length
self.process_byte_array_with_data(offset,
self.row_length)

# Skip past rows marked as deleted
mn = min(self.parser.row_count,
self.parser._mix_page_row_count)
while (self.current_row_on_page_index < mn and
       self.is_row_deleted(self.current_row_on_page_index)):
    self.current_row_on_page_index += 1

if self.current_row_on_page_index < mn:
offset += self.current_row_on_page_index * self.row_length
self.process_byte_array_with_data(offset, self.row_length)
if self.current_row_on_page_index == mn:
done = self.read_next_page()
if done:
return True
return False
elif self.current_page_type & page_data_type == page_data_type:
self.process_byte_array_with_data(
bit_offset + subheader_pointers_offset +
self.current_row_on_page_index * self.row_length,
self.row_length)
elif self.current_page_type & page_type_mask == page_data_type:
block_count = self.current_page_block_count

# Skip past rows marked as deleted
while (self.current_row_on_page_index < block_count and
       self.is_row_deleted(self.current_row_on_page_index)):
    self.current_row_on_page_index += 1

if self.current_row_on_page_index < block_count:
self.process_byte_array_with_data(
bit_offset + subheader_pointers_offset +
self.current_row_on_page_index * self.row_length,
self.row_length)
flag = (self.current_row_on_page_index ==
self.current_page_block_count)
if flag:
33 changes: 21 additions & 12 deletions pandas/io/sas/sas7bdat.py
@@ -298,12 +298,12 @@ def _parse_metadata(self):

def _process_page_meta(self):
self._read_page_header()
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
pt = [const.page_meta_type, const.page_amd_type, const.page_mix_type]
page_type = self._current_page_type
if page_type & const.page_type_mask in pt:
self._process_page_metadata()
is_data_page = self._current_page_type & const.page_data_type
is_mix_page = self._current_page_type in const.page_mix_types
return (is_data_page or is_mix_page
pt = [const.page_mix_type, const.page_data_type]
return (page_type & const.page_type_mask in pt
or self._current_page_data_subheader_pointers != [])

def _read_page_header(self):
@@ -313,6 +313,12 @@ def _read_page_header(self):
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_int(
tx, const.block_count_length)
if self._current_page_type & const.page_has_deleted_rows_bitmap:
tx = const.page_deleted_rows_bitmap_offset * self._int_length
self._current_page_deleted_rows_bitmap_offset = self._read_int(
tx, self._int_length)
else:
self._current_page_deleted_rows_bitmap_offset = None
tx = const.subheader_count_offset + bit_offset
self._current_page_subheaders_count = (
self._read_int(tx, const.subheader_count_length))
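
As a rough illustration, a sketch of how the header fields above could
be unpacked with the standard struct module (a sketch assuming a
64-bit little-endian file; the function name and defaults are
hypothetical, field offsets as in sas_constants.py):

import struct

def parse_page_header(page, bit_offset=32, int_length=8):
    # page_type, block_count and subheader_count are 2-byte fields
    # starting at the page's bit_offset.
    page_type, = struct.unpack_from("<h", page, bit_offset + 0)
    block_count, = struct.unpack_from("<H", page, bit_offset + 2)
    subheader_count, = struct.unpack_from("<H", page, bit_offset + 4)
    deleted_rows_bitmap_offset = None
    if page_type & 128:  # page_has_deleted_rows_bitmap
        # Read as a full-width int, page_deleted_rows_bitmap_offset
        # (3) words from the start of the page.
        deleted_rows_bitmap_offset, = struct.unpack_from(
            "<q", page, 3 * int_length)
    return (page_type, block_count, subheader_count,
            deleted_rows_bitmap_offset)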
@@ -420,6 +426,9 @@ def _process_rowsize_subheader(self, offset, length):
offset + const.row_length_offset_multiplier * int_len, int_len)
self.row_count = self._read_int(
offset + const.row_count_offset_multiplier * int_len, int_len)
self.rows_deleted_count = self._read_int(
offset + const.rows_deleted_count_offset_multiplier * int_len,
int_len)
self.col_count_p1 = self._read_int(
offset + const.col_count_p1_multiplier * int_len, int_len)
self.col_count_p2 = self._read_int(
@@ -601,19 +610,20 @@ def _process_format_subheader(self, offset, length):

def read(self, nrows=None):

row_count = self.row_count - self.rows_deleted_count
if (nrows is None) and (self.chunksize is not None):
nrows = self.chunksize
elif nrows is None:
nrows = self.row_count
nrows = row_count

if len(self._column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")

if self._current_row_in_file_index >= self.row_count:
if self._current_row_in_file_index >= row_count:
return None

m = self.row_count - self._current_row_in_file_index
m = row_count - self._current_row_in_file_index
if nrows > m:
nrows = m

@@ -647,12 +657,11 @@ def _read_next_page(self):

self._read_page_header()
page_type = self._current_page_type
if page_type == const.page_meta_type:
if page_type & const.page_type_mask == const.page_meta_type:
self._process_page_metadata()

is_data_page = page_type & const.page_data_type
pt = [const.page_meta_type] + const.page_mix_types
if not is_data_page and self._current_page_type not in pt:
pt = [const.page_meta_type, const.page_mix_type, const.page_data_type]
if page_type & const.page_type_mask not in pt:
return self._read_next_page()

return False
8 changes: 7 additions & 1 deletion pandas/io/sas/sas_constants.py
@@ -43,6 +43,7 @@
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
page_deleted_rows_bitmap_offset = 3
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
@@ -52,18 +53,23 @@
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
# If page_type has bit 7 (value 128) set, there may be deleted rows.
# These are marked in a bitmap following the row data.
page_has_deleted_rows_bitmap = 128
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
page_mix_type = 512
page_type_mask = (page_data_type | page_mix_type | page_amd_type)
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
rows_deleted_count_offset_multiplier = 7
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
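
A rough illustration in plain Python of why page_type_mask is needed:
bit 7 (128) only flags the deleted-rows bitmap, so it must be masked
off before comparing against the base page types:

page_type_mask = 256 | 512 | 1024  # page_data | page_mix | page_amd

for page_type in (256, 384, 512, 640):
    base = page_type & page_type_mask
    has_deleted_rows = bool(page_type & 128)
    print(page_type, base, has_deleted_rows)
# 384 = 256 + 128: a data page with a deleted-rows bitmap.
# 640 = 512 + 128: a mix page with one (formerly page_mix_types[1]).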
4 changes: 4 additions & 0 deletions pandas/tests/io/sas/data/datetime_deleted_rows.csv
@@ -0,0 +1,4 @@
Date1,Date2,DateTime,DateTimeHi,Taiw
1960-01-06,1960-01-04,1677-09-21 00:12:44,1677-09-21 00:12:43.145225525,1912-01-01
1960-01-03,1960-01-05,2262-04-11 23:47:16,1960-01-01 00:00:00.000000000,1960-01-02
1960-01-06,1960-01-04,1677-09-21 00:12:44,2262-04-11 23:47:16.854774475,1912-01-01
Binary file not shown.
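
The binary file is the new sas7bdat fixture this CSV describes; a
hypothetical round-trip check (paths assumed from the test-data
layout):

import pandas as pd

df = pd.read_sas(
    "pandas/tests/io/sas/data/datetime_deleted_rows.sas7bdat")
expected = pd.read_csv(
    "pandas/tests/io/sas/data/datetime_deleted_rows.csv")
assert len(df) == len(expected)  # deleted rows were filtered out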