Skip to content

BUG: Bug in selection from a HDFStore with a fixed format and start and/or stop will now return the selected range #13267

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ Other enhancements
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)


.. _whatsnew_0182.api:

API changes
Expand Down Expand Up @@ -207,6 +208,7 @@ Bug Fixes
- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`)
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`)


- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)
Expand Down
120 changes: 79 additions & 41 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1314,12 +1314,20 @@ def __init__(self, store, s, func, where, nrows, start=None, stop=None,
self.s = s
self.func = func
self.where = where
self.nrows = nrows or 0
self.start = start or 0

if stop is None:
stop = self.nrows
self.stop = min(self.nrows, stop)
# set start/stop if they are not set if we are a table
if self.s.is_table:
if nrows is None:
nrows = 0
if start is None:
start = 0
if stop is None:
stop = nrows
stop = min(nrows, stop)

self.nrows = nrows
self.start = start
self.stop = stop

self.coordinates = None
if iterator or chunksize is not None:
Expand Down Expand Up @@ -2303,14 +2311,23 @@ def f(values, freq=None, tz=None):
return klass

def validate_read(self, kwargs):
if kwargs.get('columns') is not None:
"""
remove table keywords from kwargs and return
raise if any keywords are passed which are not-None
"""
kwargs = copy.copy(kwargs)

columns = kwargs.pop('columns', None)
if columns is not None:
raise TypeError("cannot pass a column specification when reading "
"a Fixed format store. this store must be "
"selected in its entirety")
if kwargs.get('where') is not None:
where = kwargs.pop('where', None)
if where is not None:
raise TypeError("cannot pass a where specification when reading "
"from a Fixed format store. this store must be "
"selected in its entirety")
return kwargs

@property
def is_exists(self):
Expand All @@ -2329,11 +2346,11 @@ def get_attrs(self):
def write(self, obj, **kwargs):
self.set_attrs()

def read_array(self, key):
def read_array(self, key, start=None, stop=None):
""" read an array for the specified node (off of group """
import tables
node = getattr(self.group, key)
data = node[:]
data = node[start:stop]
attrs = node._v_attrs

transposed = getattr(attrs, 'transposed', False)
Expand Down Expand Up @@ -2363,17 +2380,17 @@ def read_array(self, key):
else:
return ret

def read_index(self, key):
def read_index(self, key, **kwargs):
variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key))

if variety == u('multi'):
return self.read_multi_index(key)
return self.read_multi_index(key, **kwargs)
elif variety == u('block'):
return self.read_block_index(key)
return self.read_block_index(key, **kwargs)
elif variety == u('sparseint'):
return self.read_sparse_intindex(key)
return self.read_sparse_intindex(key, **kwargs)
elif variety == u('regular'):
_, index = self.read_index_node(getattr(self.group, key))
_, index = self.read_index_node(getattr(self.group, key), **kwargs)
return index
else: # pragma: no cover
raise TypeError('unrecognized index variety: %s' % variety)
Expand Down Expand Up @@ -2411,19 +2428,19 @@ def write_block_index(self, key, index):
self.write_array('%s_blengths' % key, index.blengths)
setattr(self.attrs, '%s_length' % key, index.length)

def read_block_index(self, key):
def read_block_index(self, key, **kwargs):
length = getattr(self.attrs, '%s_length' % key)
blocs = self.read_array('%s_blocs' % key)
blengths = self.read_array('%s_blengths' % key)
blocs = self.read_array('%s_blocs' % key, **kwargs)
blengths = self.read_array('%s_blengths' % key, **kwargs)
return BlockIndex(length, blocs, blengths)

def write_sparse_intindex(self, key, index):
self.write_array('%s_indices' % key, index.indices)
setattr(self.attrs, '%s_length' % key, index.length)

def read_sparse_intindex(self, key):
def read_sparse_intindex(self, key, **kwargs):
length = getattr(self.attrs, '%s_length' % key)
indices = self.read_array('%s_indices' % key)
indices = self.read_array('%s_indices' % key, **kwargs)
return IntIndex(length, indices)

def write_multi_index(self, key, index):
Expand All @@ -2448,27 +2465,28 @@ def write_multi_index(self, key, index):
label_key = '%s_label%d' % (key, i)
self.write_array(label_key, lab)

def read_multi_index(self, key):
def read_multi_index(self, key, **kwargs):
nlevels = getattr(self.attrs, '%s_nlevels' % key)

levels = []
labels = []
names = []
for i in range(nlevels):
level_key = '%s_level%d' % (key, i)
name, lev = self.read_index_node(getattr(self.group, level_key))
name, lev = self.read_index_node(getattr(self.group, level_key),
**kwargs)
levels.append(lev)
names.append(name)

label_key = '%s_label%d' % (key, i)
lab = self.read_array(label_key)
lab = self.read_array(label_key, **kwargs)
labels.append(lab)

return MultiIndex(levels=levels, labels=labels, names=names,
verify_integrity=True)

def read_index_node(self, node):
data = node[:]
def read_index_node(self, node, start=None, stop=None):
data = node[start:stop]
# If the index was an empty array write_array_empty() will
# have written a sentinel. Here we relace it with the original.
if ('shape' in node._v_attrs and
Expand Down Expand Up @@ -2607,17 +2625,17 @@ def write_array(self, key, value, items=None):

class LegacyFixed(GenericFixed):

def read_index_legacy(self, key):
def read_index_legacy(self, key, start=None, stop=None):
node = getattr(self.group, key)
data = node[:]
data = node[start:stop]
kind = node._v_attrs.kind
return _unconvert_index_legacy(data, kind, encoding=self.encoding)


class LegacySeriesFixed(LegacyFixed):

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
index = self.read_index_legacy('index')
values = self.read_array('values')
return Series(values, index=index)
Expand All @@ -2626,7 +2644,7 @@ def read(self, **kwargs):
class LegacyFrameFixed(LegacyFixed):

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
index = self.read_index_legacy('index')
columns = self.read_index_legacy('columns')
values = self.read_array('values')
Expand All @@ -2645,9 +2663,9 @@ def shape(self):
return None

def read(self, **kwargs):
self.validate_read(kwargs)
index = self.read_index('index')
values = self.read_array('values')
kwargs = self.validate_read(kwargs)
index = self.read_index('index', **kwargs)
values = self.read_array('values', **kwargs)
return Series(values, index=index, name=self.name)

def write(self, obj, **kwargs):
Expand All @@ -2657,12 +2675,25 @@ def write(self, obj, **kwargs):
self.attrs.name = obj.name


class SparseSeriesFixed(GenericFixed):
class SparseFixed(GenericFixed):

def validate_read(self, kwargs):
"""
we don't support start, stop kwds in Sparse
"""
kwargs = super(SparseFixed, self).validate_read(kwargs)
if 'start' in kwargs or 'stop' in kwargs:
raise NotImplementedError("start and/or stop are not supported "
"in fixed Sparse reading")
return kwargs


class SparseSeriesFixed(SparseFixed):
pandas_kind = u('sparse_series')
attributes = ['name', 'fill_value', 'kind']

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
index = self.read_index('index')
sp_values = self.read_array('sp_values')
sp_index = self.read_index('sp_index')
Expand All @@ -2681,12 +2712,12 @@ def write(self, obj, **kwargs):
self.attrs.kind = obj.kind


class SparseFrameFixed(GenericFixed):
class SparseFrameFixed(SparseFixed):
pandas_kind = u('sparse_frame')
attributes = ['default_kind', 'default_fill_value']

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
columns = self.read_index('columns')
sdict = {}
for c in columns:
Expand Down Expand Up @@ -2714,12 +2745,12 @@ def write(self, obj, **kwargs):
self.write_index('columns', obj.columns)


class SparsePanelFixed(GenericFixed):
class SparsePanelFixed(SparseFixed):
pandas_kind = u('sparse_panel')
attributes = ['default_kind', 'default_fill_value']

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
items = self.read_index('items')

sdict = {}
Expand Down Expand Up @@ -2782,19 +2813,26 @@ def shape(self):
except:
return None

def read(self, **kwargs):
self.validate_read(kwargs)
def read(self, start=None, stop=None, **kwargs):
# start, stop applied to rows, so 0th axis only

kwargs = self.validate_read(kwargs)
select_axis = self.obj_type()._get_block_manager_axis(0)

axes = []
for i in range(self.ndim):
ax = self.read_index('axis%d' % i)

_start, _stop = (start, stop) if i == select_axis else (None, None)
ax = self.read_index('axis%d' % i, start=_start, stop=_stop)
axes.append(ax)

items = axes[0]
blocks = []
for i in range(self.nblocks):

blk_items = self.read_index('block%d_items' % i)
values = self.read_array('block%d_values' % i)
values = self.read_array('block%d_values' % i,
start=_start, stop=_stop)
blk = make_block(values,
placement=items.get_indexer(blk_items))
blocks.append(blk)
Expand Down
54 changes: 51 additions & 3 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4128,10 +4128,11 @@ def test_nan_selection_bug_4858(self):
result = store.select('df', where='values>2.0')
assert_frame_equal(result, expected)

def test_start_stop(self):
def test_start_stop_table(self):

with ensure_clean_store(self.path) as store:

# table
df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
store.append('df', df)

Expand All @@ -4143,8 +4144,55 @@ def test_start_stop(self):
# out of range
result = store.select(
'df', [Term("columns=['A']")], start=30, stop=40)
assert(len(result) == 0)
assert(type(result) == DataFrame)
self.assertTrue(len(result) == 0)
expected = df.ix[30:40, ['A']]
tm.assert_frame_equal(result, expected)

def test_start_stop_fixed(self):

with ensure_clean_store(self.path) as store:

# fixed, GH 8287
df = DataFrame(dict(A=np.random.rand(20),
B=np.random.rand(20)),
index=pd.date_range('20130101', periods=20))
store.put('df', df)

result = store.select(
'df', start=0, stop=5)
expected = df.iloc[0:5, :]
tm.assert_frame_equal(result, expected)

result = store.select(
'df', start=5, stop=10)
expected = df.iloc[5:10, :]
tm.assert_frame_equal(result, expected)

# out of range
result = store.select(
'df', start=30, stop=40)
expected = df.iloc[30:40, :]
tm.assert_frame_equal(result, expected)

# series
s = df.A
store.put('s', s)
result = store.select('s', start=0, stop=5)
expected = s.iloc[0:5]
tm.assert_series_equal(result, expected)

result = store.select('s', start=5, stop=10)
expected = s.iloc[5:10]
tm.assert_series_equal(result, expected)

# sparse; not implemented
df = tm.makeDataFrame()
df.ix[3:5, 1:3] = np.nan
df.ix[8:10, -2] = np.nan
dfs = df.to_sparse()
store.put('dfs', dfs)
with self.assertRaises(NotImplementedError):
store.select('dfs', start=0, stop=5)

def test_select_filter_corner(self):

Expand Down