diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 2854dbf5e655b..b627c938ecd92 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -79,6 +79,7 @@ Other enhancements - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) + .. _whatsnew_0182.api: API changes @@ -207,6 +208,7 @@ Bug Fixes - Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) +- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d350358081aa7..fcf5125d956c6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1314,12 +1314,20 @@ def __init__(self, store, s, func, where, nrows, start=None, stop=None, self.s = s self.func = func self.where = where - self.nrows = nrows or 0 - self.start = start or 0 - if stop is None: - stop = self.nrows - self.stop = min(self.nrows, stop) + # set start/stop if they are not set if we are a table + if self.s.is_table: + if nrows is None: + nrows = 0 + if start is None: + start = 0 + if stop is None: + stop = nrows + stop = min(nrows, stop) + + self.nrows = nrows + self.start = start + self.stop = stop self.coordinates = None if iterator or chunksize is not None: @@ -2303,14 +2311,23 @@ def f(values, freq=None, tz=None): return klass def validate_read(self, kwargs): - if kwargs.get('columns') is not None: + """ + remove table keywords from kwargs and return + raise if any keywords are passed which are not-None + """ + kwargs = copy.copy(kwargs) + + columns = kwargs.pop('columns', None) + if columns is not None: raise TypeError("cannot pass a column specification when reading " "a Fixed format store. this store must be " "selected in its entirety") - if kwargs.get('where') is not None: + where = kwargs.pop('where', None) + if where is not None: raise TypeError("cannot pass a where specification when reading " "from a Fixed format store. this store must be " "selected in its entirety") + return kwargs @property def is_exists(self): @@ -2329,11 +2346,11 @@ def get_attrs(self): def write(self, obj, **kwargs): self.set_attrs() - def read_array(self, key): + def read_array(self, key, start=None, stop=None): """ read an array for the specified node (off of group """ import tables node = getattr(self.group, key) - data = node[:] + data = node[start:stop] attrs = node._v_attrs transposed = getattr(attrs, 'transposed', False) @@ -2363,17 +2380,17 @@ def read_array(self, key): else: return ret - def read_index(self, key): + def read_index(self, key, **kwargs): variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key)) if variety == u('multi'): - return self.read_multi_index(key) + return self.read_multi_index(key, **kwargs) elif variety == u('block'): - return self.read_block_index(key) + return self.read_block_index(key, **kwargs) elif variety == u('sparseint'): - return self.read_sparse_intindex(key) + return self.read_sparse_intindex(key, **kwargs) elif variety == u('regular'): - _, index = self.read_index_node(getattr(self.group, key)) + _, index = self.read_index_node(getattr(self.group, key), **kwargs) return index else: # pragma: no cover raise TypeError('unrecognized index variety: %s' % variety) @@ -2411,19 +2428,19 @@ def write_block_index(self, key, index): self.write_array('%s_blengths' % key, index.blengths) setattr(self.attrs, '%s_length' % key, index.length) - def read_block_index(self, key): + def read_block_index(self, key, **kwargs): length = getattr(self.attrs, '%s_length' % key) - blocs = self.read_array('%s_blocs' % key) - blengths = self.read_array('%s_blengths' % key) + blocs = self.read_array('%s_blocs' % key, **kwargs) + blengths = self.read_array('%s_blengths' % key, **kwargs) return BlockIndex(length, blocs, blengths) def write_sparse_intindex(self, key, index): self.write_array('%s_indices' % key, index.indices) setattr(self.attrs, '%s_length' % key, index.length) - def read_sparse_intindex(self, key): + def read_sparse_intindex(self, key, **kwargs): length = getattr(self.attrs, '%s_length' % key) - indices = self.read_array('%s_indices' % key) + indices = self.read_array('%s_indices' % key, **kwargs) return IntIndex(length, indices) def write_multi_index(self, key, index): @@ -2448,7 +2465,7 @@ def write_multi_index(self, key, index): label_key = '%s_label%d' % (key, i) self.write_array(label_key, lab) - def read_multi_index(self, key): + def read_multi_index(self, key, **kwargs): nlevels = getattr(self.attrs, '%s_nlevels' % key) levels = [] @@ -2456,19 +2473,20 @@ def read_multi_index(self, key): names = [] for i in range(nlevels): level_key = '%s_level%d' % (key, i) - name, lev = self.read_index_node(getattr(self.group, level_key)) + name, lev = self.read_index_node(getattr(self.group, level_key), + **kwargs) levels.append(lev) names.append(name) label_key = '%s_label%d' % (key, i) - lab = self.read_array(label_key) + lab = self.read_array(label_key, **kwargs) labels.append(lab) return MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=True) - def read_index_node(self, node): - data = node[:] + def read_index_node(self, node, start=None, stop=None): + data = node[start:stop] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. if ('shape' in node._v_attrs and @@ -2607,9 +2625,9 @@ def write_array(self, key, value, items=None): class LegacyFixed(GenericFixed): - def read_index_legacy(self, key): + def read_index_legacy(self, key, start=None, stop=None): node = getattr(self.group, key) - data = node[:] + data = node[start:stop] kind = node._v_attrs.kind return _unconvert_index_legacy(data, kind, encoding=self.encoding) @@ -2617,7 +2635,7 @@ def read_index_legacy(self, key): class LegacySeriesFixed(LegacyFixed): def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index_legacy('index') values = self.read_array('values') return Series(values, index=index) @@ -2626,7 +2644,7 @@ def read(self, **kwargs): class LegacyFrameFixed(LegacyFixed): def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index_legacy('index') columns = self.read_index_legacy('columns') values = self.read_array('values') @@ -2645,9 +2663,9 @@ def shape(self): return None def read(self, **kwargs): - self.validate_read(kwargs) - index = self.read_index('index') - values = self.read_array('values') + kwargs = self.validate_read(kwargs) + index = self.read_index('index', **kwargs) + values = self.read_array('values', **kwargs) return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): @@ -2657,12 +2675,25 @@ def write(self, obj, **kwargs): self.attrs.name = obj.name -class SparseSeriesFixed(GenericFixed): +class SparseFixed(GenericFixed): + + def validate_read(self, kwargs): + """ + we don't support start, stop kwds in Sparse + """ + kwargs = super(SparseFixed, self).validate_read(kwargs) + if 'start' in kwargs or 'stop' in kwargs: + raise NotImplementedError("start and/or stop are not supported " + "in fixed Sparse reading") + return kwargs + + +class SparseSeriesFixed(SparseFixed): pandas_kind = u('sparse_series') attributes = ['name', 'fill_value', 'kind'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index('index') sp_values = self.read_array('sp_values') sp_index = self.read_index('sp_index') @@ -2681,12 +2712,12 @@ def write(self, obj, **kwargs): self.attrs.kind = obj.kind -class SparseFrameFixed(GenericFixed): +class SparseFrameFixed(SparseFixed): pandas_kind = u('sparse_frame') attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) columns = self.read_index('columns') sdict = {} for c in columns: @@ -2714,12 +2745,12 @@ def write(self, obj, **kwargs): self.write_index('columns', obj.columns) -class SparsePanelFixed(GenericFixed): +class SparsePanelFixed(SparseFixed): pandas_kind = u('sparse_panel') attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) items = self.read_index('items') sdict = {} @@ -2782,19 +2813,26 @@ def shape(self): except: return None - def read(self, **kwargs): - self.validate_read(kwargs) + def read(self, start=None, stop=None, **kwargs): + # start, stop applied to rows, so 0th axis only + + kwargs = self.validate_read(kwargs) + select_axis = self.obj_type()._get_block_manager_axis(0) axes = [] for i in range(self.ndim): - ax = self.read_index('axis%d' % i) + + _start, _stop = (start, stop) if i == select_axis else (None, None) + ax = self.read_index('axis%d' % i, start=_start, stop=_stop) axes.append(ax) items = axes[0] blocks = [] for i in range(self.nblocks): + blk_items = self.read_index('block%d_items' % i) - values = self.read_array('block%d_values' % i) + values = self.read_array('block%d_values' % i, + start=_start, stop=_stop) blk = make_block(values, placement=items.get_indexer(blk_items)) blocks.append(blk) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 5ee84ce97979a..4c72a47dbdf6e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4128,10 +4128,11 @@ def test_nan_selection_bug_4858(self): result = store.select('df', where='values>2.0') assert_frame_equal(result, expected) - def test_start_stop(self): + def test_start_stop_table(self): with ensure_clean_store(self.path) as store: + # table df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) store.append('df', df) @@ -4143,8 +4144,55 @@ def test_start_stop(self): # out of range result = store.select( 'df', [Term("columns=['A']")], start=30, stop=40) - assert(len(result) == 0) - assert(type(result) == DataFrame) + self.assertTrue(len(result) == 0) + expected = df.ix[30:40, ['A']] + tm.assert_frame_equal(result, expected) + + def test_start_stop_fixed(self): + + with ensure_clean_store(self.path) as store: + + # fixed, GH 8287 + df = DataFrame(dict(A=np.random.rand(20), + B=np.random.rand(20)), + index=pd.date_range('20130101', periods=20)) + store.put('df', df) + + result = store.select( + 'df', start=0, stop=5) + expected = df.iloc[0:5, :] + tm.assert_frame_equal(result, expected) + + result = store.select( + 'df', start=5, stop=10) + expected = df.iloc[5:10, :] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select( + 'df', start=30, stop=40) + expected = df.iloc[30:40, :] + tm.assert_frame_equal(result, expected) + + # series + s = df.A + store.put('s', s) + result = store.select('s', start=0, stop=5) + expected = s.iloc[0:5] + tm.assert_series_equal(result, expected) + + result = store.select('s', start=5, stop=10) + expected = s.iloc[5:10] + tm.assert_series_equal(result, expected) + + # sparse; not implemented + df = tm.makeDataFrame() + df.ix[3:5, 1:3] = np.nan + df.ix[8:10, -2] = np.nan + dfs = df.to_sparse() + store.put('dfs', dfs) + with self.assertRaises(NotImplementedError): + store.select('dfs', start=0, stop=5) def test_select_filter_corner(self):