Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance selection of rows in DynamicTable #191

Merged
merged 13 commits into from
Nov 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 61 additions & 21 deletions src/hdmf/common/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ..utils import docval, getargs, ExtenderMeta, call_docval_func, popargs, pystr
from ..container import Container, Data
from collections import OrderedDict

from . import register_class

Expand Down Expand Up @@ -463,33 +464,60 @@ def create_region(self, **kwargs):
def __getitem__(self, key):
ret = None
if isinstance(key, tuple):
# index by row and column, return specific cell
# index by row and column --> return specific cell
arg1 = key[0]
arg2 = key[1]
if isinstance(arg2, str):
arg2 = self.__colids[arg2]
ret = self.__df_cols[arg2][arg1]
elif isinstance(key, str):
# index by one string --> return column
if key in self.__colids:
ret = self.__df_cols[self.__colids[key]]
elif key in self.__indices:
return self.__indices[key]
else:
raise KeyError(key)
else:
# index by int, list, or slice --> return pandas Dataframe consisting of one or more rows
# determine the key. If the key is an int, then turn it into a slice to reduce the number of cases below
arg = key
if isinstance(arg, str):
# index by one string, return column
if arg in self.__colids:
ret = self.__df_cols[self.__colids[arg]]
elif arg in self.__indices:
return self.__indices[arg]
else:
raise KeyError(arg)
elif np.issubdtype(type(arg), np.integer):
# index by int, return row
ret = tuple(col[arg] for col in self.__df_cols)
if np.issubdtype(type(arg), np.integer):
arg = np.s_[arg:(arg+1)]
# index with a python slice (or single integer) to select one or multiple rows
if isinstance(arg, slice):
data = OrderedDict()
for name in self.colnames:
col = self.__df_cols[self.__colids[name]]
if isinstance(col.data, (Dataset, np.ndarray)) and col.data.ndim > 1:
data[name] = [x for x in col[arg]]
else:
currdata = col[arg]
data[name] = currdata
id_index = self.id.data[arg]
if np.isscalar(id_index):
id_index = [id_index, ]
ret = pd.DataFrame(data, index=pd.Index(name=self.id.name, data=id_index), columns=self.colnames)
# index by a list of ints, return multiple rows
elif isinstance(arg, (tuple, list, np.ndarray)):
# index by a list of ints, return multiple rows
if isinstance(arg, np.ndarray):
if len(arg.shape) != 1:
raise ValueError("cannot index DynamicTable with multiple dimensions")
ret = list()
for i in arg:
ret.append(tuple(col[i] for col in self.__df_cols))
data = OrderedDict()
for name in self.colnames:
col = self.__df_cols[self.__colids[name]]
if isinstance(col.data, (Dataset, np.ndarray)) and col.data.ndim > 1:
data[name] = [x for x in col[arg]]
elif isinstance(col.data, np.ndarray):
data[name] = col[arg]
else:
data[name] = [col[i] for i in arg]
id_index = (self.id.data[arg]
if isinstance(self.id.data, np.ndarray)
else [self.id.data[i] for i in arg])
ret = pd.DataFrame(data, index=pd.Index(name=self.id.name, data=id_index), columns=self.colnames)
else:
raise KeyError("Key type not supported by DynamicTable %s" % str(type(arg)))

return ret

Expand All @@ -501,11 +529,15 @@ def get(self, key, default=None):
return self[key]
return default

def to_dataframe(self, exclude=set([])):
'''Produce a pandas DataFrame containing this table's data.
'''

data = {}
@docval({'name': 'exclude', 'type': set, 'doc': ' List of columns to exclude from the dataframe', 'default': None})
def to_dataframe(self, **kwargs):
"""
Produce a pandas DataFrame containing this table's data.
"""
exclude = popargs('exclude', kwargs)
if exclude is None:
exclude = set([])
data = OrderedDict()
for name in self.colnames:
if name in exclude:
continue
Expand Down Expand Up @@ -660,3 +692,11 @@ def __getitem__(self, key):
return self.table[self.data[key]]
else:
raise ValueError("unrecognized argument: '%s'" % key)

@property
def shape(self):
    """
    Shape of the selected table region, expressed as (num_rows, num_columns).

    :return: Two-element tuple: the number of entries in ``self.data`` followed by
             the number of entries in ``self.table.columns``
    """
    num_rows = len(self.data)
    num_columns = len(self.table.columns)
    return (num_rows, num_columns)
89 changes: 67 additions & 22 deletions tests/unit/common/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import pandas as pd
import numpy as np
from collections import OrderedDict


class TestDynamicTable(unittest.TestCase):
Expand Down Expand Up @@ -114,10 +115,40 @@ def test_getitem_row_num(self):
table = self.with_spec()
self.add_rows(table)
row = table[2]
self.assertEqual(row[0], 2)
self.assertEqual(row[1], 3)
self.assertEqual(row[2], 30.0)
self.assertEqual(row[3], 'bird')
self.assertTupleEqual(row.shape, (1, 3))
self.assertTupleEqual(tuple(row.iloc[0]), (3, 30.0, 'bird'))

def test_getitem_row_slice(self):
    """Slicing the table with ``1:3`` must give a DataFrame of shape (2, 3)."""
    table = self.with_spec()
    self.add_rows(table)
    selected = table[1:3]
    self.assertIsInstance(selected, pd.DataFrame)
    self.assertTupleEqual(selected.shape, (2, 3))
    # Second row of the selection is compared value by value
    second_row = tuple(selected.iloc[1])
    self.assertTupleEqual(second_row, (3, 30.0, 'bird'))

def test_getitem_row_slice_with_step(self):
    """Slicing the table with ``0:5:2`` must give a DataFrame of shape (3, 3).

    The last selected row is checked as a whole tuple, the same way the
    sibling row-selection tests do it.
    """
    table = self.with_spec()
    self.add_rows(table)
    rows = table[0:5:2]
    self.assertIsInstance(rows, pd.DataFrame)
    self.assertTupleEqual(rows.shape, (3, 3))
    # Compare the full row positionally instead of ``rows.iloc[2][0]``:
    # integer keys passed to ``Series[...]`` on a string-labelled Series rely on
    # a positional fallback that newer pandas versions deprecate.
    self.assertTupleEqual(tuple(rows.iloc[2]), (5, 50.0, 'lizard'))

def test_getitem_invalid_keytype(self):
    """Indexing the table with a float key must raise KeyError."""
    table = self.with_spec()
    self.add_rows(table)
    unsupported_key = 0.1
    with self.assertRaises(KeyError):
        _ = table[unsupported_key]

def test_getitem_col_select_and_row_slice(self):
    """A ``(row slice, column name)`` key must return the matching cells of that column."""
    table = self.with_spec()
    self.add_rows(table)
    values = table[1:3, 'bar']
    self.assertEqual(len(values), 2)
    # Check each selected cell in order
    for position, expected in enumerate((20.0, 30.0)):
        self.assertEqual(values[position], expected)

def test_getitem_column(self):
table = self.with_spec()
Expand All @@ -134,16 +165,22 @@ def test_getitem_list_idx(self):
self.add_rows(table)
row = table[[0, 2, 4]]
self.assertEqual(len(row), 3)
self.assertEqual(row[0], (0, 1, 10.0, 'cat'))
self.assertEqual(row[1], (2, 3, 30.0, 'bird'))
self.assertEqual(row[2], (4, 5, 50.0, 'lizard'))
self.assertTupleEqual(tuple(row.iloc[0]), (1, 10.0, 'cat'))
self.assertTupleEqual(tuple(row.iloc[1]), (3, 30.0, 'bird'))
self.assertTupleEqual(tuple(row.iloc[2]), (5, 50.0, 'lizard'))

def test_getitem_point_idx_colname(self):
    """A ``(row index, column name)`` key must return the single cell value."""
    table = self.with_spec()
    self.add_rows(table)
    cell = table[2, 'bar']
    expected = 30.0
    self.assertEqual(cell, expected)

def test_getitem_point_idx(self):
    """A single integer key must return a selection whose first row holds that row's values."""
    table = self.with_spec()
    self.add_rows(table)
    selection = table[2]
    first_row = tuple(selection.iloc[0])
    self.assertTupleEqual(first_row, (3, 30.0, 'bird'))

def test_getitem_point_idx_colidx(self):
table = self.with_spec()
self.add_rows(table)
Expand All @@ -162,11 +199,15 @@ def test_pandas_roundtrip(self):

def test_to_dataframe(self):
table = self.with_columns_and_data()
expected_df = pd.DataFrame({
'foo': [1, 2, 3, 4, 5],
'bar': [10.0, 20.0, 30.0, 40.0, 50.0],
'baz': ['cat', 'dog', 'bird', 'fish', 'lizard']
})
data = OrderedDict()
for name in table.colnames:
if name == 'foo':
data[name] = [1, 2, 3, 4, 5]
elif name == 'bar':
data[name] = [10.0, 20.0, 30.0, 40.0, 50.0]
elif name == 'baz':
data[name] = ['cat', 'dog', 'bird', 'fish', 'lizard']
expected_df = pd.DataFrame(data)
obtained_df = table.to_dataframe()
self.assertTrue(expected_df.equals(obtained_df))

Expand Down Expand Up @@ -225,17 +266,20 @@ def test_extra_columns(self):

def test_indexed_dynamic_table_region(self):
table = self.with_columns_and_data()

dynamic_table_region = DynamicTableRegion('dtr', [0, 1, 1], 'desc', table=table)
fetch_ids = [x[1] for x in dynamic_table_region[:3]]
self.assertEqual(fetch_ids, [1, 2, 2])
dynamic_table_region = DynamicTableRegion('dtr', [1, 2, 2], 'desc', table=table)
fetch_ids = dynamic_table_region[:3].index.values
self.assertListEqual(fetch_ids.tolist(), [1, 2, 2])

def test_dynamic_table_iteration(self):
table = self.with_columns_and_data()

dynamic_table_region = DynamicTableRegion('dtr', [0, 1, 2, 3, 4], 'desc', table=table)
for ii, item in enumerate(dynamic_table_region):
self.assertEqual(table[ii], item)
self.assertTrue(table[ii].equals(item))

def test_dynamic_table_region_shape(self):
    """A region built from five row indices must report ``shape == (5, 3)``."""
    table = self.with_columns_and_data()
    row_indices = [0, 1, 2, 3, 4]
    region = DynamicTableRegion('dtr', row_indices, 'desc', table=table)
    expected_shape = (5, 3)
    self.assertTupleEqual(region.shape, expected_shape)

def test_nd_array_to_df(self):
data = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
Expand All @@ -249,15 +293,16 @@ def test_id_search(self):
table = self.with_spec()
data = [{'foo': 1, 'bar': 10.0, 'baz': 'cat'},
{'foo': 2, 'bar': 20.0, 'baz': 'dog'},
{'foo': 3, 'bar': 30.0, 'baz': 'bird'},
{'foo': 3, 'bar': 30.0, 'baz': 'bird'}, # id=2
{'foo': 4, 'bar': 40.0, 'baz': 'fish'},
{'foo': 5, 'bar': 50.0, 'baz': 'lizard'}]
{'foo': 5, 'bar': 50.0, 'baz': 'lizard'} # id=4
]
for i in data:
table.add_row(i)
res = table[table.id == [2, 4]]
self.assertEqual(len(res), 2)
self.assertTupleEqual(res[0], (2, 3, 30.0, 'bird'))
self.assertTupleEqual(res[1], (4, 5, 50.0, 'lizard'))
self.assertTupleEqual(tuple(res.iloc[0]), (3, 30.0, 'bird'))
self.assertTupleEqual(tuple(res.iloc[1]), (5, 50.0, 'lizard'))


class TestDynamicTableRoundTrip(base.TestMapRoundTrip):
Expand Down
34 changes: 34 additions & 0 deletions tests/unit/utils_test/test_core_ShapeValidator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest

from hdmf.data_utils import ShapeValidatorResult, DataChunkIterator, assertEqualShape
from hdmf.common.table import DynamicTable, DynamicTableRegion, VectorData
import numpy as np


Expand Down Expand Up @@ -166,6 +167,39 @@ def test_DataChunkIterator_error_on_undetermined_axis(self):
self.assertTupleEqual(res.axes1, (0, 1))
self.assertTupleEqual(res.axes2, (0, 1))

def test_DynamicTableRegion_shape_validation(self):
    """A three-entry region over a three-column table must match the shape of a 3x3 array."""
    # (name, description, data) for every column of the test DynamicTable
    column_definitions = (
        ('foo', 'foo column', [1, 2, 3, 4, 5]),
        ('bar', 'bar column', [10.0, 20.0, 30.0, 40.0, 50.0]),
        ('baz', 'baz column', ['cat', 'dog', 'bird', 'fish', 'lizard']),
    )
    columns = []
    for col_name, col_description, col_data in column_definitions:
        columns.append(VectorData(name=col_name, description=col_description, data=col_data))
    dt = DynamicTable("with_columns_and_data", "a test table", columns=columns)
    # Region referencing three rows of the table (row 2 is referenced twice)
    dtr = DynamicTableRegion('dtr', [1, 2, 2], 'desc', table=dt)
    # The region's shape is compared against a plain 3x3 numpy array
    reference = np.arange(9).reshape(3, 3)
    res = assertEqualShape(dtr, reference)
    self.assertTrue(res.result)

def with_table_columns(self):
    """Build a DynamicTable whose columns are VectorData objects created from ``self.spec``.

    :return: The newly created DynamicTable
    """
    # NOTE(review): ``self.spec`` is not defined in the visible part of this
    # test class -- confirm it exists before relying on this helper.
    columns = []
    for column_kwargs in self.spec:
        columns.append(VectorData(**column_kwargs))
    return DynamicTable("with_table_columns", 'a test table', columns=columns)

def with_columns_and_data(self):
    """Placeholder helper: builds nothing and returns ``None``."""
    return None


class ShapeValidatorResultTests(unittest.TestCase):

Expand Down