Skip to content

Commit

Permalink
enh: favor array operations in yield_filtered_array_stacks (~4x speed-up)
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Jan 5, 2024
1 parent 3ecb937 commit ae0711d
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 20 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
0.56.4
0.57.0
- fix: integer overflow in downsample_grid
- fix: removed unnecessary computation of hierarchy filter instance
- enh: cythonize downsample_grid (~10x speed-up)
Expand All @@ -9,6 +9,7 @@
- enh: globally use chunk sizes of ~1MiB when writing HDF5 data
(minor speedup since previously a chunk size of 100 events was used
for images and scalar features were written in one big chunk)
- enh: favor array operations in `yield_filtered_array_stacks` (~4x speed-up)
- ref: implement `__array__` methods for hierarchy event classes
- ref: implement `__array__` method for `H5MaskEvent`
- ref: new submodule for hierarchy format
Expand Down
60 changes: 43 additions & 17 deletions dclab/rtdc_dataset/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,10 +397,12 @@ def yield_filtered_array_stacks(data, indices):
Parameters
----------
data: np.ndarray or h5py.Dataset
The full, unfiltered input feature data.
The full, unfiltered input feature data. Must implement
the `shape` and `dtype` properties. If it implements the
`__array__` method, fast slicing is used.
indices: np.ndarray or list
The indices in data (first axis) that should be written
to the chunks returned by this generator.
The indices (integer values) for `data` (first axis), indicating
which elements should be returned by this generator.
Notes
-----
Expand All @@ -410,24 +412,48 @@ def yield_filtered_array_stacks(data, indices):
and that the events in `data` all have the same shape.
The dtype of the returned chunks is determined by the first
item in `data`.
This method works with sliceable (e.g. np.ndarray) and
non-sliceable (e.g. tdms-format-based images) input data. If the
input data is sliceable (which is determined by the availability
of the `__array__` method), then fast numpy slicing is used. If the
input data does not support slicing (`__array__` not defined), then
a slow iteration over `indices` is done.
In the slow iteration case, the returned array data are overwritten
in-place. If you need to retain a copy of the `yield`ed chunks,
apply `np.array(.., copy=True)` to the returned chunks.
"""
chunk_shape = RTDCWriter.get_best_nd_chunks(item_shape=data.shape[1:],
item_dtype=data.dtype)
chunk_size = chunk_shape[0]
# assemble filtered image stacks
chunk = np.zeros(chunk_shape, dtype=data.dtype)

jj = 0
for ii in indices:
chunk[jj] = data[ii]
if (jj + 1) % chunk_size == 0:
jj = 0
yield chunk
else:
jj += 1
# yield remainder
if jj:
yield chunk[:jj]

if hasattr(data, "__array__"):
# We have an array-like object and can do slicing with the indexing
# array. This speeds up chunk creation for e.g. the HDF5 file format
# where all data are present in an array-like fashion.
indices = np.array(indices)
stop = 0
for kk in range(len(indices) // chunk_size):
start = chunk_size * kk
stop = chunk_size * (kk + 1)
yield data[indices[start:stop]]
if stop < len(indices):
yield data[indices[stop:]]
else:
# assemble filtered image stacks
chunk = np.zeros(chunk_shape, dtype=data.dtype)
jj = 0
for ii in indices:
chunk[jj] = data[ii]
if (jj + 1) % chunk_size == 0:
jj = 0
yield chunk
else:
jj += 1
# yield remainder
if jj:
yield chunk[:jj]


def store_filtered_feature(rtdc_writer, feat, data, filtarr):
Expand Down
69 changes: 68 additions & 1 deletion tests/test_export.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import collections
import io
import os
from os.path import join
Expand All @@ -10,7 +11,8 @@

import dclab
from dclab import dfn, new_dataset, RTDCWriter
from dclab.rtdc_dataset.export import store_filtered_feature
from dclab.rtdc_dataset.export import (
store_filtered_feature, yield_filtered_array_stacks)

from helper_methods import example_data_dict, retrieve_data

Expand Down Expand Up @@ -661,3 +663,68 @@ def test_tsv_not_filtered():
edest = tempfile.mkdtemp()
f1 = join(edest, "test.tsv")
ds.export.tsv(f1, keys, filtered=False)


def test_yield_filtered_array_stacks_array():
    """Fast path: array input with `__array__` is chunked via slicing."""
    num_events = 547
    keep = np.ones(num_events, dtype=bool)
    keep[10] = False
    keep[412] = False

    data = np.random.random((num_events, 80, 320))
    indices = np.arange(num_events)[keep]
    # Collect every chunk produced by the generator; the fast path
    # yields fresh slices, so no copying is required here.
    stacked = list(yield_filtered_array_stacks(data, indices))
    assert len(stacked) == 55
    chunk_size = len(stacked[0])
    assert chunk_size == len(stacked[1])
    # Last chunk holds whatever remains after 54 full chunks.
    assert len(stacked[-1]) == 547 - 2 - 54 * chunk_size

    reassembled = np.concatenate(stacked, axis=0)
    assert len(reassembled) == 547 - 2
    assert reassembled[0].shape == data[0].shape
    assert np.all(data[indices] == reassembled)


def test_yield_filtered_array_stacks_list():
    """Slow path: list-like input without `__array__` is iterated."""
    # Custom list-like container that provides `shape` and `dtype`,
    # but deliberately no `__array__` (forces the slow iteration path).
    class ListFeat(collections.UserList):
        def __init__(self, data):
            super().__init__(data)

        @property
        def shape(self):
            return (len(self.data),) + tuple(self.data[0].shape)

        @property
        def dtype(self):
            return self.data[0].dtype

    num_events = 547
    keep = np.ones(num_events, dtype=bool)
    keep[10] = False
    keep[412] = False

    # Build the feature as a plain list of per-event arrays.
    frames = [np.random.random((80, 320)) for _ in range(num_events)]
    data = ListFeat(frames)
    assert data.shape == (547, 80, 320)
    assert data.dtype == frames[0].dtype
    assert np.all(data[0] == frames[0])

    indices = np.arange(num_events, dtype=int)[keep]
    # The slow path reuses its chunk buffer in-place, so each yielded
    # chunk must be copied before the next iteration.
    stacked = [np.array(chunk, copy=True)
               for chunk in yield_filtered_array_stacks(data, indices)]
    assert len(stacked) == 55
    assert len(stacked[0]) == len(stacked[1])
    # Last chunk holds whatever remains after 54 full chunks.
    assert len(stacked[-1]) == 547 - 2 - 54 * len(stacked[0])
    assert np.all(stacked[0][0] == frames[0])

    reassembled = np.concatenate(stacked, axis=0)
    assert len(reassembled) == 547 - 2
    assert reassembled[0].shape == data[0].shape
    assert np.all(np.array(frames)[indices] == reassembled)
2 changes: 1 addition & 1 deletion tests/test_rtdc_fmt_hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def test_defective_feature_volume():
def test_discouraged_array_dunder_childndarray():
ds = new_dataset(retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip"))
with pytest.warns(UserWarning, match="It may consume a lot of memory"):
maskbool = ds["mask"].__array__()
ds["mask"].__array__()


@pytest.mark.filterwarnings(
Expand Down

0 comments on commit ae0711d

Please sign in to comment.