diff --git a/CHANGELOG b/CHANGELOG index e6c2316b..0c2de906 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,5 @@ 0.54.3 + - fix: replace lru_cache on instance methods to fix memory leak (#214) - fix: implement `__contains__` for DCOR logs and tables - enh: add requests timeout for DCOR data - enh: more caching of event size and shape for HDF5 format diff --git a/dclab/rtdc_dataset/core.py b/dclab/rtdc_dataset/core.py index 06e269a2..3d195365 100644 --- a/dclab/rtdc_dataset/core.py +++ b/dclab/rtdc_dataset/core.py @@ -38,6 +38,8 @@ def __init__(self, identifier=None, enable_basins=True): #: Dataset format (derived from class name) self.format = self.__class__.__name__.split("_")[-1].lower() + # Cache attribute used for __len__()-function + self._length = None self._polygon_filter_ids = [] # Events have the feature name as keys and contain nD ndarrays. self._events = {} @@ -135,15 +137,20 @@ def __iter__(self): yield col def __len__(self): + if self._length is None: + self._length = self._get_length() + return self._length + + def _get_length(self): # Try to get length from metadata. 
length = self.config["experiment"].get("event count") - if length: + if length is not None: return length # Try to get the length from the feature sizes - keys = list(self._events.keys()) + keys = list(self._events.keys()) or self.features_basin keys.sort() for kk in keys: - length = len(self._events[kk]) + length = len(self[kk]) if length: return length else: diff --git a/dclab/rtdc_dataset/fmt_dcor/logs.py b/dclab/rtdc_dataset/fmt_dcor/logs.py index 1772e766..ff57f158 100644 --- a/dclab/rtdc_dataset/fmt_dcor/logs.py +++ b/dclab/rtdc_dataset/fmt_dcor/logs.py @@ -1,9 +1,7 @@ -import functools - - class DCORLogs: def __init__(self, api): self.api = api + self._logs_cache = None def __contains__(self, key): return key in self.keys() @@ -18,6 +16,7 @@ def keys(self): return self._logs.keys() @property - @functools.lru_cache() def _logs(self): - return self.api.get(query="logs") + if self._logs_cache is None: + self._logs_cache = self.api.get(query="logs") + return self._logs_cache diff --git a/dclab/rtdc_dataset/fmt_dcor/tables.py b/dclab/rtdc_dataset/fmt_dcor/tables.py index adabb2e2..cf0b248f 100644 --- a/dclab/rtdc_dataset/fmt_dcor/tables.py +++ b/dclab/rtdc_dataset/fmt_dcor/tables.py @@ -1,11 +1,10 @@ -import functools - import numpy as np class DCORTables: def __init__(self, api): self.api = api + self._tables_cache = None def __contains__(self, key): return key in self.keys() @@ -20,17 +19,18 @@ def keys(self): return self._tables.keys() @property - @functools.lru_cache() def _tables(self): - table_data = self.api.get(query="tables") - # assemble the tables - tables = {} - for key in table_data: - columns, data = table_data[key] - ds_dt = np.dtype({'names': columns, - 'formats': [np.float64] * len(columns)}) - tab_data = np.asarray(data) - rec_arr = np.rec.array(tab_data, dtype=ds_dt) - tables[key] = rec_arr - - return tables + if self._tables_cache is None: + table_data = self.api.get(query="tables") + # assemble the tables + tables = {} + for key in 
table_data: + columns, data = table_data[key] + ds_dt = np.dtype({'names': columns, + 'formats': [np.float64] * len(columns)}) + tab_data = np.asarray(data) + rec_arr = np.rec.array(tab_data, dtype=ds_dt) + tables[key] = rec_arr + + self._tables_cache = tables + return self._tables_cache diff --git a/dclab/rtdc_dataset/fmt_hdf5/base.py b/dclab/rtdc_dataset/fmt_hdf5/base.py index c59af0da..913d4d7e 100644 --- a/dclab/rtdc_dataset/fmt_hdf5/base.py +++ b/dclab/rtdc_dataset/fmt_hdf5/base.py @@ -1,7 +1,6 @@ """RT-DC hdf5 format""" from __future__ import annotations -import functools import json import pathlib from typing import Any, BinaryIO, Dict @@ -118,14 +117,6 @@ def close(self): if b._ds is not None: b._ds.close() - @functools.lru_cache() - def __len__(self): - ec = self.h5file.get("experiment:event count") - if ec is not None: - return ec - else: - return super(RTDC_HDF5, self).__len__() - @property def _h5(self): warnings.warn("Access to the underlying HDF5 file is now public. " diff --git a/dclab/rtdc_dataset/fmt_hdf5/events.py b/dclab/rtdc_dataset/fmt_hdf5/events.py index 4f0cfbb7..e12cc05d 100644 --- a/dclab/rtdc_dataset/fmt_hdf5/events.py +++ b/dclab/rtdc_dataset/fmt_hdf5/events.py @@ -1,7 +1,6 @@ """RT-DC hdf5 format""" from __future__ import annotations -import functools import numbers import numpy as np @@ -17,6 +16,7 @@ def __init__(self, h5group, length=None): self.h5group = h5group # for hashing in util.obj2bytes self.identifier = (h5group.file.filename, h5group["0"].name) + self._length = None def __getitem__(self, key): if not isinstance(key, numbers.Integral): @@ -56,6 +56,7 @@ def __init__(self, h5): # datasets, we cache the wrapping classes in the `self._cached_events` # dictionary. 
self._cached_events = {} + self._defective_features = {} self._features_list = None @property @@ -97,15 +98,17 @@ def __iter__(self): for key in self.keys(): yield key - @functools.lru_cache() def _is_defective_feature(self, feat): """Whether the stored feature is defective""" - defective = False - if feat in feat_defect.DEFECTIVE_FEATURES and feat in self._features: - # feature exists in the HDF5 file - # workaround machinery for sorting out defective features - defective = feat_defect.DEFECTIVE_FEATURES[feat](self.h5file) - return defective + if feat not in self._defective_features: + defective = False + if (feat in feat_defect.DEFECTIVE_FEATURES + and feat in self._features): + # feature exists in the HDF5 file + # workaround machinery for sorting out defective features + defective = feat_defect.DEFECTIVE_FEATURES[feat](self.h5file) + self._defective_features[feat] = defective + return self._defective_features[feat] def keys(self): """Returns list of valid features diff --git a/dclab/rtdc_dataset/fmt_hdf5/logs.py b/dclab/rtdc_dataset/fmt_hdf5/logs.py index a53b9d1d..3e2d2d84 100644 --- a/dclab/rtdc_dataset/fmt_hdf5/logs.py +++ b/dclab/rtdc_dataset/fmt_hdf5/logs.py @@ -1,9 +1,7 @@ -import functools - - class H5Logs: def __init__(self, h5): self.h5file = h5 + self._cache_keys = None def __getitem__(self, key): if key in self.keys(): @@ -24,11 +22,12 @@ def __iter__(self): def __len__(self): return len(self.keys()) - @functools.lru_cache() def keys(self): - names = [] - if "logs" in self.h5file: - for key in self.h5file["logs"]: - if self.h5file["logs"][key].size: - names.append(key) - return names + if self._cache_keys is None: + names = [] + if "logs" in self.h5file: + for key in self.h5file["logs"]: + if self.h5file["logs"][key].size: + names.append(key) + self._cache_keys = names + return self._cache_keys diff --git a/dclab/rtdc_dataset/fmt_hdf5/tables.py b/dclab/rtdc_dataset/fmt_hdf5/tables.py index cd465175..27aa1215 100644 --- 
a/dclab/rtdc_dataset/fmt_hdf5/tables.py +++ b/dclab/rtdc_dataset/fmt_hdf5/tables.py @@ -1,9 +1,7 @@ -import functools - - class H5Tables: def __init__(self, h5): self.h5file = h5 + self._cache_keys = None def __getitem__(self, key): if key in self.keys(): @@ -21,11 +19,12 @@ def __iter__(self): def __len__(self): return len(self.keys()) - @functools.lru_cache() def keys(self): - names = [] - if "tables" in self.h5file: - for key in self.h5file["tables"]: - if self.h5file["tables"][key].size: - names.append(key) - return names + if self._cache_keys is None: + names = [] + if "tables" in self.h5file: + for key in self.h5file["tables"]: + if self.h5file["tables"][key].size: + names.append(key) + self._cache_keys = names + return self._cache_keys diff --git a/dclab/rtdc_dataset/fmt_hierarchy.py b/dclab/rtdc_dataset/fmt_hierarchy.py index d209e009..259d4c13 100644 --- a/dclab/rtdc_dataset/fmt_hierarchy.py +++ b/dclab/rtdc_dataset/fmt_hierarchy.py @@ -1,6 +1,5 @@ """RT-DC hierarchy format""" import collections -import functools import numpy as np @@ -310,6 +309,8 @@ def __init__(self, hparent, apply_filter=True, *args, **kwargs): # This will also populate all event attributes self.apply_filter() + self._length = None + def __contains__(self, key): return self.hparent.__contains__(key) @@ -336,9 +337,10 @@ def __getitem__(self, feat): + "root parent of this hierarchy child).") return data - @functools.lru_cache() def __len__(self): - return np.sum(self.hparent.filter.all) + if self._length is None: + self._length = np.sum(self.hparent.filter.all) + return self._length def _assert_filter(self): """Make sure filters exists @@ -426,7 +428,7 @@ def apply_filter(self, *args, **kwargs): self.hparent.apply_filter(*args, **kwargs) # Clear anything that has been cached until now - self.__len__.cache_clear() + self._length = None # update event index event_count = len(self) diff --git a/dclab/rtdc_dataset/fmt_tdms/event_contour.py b/dclab/rtdc_dataset/fmt_tdms/event_contour.py 
index 0906c52c..c6b900b8 100644 --- a/dclab/rtdc_dataset/fmt_tdms/event_contour.py +++ b/dclab/rtdc_dataset/fmt_tdms/event_contour.py @@ -1,5 +1,4 @@ """Class for efficiently handling contour data""" -import functools import numbers import sys import warnings @@ -58,6 +57,7 @@ def __init__(self, rtdc_dataset): self.pxfeat[key] = rtdc_dataset[key] / px_size self.event_offset = 0 + self._length = None def __getitem__(self, idx): if not isinstance(idx, numbers.Integral): @@ -127,14 +127,15 @@ def __getitem__(self, idx): ) return cdata - @functools.lru_cache(maxsize=1) def __len__(self): - length = len(self._contour_data) - if length: - if not self._initialized: - self.determine_offset() - length += self.event_offset - return length + if self._length is None: + length = len(self._contour_data) + if length: + if not self._initialized: + self.determine_offset() + length += self.event_offset + self._length = length + return self._length @property def shape(self): @@ -209,6 +210,7 @@ def __init__(self, fname): """ self._initialized = False self.filename = fname + self._length = None def __getitem__(self, idx): cont = self.data[idx] @@ -225,9 +227,10 @@ def __getitem__(self, idx): data = np.fromstring(cont, sep=",", dtype=np.uint16).reshape(-1, 2) return data - @functools.lru_cache(maxsize=1) def __len__(self): - return len(self.data) + if self._length is None: + self._length = len(self.data) + return self._length def _index_file(self): """Open and index the contour file diff --git a/dclab/rtdc_dataset/fmt_tdms/event_image.py b/dclab/rtdc_dataset/fmt_tdms/event_image.py index 22f3a511..b15fcde7 100644 --- a/dclab/rtdc_dataset/fmt_tdms/event_image.py +++ b/dclab/rtdc_dataset/fmt_tdms/event_image.py @@ -1,7 +1,6 @@ """ Class for efficiently handling image/video data """ -import functools import numbers import pathlib import sys @@ -34,6 +33,7 @@ def __init__(self, rtdc_dataset): conf = rtdc_dataset.config self.event_offset = int(conf["fmt_tdms"]["video frame offset"]) 
self.video_file = fname + self._shape = None def __getitem__(self, idx): if not isinstance(idx, numbers.Integral): @@ -79,10 +79,11 @@ def dummy(self): return cdata @property -    @functools.lru_cache() def shape(self): -        f0 = self._image_data[0].shape -        return len(self), f0[0], f0[1] +        if self._shape is None: +            f0 = self._image_data[0].shape +            self._shape = len(self), f0[0], f0[1] +        return self._shape @staticmethod def find_video_file(rtdc_dataset): diff --git a/dclab/rtdc_dataset/fmt_tdms/event_mask.py b/dclab/rtdc_dataset/fmt_tdms/event_mask.py index c1096f11..fe4ef22e 100644 --- a/dclab/rtdc_dataset/fmt_tdms/event_mask.py +++ b/dclab/rtdc_dataset/fmt_tdms/event_mask.py @@ -1,5 +1,4 @@ """Class for on-the-fly conversion of contours to masks""" -import functools import numbers import numpy as np @@ -13,6 +12,8 @@ def __init__(self, rtdc_dataset): self.image = rtdc_dataset["image"] self.identifier = self.contour.identifier self.config = rtdc_dataset.config + self._shape = None + self._img_shape_cache = None def __getitem__(self, idx): if not isinstance(idx, numbers.Integral): @@ -35,24 +36,26 @@ def __len__(self): return lc @property -    @functools.lru_cache() def _img_shape(self): -        """Shape of one event image""" -        cfgim = self.config["imaging"] -        if self.image: -            # get shape from image column -            event_image_shape = self.image.shape[1:] -        elif "roi size x" in cfgim and "roi size y" in cfgim: -            # get shape from config (this is less reliable than getting -            # the shape from the image; there were measurements with -            # wrong config keys) -            event_image_shape = (cfgim["roi size y"], cfgim["roi size x"]) -        else: -            # no shape available -            event_image_shape = (0, 0) -        return event_image_shape +        """Shape of one event image""" +        if self._img_shape_cache is None: +            cfgim = self.config["imaging"] +            if self.image: +                # get shape from image column +                event_image_shape = self.image.shape[1:] +            elif "roi size x" in cfgim and "roi size y" in cfgim: +                # get shape from config (this is less 
reliable than getting + # the shape from the image; there were measurements with + # wrong config keys) + event_image_shape = (cfgim["roi size y"], cfgim["roi size x"]) + else: + # no shape available + event_image_shape = (0, 0) + self._img_shape_cache = event_image_shape + return self._img_shape_cache @property - @functools.lru_cache() def shape(self): - return len(self), self._img_shape[0], self._img_shape[1] + if self._shape is None: + self._shape = len(self), self._img_shape[0], self._img_shape[1] + return self._shape