Issue 214 lru cache mem leak (#240)
* fix: remove lru_cache for instance methods
maxschloegel authored Oct 25, 2023
1 parent 8eb38a0 commit f160a49
Showing 12 changed files with 104 additions and 96 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
@@ -1,4 +1,5 @@
0.54.3
- fix: replace lru_cache on instance methods to fix memory leak (#214)
- fix: implement `__contains__` for DCOR logs and tables
- enh: add requests timeout for DCOR data
- enh: more caching of event size and shape for HDF5 format
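The reasoning behind the fix, as a minimal sketch (class and attribute names hypothetical, not dclab API): `functools.lru_cache` applied to an instance method stores its cache on the class-level function object, and every cache key contains `self`, so each instance that ever called the method stays referenced, together with any open files or event arrays it holds. Moving the cache onto the instance ties the cached value's lifetime to the instance itself.

import functools
import gc
import weakref


class Leaky:
    """Hypothetical class using the old pattern."""

    @functools.lru_cache()            # cache lives on the class, keyed on `self`
    def event_count(self):
        return 42


class Fixed:
    """Hypothetical class using the new pattern."""

    def __init__(self):
        self._event_count = None      # per-instance cache attribute

    def event_count(self):
        if self._event_count is None:
            self._event_count = 42    # computed once, stored on the instance
        return self._event_count


leaky = Leaky()
leaky.event_count()
ref = weakref.ref(leaky)
del leaky
gc.collect()
print(ref() is None)   # False: the class-level lru_cache still references the instance

fixed = Fixed()
fixed.event_count()
ref2 = weakref.ref(fixed)
del fixed
gc.collect()
print(ref2() is None)  # True: nothing outside the instance kept it alive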
13 changes: 10 additions & 3 deletions dclab/rtdc_dataset/core.py
@@ -38,6 +38,8 @@ def __init__(self, identifier=None, enable_basins=True):
#: Dataset format (derived from class name)
self.format = self.__class__.__name__.split("_")[-1].lower()

# Cache attribute used for __len__()-function
self._length = None
self._polygon_filter_ids = []
# Events have the feature name as keys and contain nD ndarrays.
self._events = {}
@@ -135,15 +137,20 @@ def __iter__(self):
yield col

def __len__(self):
if self._length is None:
self._length = self._get_length()
return self._length

def _get_length(self):
# Try to get length from metadata.
length = self.config["experiment"].get("event count")
if length:
if length is not None:
return length
# Try to get the length from the feature sizes
keys = list(self._events.keys())
keys = list(self._events.keys()) or self.features_basin
keys.sort()
for kk in keys:
length = len(self._events[kk])
length = len(self[kk])
if length:
return length
else:
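A side note on the `_get_length` hunk above (an illustration, not part of the commit): the metadata check changes from truthiness to `is not None`, which separates "event count recorded as 0" from "event count missing" before falling back to the feature sizes.

# Hypothetical metadata values illustrating the changed check.
for length in (0, None):
    if length:              # old check: 0 and None both fall through to the fallback
        print("old check uses metadata:", length)
    if length is not None:  # new check: only a missing value falls through
        print("new check uses metadata:", length)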
9 changes: 4 additions & 5 deletions dclab/rtdc_dataset/fmt_dcor/logs.py
@@ -1,9 +1,7 @@
import functools


class DCORLogs:
def __init__(self, api):
self.api = api
self._logs_cache = None

def __contains__(self, key):
return key in self.keys()
@@ -18,6 +16,7 @@ def keys(self):
return self._logs.keys()

@property
@functools.lru_cache()
def _logs(self):
return self.api.get(query="logs")
if self._logs_cache is None:
self._logs_cache = self.api.get(query="logs")
return self._logs_cache
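A quick usage sketch for the patched `DCORLogs` (the `FakeAPI` stand-in and its log content are hypothetical; the import path assumes the file layout shown above): the DCOR server is queried at most once per instance, and the cached response is garbage-collected together with the `DCORLogs` object.

from dclab.rtdc_dataset.fmt_dcor.logs import DCORLogs


class FakeAPI:
    """Hypothetical stand-in for the DCOR API client."""

    def __init__(self):
        self.calls = 0

    def get(self, query):
        self.calls += 1
        return {"upload_log": ["line 1", "line 2"]}


api = FakeAPI()
logs = DCORLogs(api)
logs.keys()        # first access populates `_logs_cache`
logs.keys()        # second access is served from the instance attribute
print(api.calls)   # 1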
30 changes: 15 additions & 15 deletions dclab/rtdc_dataset/fmt_dcor/tables.py
@@ -1,11 +1,10 @@
import functools

import numpy as np


class DCORTables:
def __init__(self, api):
self.api = api
self._tables_cache = None

def __contains__(self, key):
return key in self.keys()
@@ -20,17 +19,18 @@ def keys(self):
return self._tables.keys()

@property
@functools.lru_cache()
def _tables(self):
table_data = self.api.get(query="tables")
# assemble the tables
tables = {}
for key in table_data:
columns, data = table_data[key]
ds_dt = np.dtype({'names': columns,
'formats': [np.float64] * len(columns)})
tab_data = np.asarray(data)
rec_arr = np.rec.array(tab_data, dtype=ds_dt)
tables[key] = rec_arr

return tables
if self._tables_cache is None:
table_data = self.api.get(query="tables")
# assemble the tables
tables = {}
for key in table_data:
columns, data = table_data[key]
ds_dt = np.dtype({'names': columns,
'formats': [np.float64] * len(columns)})
tab_data = np.asarray(data)
rec_arr = np.rec.array(tab_data, dtype=ds_dt)
tables[key] = rec_arr

self._tables_cache = tables
return self._tables_cache
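For orientation, a small sketch of the record-array assembly performed in `_tables` (column names and values hypothetical): each table arrives as a list of column names plus row data and is turned into a numpy record array whose columns are addressable by name.

import numpy as np

columns = ["time", "temperature"]
data = [[0.0, 23.5],
        [1.0, 23.7]]

ds_dt = np.dtype({"names": columns,
                  "formats": [np.float64] * len(columns)})
rec_arr = np.rec.array(np.asarray(data), dtype=ds_dt)

print(rec_arr.dtype.names)      # ('time', 'temperature')
print(rec_arr["temperature"])   # the column is addressable by name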
9 changes: 0 additions & 9 deletions dclab/rtdc_dataset/fmt_hdf5/base.py
@@ -1,7 +1,6 @@
"""RT-DC hdf5 format"""
from __future__ import annotations

import functools
import json
import pathlib
from typing import Any, BinaryIO, Dict
@@ -118,14 +117,6 @@ def close(self):
if b._ds is not None:
b._ds.close()

@functools.lru_cache()
def __len__(self):
ec = self.h5file.get("experiment:event count")
if ec is not None:
return ec
else:
return super(RTDC_HDF5, self).__len__()

@property
def _h5(self):
warnings.warn("Access to the underlying HDF5 file is now public. "
19 changes: 11 additions & 8 deletions dclab/rtdc_dataset/fmt_hdf5/events.py
@@ -1,7 +1,6 @@
"""RT-DC hdf5 format"""
from __future__ import annotations

import functools
import numbers
import numpy as np

@@ -17,6 +16,7 @@ def __init__(self, h5group, length=None):
self.h5group = h5group
# for hashing in util.obj2bytes
self.identifier = (h5group.file.filename, h5group["0"].name)
self._length = None

def __getitem__(self, key):
if not isinstance(key, numbers.Integral):
@@ -56,6 +56,7 @@ def __init__(self, h5):
# datasets, we cache the wrapping classes in the `self._cached_events`
# dictionary.
self._cached_events = {}
self._defective_features = {}
self._features_list = None

@property
@@ -97,15 +98,17 @@ def __iter__(self):
for key in self.keys():
yield key

@functools.lru_cache()
def _is_defective_feature(self, feat):
"""Whether the stored feature is defective"""
defective = False
if feat in feat_defect.DEFECTIVE_FEATURES and feat in self._features:
# feature exists in the HDF5 file
# workaround machinery for sorting out defective features
defective = feat_defect.DEFECTIVE_FEATURES[feat](self.h5file)
return defective
if feat not in self._defective_features:
defective = False
if (feat in feat_defect.DEFECTIVE_FEATURES
and feat in self._features):
# feature exists in the HDF5 file
# workaround machinery for sorting out defective features
defective = feat_defect.DEFECTIVE_FEATURES[feat](self.h5file)
self._defective_features[feat] = defective
return self._defective_features[feat]

def keys(self):
"""Returns list of valid features
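`_is_defective_feature` shows the variant of the pattern for methods that take an argument: instead of `lru_cache`, results are memoized in a plain dict on the instance, keyed on the argument. A generic sketch (names hypothetical):

class FeatureChecker:
    """Hypothetical per-argument memoization on the instance."""

    def __init__(self):
        self._defective = {}   # feature name -> cached result

    def is_defective(self, feat):
        if feat not in self._defective:
            # compute once per feature, then reuse
            self._defective[feat] = self._expensive_check(feat)
        return self._defective[feat]

    def _expensive_check(self, feat):
        print("checking", feat)
        return feat.startswith("bad_")


checker = FeatureChecker()
checker.is_defective("deform")   # prints "checking deform"
checker.is_defective("deform")   # answered from the dict, no recomputation

Unlike `lru_cache`, the dict is unbounded, but it is released as soon as the owning instance is.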
19 changes: 9 additions & 10 deletions dclab/rtdc_dataset/fmt_hdf5/logs.py
@@ -1,9 +1,7 @@
import functools


class H5Logs:
def __init__(self, h5):
self.h5file = h5
self._cache_keys = None

def __getitem__(self, key):
if key in self.keys():
@@ -24,11 +22,12 @@ def __iter__(self):
def __len__(self):
return len(self.keys())

@functools.lru_cache()
def keys(self):
names = []
if "logs" in self.h5file:
for key in self.h5file["logs"]:
if self.h5file["logs"][key].size:
names.append(key)
return names
if self._cache_keys is None:
names = []
if "logs" in self.h5file:
for key in self.h5file["logs"]:
if self.h5file["logs"][key].size:
names.append(key)
self._cache_keys = names
return self._cache_keys
19 changes: 9 additions & 10 deletions dclab/rtdc_dataset/fmt_hdf5/tables.py
@@ -1,9 +1,7 @@
import functools


class H5Tables:
def __init__(self, h5):
self.h5file = h5
self._cache_keys = None

def __getitem__(self, key):
if key in self.keys():
@@ -21,11 +19,12 @@ def __iter__(self):
def __len__(self):
return len(self.keys())

@functools.lru_cache()
def keys(self):
names = []
if "tables" in self.h5file:
for key in self.h5file["tables"]:
if self.h5file["tables"][key].size:
names.append(key)
return names
if self._cache_keys is None:
names = []
if "tables" in self.h5file:
for key in self.h5file["tables"]:
if self.h5file["tables"][key].size:
names.append(key)
self._cache_keys = names
return self._cache_keys
10 changes: 6 additions & 4 deletions dclab/rtdc_dataset/fmt_hierarchy.py
@@ -1,6 +1,5 @@
"""RT-DC hierarchy format"""
import collections
import functools

import numpy as np

@@ -310,6 +309,8 @@ def __init__(self, hparent, apply_filter=True, *args, **kwargs):
# This will also populate all event attributes
self.apply_filter()

self._length = None

def __contains__(self, key):
return self.hparent.__contains__(key)

@@ -336,9 +337,10 @@ def __getitem__(self, feat):
+ "root parent of this hierarchy child).")
return data

@functools.lru_cache()
def __len__(self):
return np.sum(self.hparent.filter.all)
if self._length is None:
self._length = np.sum(self.hparent.filter.all)
return self._length

def _assert_filter(self):
"""Make sure filters exists
@@ -426,7 +428,7 @@ def apply_filter(self, *args, **kwargs):
self.hparent.apply_filter(*args, **kwargs)

# Clear anything that has been cached until now
self.__len__.cache_clear()
self._length = None

# update event index
event_count = len(self)
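Because the cached length now lives on the instance, invalidation in `apply_filter` is a plain reset of the sentinel instead of `self.__len__.cache_clear()`. A minimal sketch of that invalidate-and-recompute cycle (class and attribute names hypothetical):

import numpy as np


class Child:
    """Hypothetical hierarchy child caching its filtered length."""

    def __init__(self, parent_filter):
        self.parent_filter = parent_filter   # boolean mask from the parent
        self._length = None

    def __len__(self):
        if self._length is None:
            self._length = int(np.sum(self.parent_filter))
        return self._length

    def apply_filter(self, new_filter):
        self.parent_filter = new_filter
        self._length = None                  # drop the cached value; next len() recomputes


child = Child(np.array([True, False, True]))
print(len(child))    # 2
child.apply_filter(np.array([True, True, True]))
print(len(child))    # 3, recomputed after invalidation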
23 changes: 13 additions & 10 deletions dclab/rtdc_dataset/fmt_tdms/event_contour.py
@@ -1,5 +1,4 @@
"""Class for efficiently handling contour data"""
import functools
import numbers
import sys
import warnings
@@ -58,6 +57,7 @@ def __init__(self, rtdc_dataset):
self.pxfeat[key] = rtdc_dataset[key] / px_size

self.event_offset = 0
self._length = None

def __getitem__(self, idx):
if not isinstance(idx, numbers.Integral):
@@ -127,14 +127,15 @@ def __getitem__(self, idx):
)
return cdata

@functools.lru_cache(maxsize=1)
def __len__(self):
length = len(self._contour_data)
if length:
if not self._initialized:
self.determine_offset()
length += self.event_offset
return length
if self._length is None:
length = len(self._contour_data)
if length:
if not self._initialized:
self.determine_offset()
length += self.event_offset
self._length = length
return self._length

@property
def shape(self):
@@ -209,6 +210,7 @@ def __init__(self, fname):
"""
self._initialized = False
self.filename = fname
self._length = None

def __getitem__(self, idx):
cont = self.data[idx]
@@ -225,9 +227,10 @@ def __getitem__(self, idx):
data = np.fromstring(cont, sep=",", dtype=np.uint16).reshape(-1, 2)
return data

@functools.lru_cache(maxsize=1)
def __len__(self):
return len(self.data)
if self._length is None:
self._length = len(self.data)
return self._length

def _index_file(self):
"""Open and index the contour file
9 changes: 5 additions & 4 deletions dclab/rtdc_dataset/fmt_tdms/event_image.py
@@ -1,7 +1,6 @@
"""
Class for efficiently handling image/video data
"""
import functools
import numbers
import pathlib
import sys
@@ -34,6 +33,7 @@ def __init__(self, rtdc_dataset):
conf = rtdc_dataset.config
self.event_offset = int(conf["fmt_tdms"]["video frame offset"])
self.video_file = fname
self._shape = None

def __getitem__(self, idx):
if not isinstance(idx, numbers.Integral):
@@ -79,10 +79,11 @@ def dummy(self):
return cdata

@property
@functools.lru_cache()
def shape(self):
f0 = self._image_data[0].shape
return len(self), f0[0], f0[1]
if self._shape is None:
f0 = self._image_data[0].shape
self._shape = len(self), f0[0], f0[1]
return self._shape

@staticmethod
def find_video_file(rtdc_dataset):