Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 214 lru cache mem leak #240

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
0.54.3
+ - fix: replace lru_cache on instance methods to fix memory leak (#214)
- fix: implement `__contains__` for DCOR logs and tables
- enh: add requests timeout for DCOR data
- enh: more caching of event size and shape for HDF5 format
Expand Down
13 changes: 10 additions & 3 deletions dclab/rtdc_dataset/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def __init__(self, identifier=None, enable_basins=True):
#: Dataset format (derived from class name)
self.format = self.__class__.__name__.split("_")[-1].lower()

+        # Cache attribute used for __len__()-function
+        self._length = None
self._polygon_filter_ids = []
# Events have the feature name as keys and contain nD ndarrays.
self._events = {}
Expand Down Expand Up @@ -135,15 +137,20 @@ def __iter__(self):
yield col

     def __len__(self):
+        if self._length is None:
+            self._length = self._get_length()
+        return self._length
+
+    def _get_length(self):
         # Try to get length from metadata.
         length = self.config["experiment"].get("event count")
-        if length:
+        if length is not None:
             return length
         # Try to get the length from the feature sizes
-        keys = list(self._events.keys())
+        keys = list(self._events.keys()) or self.features_basin
         keys.sort()
         for kk in keys:
-            length = len(self._events[kk])
+            length = len(self[kk])
if length:
return length
else:
Expand Down
9 changes: 4 additions & 5 deletions dclab/rtdc_dataset/fmt_dcor/logs.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
-import functools
-
-
class DCORLogs:
def __init__(self, api):
self.api = api
+        self._logs_cache = None

def __contains__(self, key):
return key in self.keys()
Expand All @@ -18,6 +16,7 @@ def keys(self):
return self._logs.keys()

     @property
-    @functools.lru_cache()
     def _logs(self):
-        return self.api.get(query="logs")
+        if self._logs_cache is None:
+            self._logs_cache = self.api.get(query="logs")
+        return self._logs_cache
30 changes: 15 additions & 15 deletions dclab/rtdc_dataset/fmt_dcor/tables.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
-import functools
-
import numpy as np


class DCORTables:
def __init__(self, api):
self.api = api
+        self._tables_cache = None

def __contains__(self, key):
return key in self.keys()
Expand All @@ -20,17 +19,18 @@ def keys(self):
return self._tables.keys()

     @property
-    @functools.lru_cache()
     def _tables(self):
-        table_data = self.api.get(query="tables")
-        # assemble the tables
-        tables = {}
-        for key in table_data:
-            columns, data = table_data[key]
-            ds_dt = np.dtype({'names': columns,
-                              'formats': [np.float64] * len(columns)})
-            tab_data = np.asarray(data)
-            rec_arr = np.rec.array(tab_data, dtype=ds_dt)
-            tables[key] = rec_arr
-
-        return tables
+        if self._tables_cache is None:
+            table_data = self.api.get(query="tables")
+            # assemble the tables
+            tables = {}
+            for key in table_data:
+                columns, data = table_data[key]
+                ds_dt = np.dtype({'names': columns,
+                                  'formats': [np.float64] * len(columns)})
+                tab_data = np.asarray(data)
+                rec_arr = np.rec.array(tab_data, dtype=ds_dt)
+                tables[key] = rec_arr
+
+            self._tables_cache = tables
+        return self._tables_cache
9 changes: 0 additions & 9 deletions dclab/rtdc_dataset/fmt_hdf5/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""RT-DC hdf5 format"""
from __future__ import annotations

-import functools
import json
import pathlib
from typing import Any, BinaryIO, Dict
Expand Down Expand Up @@ -118,14 +117,6 @@ def close(self):
if b._ds is not None:
b._ds.close()

-    @functools.lru_cache()
-    def __len__(self):
-        ec = self.h5file.get("experiment:event count")
-        if ec is not None:
-            return ec
-        else:
-            return super(RTDC_HDF5, self).__len__()
-
@property
def _h5(self):
warnings.warn("Access to the underlying HDF5 file is now public. "
Expand Down
19 changes: 11 additions & 8 deletions dclab/rtdc_dataset/fmt_hdf5/events.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""RT-DC hdf5 format"""
from __future__ import annotations

-import functools
import numbers
import numpy as np

Expand All @@ -17,6 +16,7 @@ def __init__(self, h5group, length=None):
self.h5group = h5group
# for hashing in util.obj2bytes
self.identifier = (h5group.file.filename, h5group["0"].name)
+        self._length = None

def __getitem__(self, key):
if not isinstance(key, numbers.Integral):
Expand Down Expand Up @@ -56,6 +56,7 @@ def __init__(self, h5):
# datasets, we cache the wrapping classes in the `self._cached_events`
# dictionary.
self._cached_events = {}
+        self._defective_features = {}
self._features_list = None

@property
Expand Down Expand Up @@ -97,15 +98,17 @@ def __iter__(self):
for key in self.keys():
yield key

-    @functools.lru_cache()
     def _is_defective_feature(self, feat):
         """Whether the stored feature is defective"""
-        defective = False
-        if feat in feat_defect.DEFECTIVE_FEATURES and feat in self._features:
-            # feature exists in the HDF5 file
-            # workaround machinery for sorting out defective features
-            defective = feat_defect.DEFECTIVE_FEATURES[feat](self.h5file)
-        return defective
+        if feat not in self._defective_features:
+            defective = False
+            if (feat in feat_defect.DEFECTIVE_FEATURES
+                    and feat in self._features):
+                # feature exists in the HDF5 file
+                # workaround machinery for sorting out defective features
+                defective = feat_defect.DEFECTIVE_FEATURES[feat](self.h5file)
+            self._defective_features[feat] = defective
+        return self._defective_features[feat]

def keys(self):
"""Returns list of valid features
Expand Down
19 changes: 9 additions & 10 deletions dclab/rtdc_dataset/fmt_hdf5/logs.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
-import functools
-
-
class H5Logs:
def __init__(self, h5):
self.h5file = h5
+        self._cache_keys = None

def __getitem__(self, key):
if key in self.keys():
Expand All @@ -24,11 +22,12 @@ def __iter__(self):
def __len__(self):
return len(self.keys())

-    @functools.lru_cache()
     def keys(self):
-        names = []
-        if "logs" in self.h5file:
-            for key in self.h5file["logs"]:
-                if self.h5file["logs"][key].size:
-                    names.append(key)
-        return names
+        if self._cache_keys is None:
+            names = []
+            if "logs" in self.h5file:
+                for key in self.h5file["logs"]:
+                    if self.h5file["logs"][key].size:
+                        names.append(key)
+            self._cache_keys = names
+        return self._cache_keys
19 changes: 9 additions & 10 deletions dclab/rtdc_dataset/fmt_hdf5/tables.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
-import functools
-
-
class H5Tables:
def __init__(self, h5):
self.h5file = h5
+        self._cache_keys = None

def __getitem__(self, key):
if key in self.keys():
Expand All @@ -21,11 +19,12 @@ def __iter__(self):
def __len__(self):
return len(self.keys())

-    @functools.lru_cache()
     def keys(self):
-        names = []
-        if "tables" in self.h5file:
-            for key in self.h5file["tables"]:
-                if self.h5file["tables"][key].size:
-                    names.append(key)
-        return names
+        if self._cache_keys is None:
+            names = []
+            if "tables" in self.h5file:
+                for key in self.h5file["tables"]:
+                    if self.h5file["tables"][key].size:
+                        names.append(key)
+            self._cache_keys = names
+        return self._cache_keys
10 changes: 6 additions & 4 deletions dclab/rtdc_dataset/fmt_hierarchy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""RT-DC hierarchy format"""
import collections
-import functools

import numpy as np

Expand Down Expand Up @@ -310,6 +309,8 @@ def __init__(self, hparent, apply_filter=True, *args, **kwargs):
# This will also populate all event attributes
self.apply_filter()

+        self._length = None

def __contains__(self, key):
return self.hparent.__contains__(key)

Expand All @@ -336,9 +337,10 @@ def __getitem__(self, feat):
+ "root parent of this hierarchy child).")
return data

-    @functools.lru_cache()
     def __len__(self):
-        return np.sum(self.hparent.filter.all)
+        if self._length is None:
+            self._length = np.sum(self.hparent.filter.all)
+        return self._length

def _assert_filter(self):
"""Make sure filters exists
Expand Down Expand Up @@ -426,7 +428,7 @@ def apply_filter(self, *args, **kwargs):
self.hparent.apply_filter(*args, **kwargs)

# Clear anything that has been cached until now
-        self.__len__.cache_clear()
+        self._length = None

# update event index
event_count = len(self)
Expand Down
23 changes: 13 additions & 10 deletions dclab/rtdc_dataset/fmt_tdms/event_contour.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Class for efficiently handling contour data"""
-import functools
import numbers
import sys
import warnings
Expand Down Expand Up @@ -58,6 +57,7 @@ def __init__(self, rtdc_dataset):
self.pxfeat[key] = rtdc_dataset[key] / px_size

self.event_offset = 0
+        self._length = None

def __getitem__(self, idx):
if not isinstance(idx, numbers.Integral):
Expand Down Expand Up @@ -127,14 +127,15 @@ def __getitem__(self, idx):
)
return cdata

-    @functools.lru_cache(maxsize=1)
     def __len__(self):
-        length = len(self._contour_data)
-        if length:
-            if not self._initialized:
-                self.determine_offset()
-            length += self.event_offset
-        return length
+        if self._length is None:
+            length = len(self._contour_data)
+            if length:
+                if not self._initialized:
+                    self.determine_offset()
+                length += self.event_offset
+            self._length = length
+        return self._length

@property
def shape(self):
Expand Down Expand Up @@ -209,6 +210,7 @@ def __init__(self, fname):
"""
self._initialized = False
self.filename = fname
+        self._length = None

def __getitem__(self, idx):
cont = self.data[idx]
Expand All @@ -225,9 +227,10 @@ def __getitem__(self, idx):
data = np.fromstring(cont, sep=",", dtype=np.uint16).reshape(-1, 2)
return data

-    @functools.lru_cache(maxsize=1)
     def __len__(self):
-        return len(self.data)
+        if self._length is None:
+            self._length = len(self.data)
+        return self._length

def _index_file(self):
"""Open and index the contour file
Expand Down
9 changes: 5 additions & 4 deletions dclab/rtdc_dataset/fmt_tdms/event_image.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""
Class for efficiently handling image/video data
"""
-import functools
import numbers
import pathlib
import sys
Expand Down Expand Up @@ -34,6 +33,7 @@ def __init__(self, rtdc_dataset):
conf = rtdc_dataset.config
self.event_offset = int(conf["fmt_tdms"]["video frame offset"])
self.video_file = fname
+        self._shape = None

def __getitem__(self, idx):
if not isinstance(idx, numbers.Integral):
Expand Down Expand Up @@ -79,10 +79,11 @@ def dummy(self):
return cdata

     @property
-    @functools.lru_cache()
     def shape(self):
-        f0 = self._image_data[0].shape
-        return len(self), f0[0], f0[1]
+        if self._shape is None:
+            f0 = self._image_data[0].shape
+            self._shape = len(self), f0[0], f0[1]
+        return self._shape

@staticmethod
def find_video_file(rtdc_dataset):
Expand Down
Loading