Cache Dask arrays to speed up loading files with multiple variables
bouweandela committed Dec 10, 2024
1 parent 0fdedb4 commit c61b12f
Showing 2 changed files with 48 additions and 5 deletions.
51 changes: 47 additions & 4 deletions lib/iris/_lazy_data.py
@@ -202,6 +202,7 @@ def _optimum_chunksize_internals(
            dim = working[0]
            working = working[1:]
            result.append(dim)
+    result = tuple(result)

    return result

@@ -227,6 +228,33 @@ def _optimum_chunksize(
    )


+class LRUCache:
+    def __init__(self, maxsize: int) -> None:
+        self._cache: dict = {}
+        self.maxsize = maxsize
+
+    def __getitem__(self, key):
+        value = self._cache.pop(key)
+        self._cache[key] = value
+        return value
+
+    def __setitem__(self, key, value):
+        self._cache[key] = value
+        if len(self._cache) > self.maxsize:
+            self._cache.pop(next(iter(self._cache)))
+
+    def __contains__(self, key):
+        return key in self._cache
+
+    def __repr__(self):
+        return (
+            f"<{self.__class__.__name__} maxsize={self.maxsize} cache={self._cache!r} >"
+        )
+
+
+CACHE = LRUCache(100)
+
+
def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None):
    """Convert the input array `data` to a :class:`dask.array.Array`.
@@ -264,6 +292,8 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None):
    but reduced by a factor if that exceeds the dask default chunksize.
    """
+    from iris.fileformats.netcdf._thread_safe_nc import NetCDFDataProxy
+
    if isinstance(data, ma.core.MaskedConstant):
        data = ma.masked_array(data.data, mask=data.mask)

@@ -277,7 +307,7 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None):
    if chunks is None:
        # No existing chunks : Make a chunk the shape of the entire input array
        # (but we will subdivide it if too big).
-        chunks = list(data.shape)
+        chunks = tuple(data.shape)

    # Adjust chunk size for better dask performance,
    # NOTE: but only if no shape dimension is zero, so that we can handle the
@@ -291,9 +321,22 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None):
        dims_fixed=dims_fixed,
    )

-    if not is_lazy_data(data):
-        data = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta)
-    return data
+    # Define a cache key for caching arrays created from NetCDFDataProxy objects.
+    # Creating new Dask arrays is relatively slow, therefore caching is beneficial
+    # if many cubes in the same file share coordinate arrays.
+    if isinstance(data, NetCDFDataProxy):
+        key = (repr(data), chunks, asarray, meta.dtype, type(meta))
+    else:
+        key = None
+
+    if is_lazy_data(data):
+        result = data
+    elif key is None:
+        # Not backed by a NetCDFDataProxy: build the array without caching.
+        result = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta)
+    else:
+        if key not in CACHE:
+            CACHE[key] = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta)
+        result = CACHE[key].copy()
+
+    return result


def _co_realise_lazy_arrays(arrays):
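For reviewers who want to check the LRU semantics in isolation, here is a standalone sanity check (my own sketch, not part of the commit; the class body is copied from the diff above). The key point: a `__getitem__` hit pops and re-inserts the key, so the first key in the underlying dict is always the least recently used one, and that is what `__setitem__` evicts on overflow.

class LRUCache:
    def __init__(self, maxsize: int) -> None:
        self._cache: dict = {}
        self.maxsize = maxsize

    def __getitem__(self, key):
        # Pop and re-insert: a hit moves the key to the (newest) end.
        value = self._cache.pop(key)
        self._cache[key] = value
        return value

    def __setitem__(self, key, value):
        self._cache[key] = value
        if len(self._cache) > self.maxsize:
            # Dicts preserve insertion order, so the first key is the
            # least recently used entry.
            self._cache.pop(next(iter(self._cache)))

    def __contains__(self, key):
        return key in self._cache


cache = LRUCache(maxsize=2)
cache["a"] = 1
cache["b"] = 2
assert cache["a"] == 1  # a hit on "a" makes "b" the oldest entry
cache["c"] = 3          # exceeds maxsize, evicting "b"
assert "b" not in cache and "a" in cache and "c" in cache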
2 changes: 1 addition & 1 deletion lib/iris/tests/unit/lazy_data/test_as_lazy_data.py
Original file line number Diff line number Diff line change
@@ -161,7 +161,7 @@ def test_default_chunks_limiting(self, mocker):
        as_lazy_data(data)
        assert limitcall_patch.call_args_list == [
            mock.call(
-                list(test_shape),
+                tuple(test_shape),
                shape=test_shape,
                dtype=np.dtype("f4"),
                dims_fixed=None,
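The companion `list` → `tuple` changes are not cosmetic: `chunks` is now embedded in the cache key, and dict keys must be hashable, which lists are not. A two-line check (illustrative, not from the commit):

chunks = [144, 192]
try:
    hash(("<NetCDFDataProxy repr>", chunks))
except TypeError as err:
    print(err)  # unhashable type: 'list'

print(hash(("<NetCDFDataProxy repr>", tuple(chunks))))  # tuples hash fine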

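Why cache at all? Building a Dask array in da.from_array constructs a fresh task graph in pure Python for every variable, whereas Array.copy() only duplicates the small collection object wrapping an existing graph. A rough standalone timing sketch of that difference (plain NumPy stands in for a NetCDFDataProxy here; absolute numbers will vary by machine and dask version):

import timeit

import dask.array as da
import numpy as np

data = np.zeros((100, 100), dtype="f4")

# Build a new Dask array (new task graph) each call.
build = timeit.timeit(lambda: da.from_array(data, chunks=(10, 100)), number=1000)

# Reuse one Dask array and take cheap copies of it.
lazy = da.from_array(data, chunks=(10, 100))
copy = timeit.timeit(lambda: lazy.copy(), number=1000)

print(f"da.from_array x1000: {build:.3f}s")
print(f"Array.copy    x1000: {copy:.3f}s")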