Skip to content

Commit

Permalink
Ensure tokenization of memmap doesn't materialize array in memory (da…
Browse files Browse the repository at this point in the history
  • Loading branch information
fjetter authored Jun 4, 2024
1 parent 05a04aa commit 8813859
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
25 changes: 25 additions & 0 deletions dask/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import decimal
import hashlib
import inspect
import os
import pathlib
import pickle
import types
Expand Down Expand Up @@ -1349,6 +1350,30 @@ def normalize_datatype(dt):
def register_numpy():
import numpy as np

@normalize_token.register(np.memmap)
def normalize_mmap(mm):
    """Build a token for a ``np.memmap`` from its backing file and layout.

    A memmap that exposes a ``mode`` attribute and a non-empty ``filename``
    is described by the tuple ``(filename, file mtime, dtype, shape,
    strides, offset)``; the array contents are not read by this function.
    Any other memmap is handed over to ``normalize_object``.
    """
    file_backed = hasattr(mm, "mode") and getattr(mm, "filename", None)
    if not file_backed:
        return normalize_object(mm)

    parent = mm.base
    if hasattr(parent, "ctypes"):
        # The base is itself an array: use the difference between the two
        # data addresses as the position of this view inside its base.
        own_address = mm.ctypes._as_parameter_.value
        offset = own_address - parent.ctypes._as_parameter_.value
    else:
        # Root memmaps have the mmap object as their base.
        offset = 0

    # ``mm.offset`` is the offset numpy used while opening the file, not the
    # offset to the beginning of the file, so it is added on top when present.
    offset += getattr(mm, "offset", 0)

    modified_at = os.path.getmtime(mm.filename)
    return (
        mm.filename,
        modified_at,
        mm.dtype,
        mm.shape,
        mm.strides,
        offset,
    )

@normalize_token.register(np.ufunc)
def normalize_ufunc(func):
try:
Expand Down
3 changes: 1 addition & 2 deletions dask/tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,7 @@ def test_tokenize_numpy_memmap():
z = check_tokenize(np.load(fn, mmap_mode="r"))

assert check_tokenize(x1) == check_tokenize(x2)
# Memory maps should behave similar to ordinary arrays
assert y == z
assert y != z

with tmpfile(".npy") as fn:
x = np.random.normal(size=(10, 10))
Expand Down

0 comments on commit 8813859

Please sign in to comment.