Start applying ruff rules to notebooks (ruff 0.6+) #2115

Draft · wants to merge 10 commits into base: v3
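Most of the changes below reorder notebook imports into the grouping enforced by ruff's isort rule (I001): standard-library modules first, then third-party packages, then first-party imports, with a blank line between groups. A minimal sketch of the target layout, using the same modules these notebooks import:

    import sys

    import numpy as np

    # sys.path manipulation must run before the first-party import it
    # enables, so it sits between the third-party and first-party groups
    sys.path.insert(0, '..')

    import zarr

    print('zarr', zarr.__version__)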
20 changes: 12 additions & 8 deletions notebooks/advanced_indexing.ipynb
@@ -24,12 +24,17 @@
}
],
"source": [
"import cProfile\n",
"import sys\n",
"\n",
"import numpy as np\n",
"\n",
"sys.path.insert(0, '..')\n",
"\n",
"import zarr\n",
"import numpy as np\n",
"\n",
"np.random.seed(42)\n",
"import cProfile\n",
"\n",
"zarr.__version__"
]
},
@@ -975,7 +980,7 @@
"source": [
"a = np.array([(b'aaa', 1, 4.2),\n",
" (b'bbb', 2, 8.4),\n",
" (b'ccc', 3, 12.6)], \n",
" (b'ccc', 3, 12.6)],\n",
" dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])\n",
"za = zarr.array(a, chunks=2, fill_value=None)\n",
"za[:]"
@@ -1437,9 +1442,9 @@
"metadata": {},
"outputs": [],
"source": [
"import tempfile\n",
"import cProfile\n",
"import pstats\n",
"import tempfile\n",
"\n",
"\n",
"def profile(statement, sort='time', restrictions=(7,)):\n",
" with tempfile.NamedTemporaryFile() as f:\n",
@@ -2637,8 +2642,7 @@
"metadata": {},
"outputs": [],
"source": [
"import h5py\n",
"import tempfile"
"import h5py"
]
},
{
@@ -2733,7 +2737,7 @@
"metadata": {},
"outputs": [],
"source": [
"# # this is pathological, takes minutes \n",
"# # this is pathological, takes minutes\n",
"# %time hc[ix_dense_bool]"
]
},
7 changes: 6 additions & 1 deletion notebooks/blosc_microbench.ipynb
@@ -20,7 +20,9 @@
],
"source": [
"import numpy as np\n",
"\n",
"import zarr\n",
"\n",
"zarr.__version__"
]
},
@@ -101,10 +103,13 @@
}
],
"source": [
"import numpy as np\n",
"import sys\n",
"\n",
"import numpy as np\n",
"\n",
"sys.path.insert(0, '..')\n",
"import zarr\n",
"\n",
"zarr.__version__"
]
},
12 changes: 8 additions & 4 deletions notebooks/dask_2d_subset.ipynb
@@ -24,10 +24,13 @@
}
],
"source": [
"import zarr; print('zarr', zarr.__version__)\n",
"import dask; print('dask', dask.__version__)\n",
"import dask.array as da\n",
"import numpy as np"
"import dask\n",
"import numpy as np\n",
"\n",
"import zarr\n",
"\n",
"print('zarr', zarr.__version__)\n",
"print('dask', dask.__version__)\n"
]
},
{
@@ -367,6 +370,7 @@
"source": [
"# what's taking so long?\n",
"import cProfile\n",
"\n",
"cProfile.run('gd[dim0_condition][:, dim1_indices]', sort='time')"
]
},
61 changes: 31 additions & 30 deletions notebooks/dask_copy.ipynb
@@ -133,25 +133,27 @@
}
],
"source": [
"import multiprocessing\n",
"import sys\n",
"\n",
"import bcolz\n",
"import dask.array as da\n",
"import h5py\n",
"import numpy as np\n",
"from bokeh.io import output_notebook\n",
"from dask.diagnostics import Profiler, ResourceProfiler\n",
"from dask.diagnostics.profile_visualize import visualize\n",
"\n",
"sys.path.insert(0, '..')\n",
"\n",
"import zarr\n",
"\n",
"print('zarr', zarr.__version__)\n",
"from zarr import blosc\n",
"import numpy as np\n",
"import h5py\n",
"import bcolz\n",
"# don't let bcolz use multiple threads internally, we want to \n",
"\n",
"# don't let bcolz use multiple threads internally, we want to\n",
"# see whether dask can make good use of multiple CPUs\n",
"bcolz.set_nthreads(1)\n",
"import multiprocessing\n",
"import dask\n",
"import dask.array as da\n",
"from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler\n",
"from dask.diagnostics.profile_visualize import visualize\n",
"from cachey import nbytes\n",
"import bokeh\n",
"from bokeh.io import output_notebook\n",
"\n",
"output_notebook()"
]
},
@@ -163,9 +165,10 @@
},
"outputs": [],
"source": [
"import tempfile\n",
"import operator\n",
"import tempfile\n",
"from functools import reduce\n",
"\n",
"from zarr.util import human_readable_size\n",
"\n",
"\n",
@@ -188,23 +191,22 @@
"\n",
"def h5d_diagnostics(d):\n",
" \"\"\"Print some diagnostics on an HDF5 dataset.\"\"\"\n",
" \n",
"\n",
" print(d)\n",
" nbytes = reduce(operator.mul, d.shape) * d.dtype.itemsize\n",
" cbytes = d._id.get_storage_size()\n",
" if cbytes > 0:\n",
" ratio = nbytes / cbytes\n",
" else:\n",
" ratio = np.inf\n",
" r = ' compression: %s' % d.compression\n",
" r += '; compression_opts: %s' % d.compression_opts\n",
" r += '; shuffle: %s' % d.shuffle\n",
" r += '\\n nbytes: %s' % human_readable_size(nbytes)\n",
" r += '; nbytes_stored: %s' % human_readable_size(cbytes)\n",
" r += '; ratio: %.1f' % ratio\n",
" r += '; chunks: %s' % str(d.chunks)\n",
" print(r)\n",
" "
" r = f' compression: {d.compression}'\n",
" r += f'; compression_opts: {d.compression_opts}'\n",
" r += f'; shuffle: {d.shuffle}'\n",
" r += f'\\n nbytes: {human_readable_size(nbytes)}'\n",
" r += f'; nbytes_stored: {human_readable_size(cbytes)}'\n",
" r += f'; ratio: {ratio:.1f}'\n",
" r += f'; chunks: {d.chunks}'\n",
" print(r)\n"
]
},
{
Expand All @@ -219,8 +221,7 @@
" dsrc = da.from_array(src, chunks=chunks)\n",
" with Profiler() as prof, ResourceProfiler(dt=dt) as rprof:\n",
" da.store(dsrc, dst, num_workers=num_workers, lock=lock)\n",
" visualize([prof, rprof], min_border_top=60, min_border_bottom=60)\n",
" "
" visualize([prof, rprof], min_border_top=60, min_border_bottom=60)\n"
]
},
{
@@ -567,7 +568,7 @@
}
],
"source": [
"z1 = zarr.array(a1, chunks=chunks, compression='blosc', \n",
"z1 = zarr.array(a1, chunks=chunks, compression='blosc',\n",
" compression_opts=dict(cname='lz4', clevel=1, shuffle=2))\n",
"z1"
]
@@ -934,8 +935,8 @@
}
],
"source": [
"h2 = h5f.create_dataset('h2', shape=h1.shape, chunks=h1.chunks, \n",
" compression=h1.compression, compression_opts=h1.compression_opts, \n",
"h2 = h5f.create_dataset('h2', shape=h1.shape, chunks=h1.chunks,\n",
" compression=h1.compression, compression_opts=h1.compression_opts,\n",
" shuffle=h1.shuffle)\n",
"h5d_diagnostics(h2)"
]
@@ -1218,7 +1219,7 @@
}
],
"source": [
"c2 = bcolz.zeros(a1.shape, chunklen=chunks[0], dtype=a1.dtype, \n",
"c2 = bcolz.zeros(a1.shape, chunklen=chunks[0], dtype=a1.dtype,\n",
" cparams=bcolz.cparams(cname='lz4', clevel=1, shuffle=2))\n",
"c2"
]
25 changes: 10 additions & 15 deletions notebooks/dask_count_alleles.ipynb
@@ -127,24 +127,19 @@
],
"source": [
"import sys\n",
"\n",
"import allel\n",
"import h5py\n",
"from bokeh.io import output_notebook\n",
"from dask.diagnostics import Profiler, ResourceProfiler\n",
"from dask.diagnostics.profile_visualize import visualize\n",
"\n",
"sys.path.insert(0, '..')\n",
"\n",
"import zarr\n",
"\n",
"print('zarr', zarr.__version__)\n",
"from zarr import blosc\n",
"import numpy as np\n",
"import h5py\n",
"import multiprocessing\n",
"import dask\n",
"import dask.array as da\n",
"from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler\n",
"from dask.diagnostics.profile_visualize import visualize\n",
"from cachey import nbytes\n",
"import bokeh\n",
"from bokeh.io import output_notebook\n",
"output_notebook()\n",
"from functools import reduce\n",
"import operator\n",
"import allel"
"output_notebook()"
]
},
{
56 changes: 30 additions & 26 deletions notebooks/genotype_benchmark_compressors.ipynb
@@ -17,18 +17,23 @@
}
],
"source": [
"import sys\n",
"sys.path.insert(0, '..')\n",
"import functools\n",
"import sys\n",
"import timeit\n",
"import zarr\n",
"print('zarr', zarr.__version__)\n",
"from zarr import blosc\n",
"print('blosc', blosc.version())\n",
"import numpy as np\n",
"\n",
"import h5py\n",
"import numpy as np\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt"
"import matplotlib.pyplot as plt\n",
"\n",
"sys.path.insert(0, '..')\n",
"\n",
"import zarr\n",
"from zarr import blosc\n",
"\n",
"print('zarr', zarr.__version__)\n",
"print('blosc', blosc.version())"
]
},
{
@@ -133,8 +138,8 @@
],
"source": [
"# 1M chunks of first dimension\n",
"chunks = (int(2**20 / (genotype_sample.shape[1] * genotype_sample.shape[2])), \n",
" genotype_sample.shape[1], \n",
"chunks = (int(2**20 / (genotype_sample.shape[1] * genotype_sample.shape[2])),\n",
" genotype_sample.shape[1],\n",
" genotype_sample.shape[2])\n",
"chunks"
]
@@ -280,11 +285,11 @@
},
"outputs": [],
"source": [
"@functools.lru_cache(maxsize=None)\n",
"@functools.cache\n",
"def compression_ratios():\n",
" x = list()\n",
" for compression, compression_opts in compression_configs:\n",
" z = zarr.array(genotype_sample, chunks=chunks, compression=compression, \n",
" z = zarr.array(genotype_sample, chunks=chunks, compression=compression,\n",
" compression_opts=compression_opts)\n",
" ratio = z.nbytes / z.nbytes_stored\n",
" x.append(ratio)\n",
@@ -311,8 +316,8 @@
}
],
"source": [
"ratios = compression_ratios() \n",
"labels = ['%s - %s' % (c, o)\n",
"ratios = compression_ratios()\n",
"labels = [f'{c} - {o}'\n",
" for c, o in compression_configs]\n",
"\n",
"fig = plt.figure(figsize=(12, len(compression_configs)*.3))\n",
@@ -347,34 +352,33 @@
},
"outputs": [],
"source": [
"@functools.lru_cache(maxsize=None)\n",
"@functools.cache\n",
"def compression_decompression_times(repeat=3, number=1):\n",
" c = list()\n",
" d = list()\n",
" for compression, compression_opts in compression_configs:\n",
" \n",
"\n",
" def compress():\n",
" zarr.array(genotype_sample, chunks=chunks, compression=compression, \n",
" zarr.array(genotype_sample, chunks=chunks, compression=compression,\n",
" compression_opts=compression_opts)\n",
" \n",
"\n",
" t = timeit.Timer(stmt=compress, globals=locals())\n",
" compress_times = t.repeat(repeat=repeat, number=number)\n",
" c.append(compress_times)\n",
" \n",
" z = zarr.array(genotype_sample, chunks=chunks, compression=compression, \n",
"\n",
" z = zarr.array(genotype_sample, chunks=chunks, compression=compression,\n",
" compression_opts=compression_opts)\n",
" \n",
"\n",
" def decompress():\n",
" z[:]\n",
" \n",
"\n",
" t = timeit.Timer(stmt=decompress, globals=locals())\n",
" decompress_times = t.repeat(repeat=repeat, number=number)\n",
" d.append(decompress_times)\n",
" \n",
"\n",
" log(compression, compression_opts, compress_times, decompress_times)\n",
" \n",
" return c, d\n",
" "
"\n",
" return c, d\n"
]
},
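The decorator change in this file, @functools.lru_cache(maxsize=None) → @functools.cache, is the simplification applied by ruff's UP033 rule; the two forms are equivalent on Python 3.9+, where functools.cache was added. A small self-contained sketch:

    import functools

    @functools.cache  # same behaviour as lru_cache(maxsize=None) on Python 3.9+
    def fib(n: int) -> int:
        # unbounded memoisation: each distinct argument is computed once
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    print(fib(80))  # returns instantly thanks to the cache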
{