diff --git a/cubed/core/ops.py b/cubed/core/ops.py index cb8610f3..37202559 100644 --- a/cubed/core/ops.py +++ b/cubed/core/ops.py @@ -775,6 +775,28 @@ def map_selection( max_num_input_blocks, **kwargs, ) -> "Array": + """ + Apply a function to selected subsets of an input array using standard NumPy indexing notation. + + Parameters + ---------- + func : callable + Function to apply to every block to produce the output array. + Must accept ``block_id`` as a keyword argument (with same meaning as for ``map_blocks``). + selection_function : callable + A function that maps an output chunk key to one or more selections on the input array. + x: Array + The input array. + shape : tuple + Shape of the output array. + dtype : np.dtype + The ``dtype`` of the output array. + chunks : tuple + Chunk shape of blocks in the output array. + max_num_input_blocks : int + The maximum number of input blocks read from the input array. + """ + def key_function(out_key): # compute the selection on x required to get the relevant chunk for out_key in_sel = selection_function(out_key) @@ -1009,6 +1031,18 @@ def wrap(*a, block_id=None, **kw): def rechunk(x, chunks, target_store=None): + """Change the chunking of an array without changing its shape or data. + + Parameters + ---------- + chunks : tuple + The desired chunks of the array after rechunking. + + Returns + ------- + cubed.Array + An array with the desired chunks. + """ if isinstance(chunks, dict): chunks = {validate_axis(c, x.ndim): v for c, v in chunks.items()} for i in range(x.ndim): diff --git a/docs/design.md b/docs/design.md index c969ef95..e2748039 100644 --- a/docs/design.md +++ b/docs/design.md @@ -4,7 +4,7 @@ Cubed is composed of five layers: from the storage layer at the bottom, to the A ![Five layer diagram](images/design.svg) -Blue blocks are implemented in Cubed, green in Rechunker, and red in other projects like Zarr and Beam. +Blue blocks are implemented in Cubed; red blocks in other projects like Zarr and Lithops. Let's go through the layers from the bottom: @@ -14,7 +14,7 @@ Every _array_ in Cubed is backed by a Zarr array. This means that the array type ## Runtime -Cubed uses external runtimes for computation. It follows the Rechunker model (and uses its algorithm) to delegate tasks to stateless executors, which include Python (in-process), Lithops, Modal, and Apache Beam. +Cubed uses external runtimes for computation, delegating tasks to stateless executors, which include Python (in-process), Lithops, Modal, and Apache Beam. ## Primitive operations @@ -45,8 +45,7 @@ These are built on top of the primitive operations, and provide functions that a elemwise map_blocks - map_direct - index + map_selection reduction arg_reduction ``` diff --git a/docs/images/design.svg b/docs/images/design.svg index 68029ea4..ad0d6976 100644 --- a/docs/images/design.svg +++ b/docs/images/design.svg @@ -1 +1 @@ - + diff --git a/docs/images/map_selection.svg b/docs/images/map_selection.svg new file mode 100644 index 00000000..6421447d --- /dev/null +++ b/docs/images/map_selection.svg @@ -0,0 +1 @@ + diff --git a/docs/images/ops.dot b/docs/images/ops.dot index 9994a463..42532170 100644 --- a/docs/images/ops.dot +++ b/docs/images/ops.dot @@ -11,21 +11,20 @@ digraph { // core elemwise [style="filled"; fillcolor="#ffd8b1";]; map_blocks [style="filled"; fillcolor="#ffd8b1";]; - map_direct [style="filled"; fillcolor="#ffd8b1";]; + map_selection [style="filled"; fillcolor="#ffd8b1";]; reduction [style="filled"; fillcolor="#ffd8b1";]; arg_reduction [style="filled"; fillcolor="#ffd8b1";]; elemwise -> blockwise; map_blocks -> blockwise; - map_direct -> map_blocks; + map_selection -> blockwise; reduction -> blockwise; - reduction -> rechunk; arg_reduction -> reduction; // array API // array object - __getitem__ -> map_direct + __getitem__ -> map_selection // elementwise add -> elemwise @@ -34,12 +33,11 @@ digraph { // linear algebra matmul -> blockwise; matmul -> reduction; - outer -> blockwise; // manipulation - concat -> map_direct; + concat -> blockwise; reshape -> rechunk; - reshape -> map_direct; + reshape -> blockwise; squeeze -> map_blocks; // searching @@ -51,18 +49,17 @@ digraph { // utility all -> reduction; - { rank = min; // fix horizontal placing with invisible edges edge[style=invis]; - add -> negative -> outer -> matmul -> __getitem__ -> concat -> reshape -> squeeze -> argmax -> sum -> all; + add -> negative -> squeeze -> __getitem__ -> concat -> matmul -> sum -> all -> argmax -> reshape; rankdir = LR; } { rank = same; - elemwise; map_blocks; reduction; + elemwise; map_blocks; map_selection; reduction; } { rank = max; diff --git a/docs/images/ops.dot.svg b/docs/images/ops.dot.svg index e948db7b..0690f68b 100644 --- a/docs/images/ops.dot.svg +++ b/docs/images/ops.dot.svg @@ -1,244 +1,225 @@ - - + - + blockwise - -blockwise + +blockwise rechunk - -rechunk + +rechunk elemwise - -elemwise + +elemwise elemwise->blockwise - - + + map_blocks - -map_blocks + +map_blocks map_blocks->blockwise - - + + - + -map_direct - -map_direct +map_selection + +map_selection - + -map_direct->map_blocks - - +map_selection->blockwise + + reduction - -reduction + +reduction reduction->blockwise - - - - - -reduction->rechunk - - + + arg_reduction - -arg_reduction + +arg_reduction - + arg_reduction->reduction - - + + __getitem__ - -__getitem__ + +__getitem__ - - -__getitem__->map_direct - - + + +__getitem__->map_selection + + - + concat - -concat + +concat add -add +add - + add->elemwise - - + + negative - -negative + +negative - + negative->elemwise - - + + - - -outer - -outer + + +squeeze + +squeeze - + matmul - -matmul + +matmul - + matmul->blockwise - - + + - + matmul->reduction - - + + - - - -outer->blockwise - - + + +sum + +sum - - - -concat->map_direct - - + + + +concat->blockwise + + + - + reshape - -reshape + +reshape + + + +reshape->blockwise + + - - + reshape->rechunk - - + + - - -reshape->map_direct - - - - - -squeeze - -squeeze - - - + squeeze->map_blocks - - + + + - + argmax - -argmax + +argmax - - + argmax->arg_reduction - - - - - -sum - -sum + + - + - + sum->reduction - - + + - + all - -all + +all - + all->reduction - - + + + diff --git a/docs/operations.md b/docs/operations.md index e7b003d8..7865fdb0 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -39,18 +39,20 @@ This example shows how the `squeeze` operation is implemented using `map_blocks` ![The map_blocks core operation](images/map_blocks.svg) -## `map_direct` +## `map_selection` -The `map_direct` operation is more general than `map_blocks` since it allows input arrays to be read directly, without regard for block boundaries. +The `map_selection` operation selects subsets of an input array using standard NumPy indexing notation. * No input array attributes are preserved in general -* __Multiple__ inputs, __single__ output +* __Single__ input, __single__ output -It is a core operation that is implemented using `map_blocks`, which is in turn implemented using `blockwise`. It works by creating an empty array that has the same shape and chunk structure as the output, and calling `map_blocks` on this empty array, passing in the input arrays as side inputs to the function, which may access them in whatever way is needed. +It is a core operation that is implemented using `blockwise` on the output's blocks. It works by converting indexing selections, such as slices, to keys that refer to blocks in the input array, then retrieving these blocks, slicing them, and assembling them into the final output block. -This example shows how `concat` is implemented using `map_direct`. Each block in the output array is read directly from one or more blocks from the inputs. +This example shows how `index` is implemented using `map_selection`. Each block in the output array is read directly from one or more blocks from the inputs. -![The map_direct core operation](images/map_direct.svg) +![The map_selection core operation](images/map_selection.svg) + +Note: previously, operations that now use `map_selection` were written using `map_direct`, which allowed input arrays to be read directly. The main difference between the two operations is that `map_selection` tracks which blocks are read by the operation, which allows the optimizer to fuse operations. This is unlike `map_direct`, which does not provide information about block inputs, and therefore cannot be fused by the optimizer. ## `blockwise` @@ -63,6 +65,13 @@ This example shows how `outer` is implemented using `blockwise`. Each block from ![The blockwise primitive operation](images/blockwise.svg) +Note: the `general_blockwise` operation is a more general form of `blockwise` that uses a function to specify the block mapping, rather than an index notation, and which supports multiple outputs. + +* No input array attributes are preserved in general +* __Multiple__ inputs, __multiple__ outputs + +For multiple outputs, all output arrays must have matching `numblocks`. + (rechunk-operation)= ## `rechunk` diff --git a/docs/user-guide/optimization.md b/docs/user-guide/optimization.md index 3844d581..b8e3a458 100644 --- a/docs/user-guide/optimization.md +++ b/docs/user-guide/optimization.md @@ -81,9 +81,9 @@ e.visualize() The output explains which operations can or can't be fused, and why: ``` -DEBUG:cubed.core.optimization:can't fuse op-001 since it is not a primitive operation, or it uses map_direct -DEBUG:cubed.core.optimization:can't fuse op-002 since it is not a primitive operation, or it uses map_direct -DEBUG:cubed.core.optimization:can't fuse op-003 since it is not a primitive operation, or it uses map_direct +DEBUG:cubed.core.optimization:can't fuse op-001 since it is not a primitive operation, or it uses an operation that can't be fused (concat or stack) +DEBUG:cubed.core.optimization:can't fuse op-002 since it is not a primitive operation, or it uses an operation that can't be fused (concat or stack) +DEBUG:cubed.core.optimization:can't fuse op-003 since it is not a primitive operation, or it uses an operation that can't be fused (concat or stack) DEBUG:cubed.core.optimization:can't fuse op-004 since no predecessor ops can be fused DEBUG:cubed.primitive.blockwise:can fuse op-005 since num tasks of predecessor ops match ```