Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C++ refactoring: ak.unflatten #1360

Merged
merged 5 commits into from
Mar 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
388 changes: 192 additions & 196 deletions src/awkward/_v2/operations/structure/ak_unflatten.py
Original file line number Diff line number Diff line change
@@ -1,204 +1,200 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

import numbers
import awkward as ak

# Shared NumPy-metadata object; this module uses it only for type names
# (np.integer, np.int64, np.int8, np.bool_), not for array computation.
np = ak.nplike.NumpyMetadata.instance()


def unflatten(array, counts, axis=0, highlevel=True, behavior=None):
    """
    Args:
        array: Data to create an array with an additional level from.
        counts (int or array): Number of elements the new level should have.
            If an integer, the new level will be regularly sized; otherwise,
            it will consist of variable-length lists with the given lengths.
            Missing values (None) in an array of `counts` become missing
            lists in the new level.
        axis (int): The dimension at which this operation is applied. The
            outermost dimension is `0`, followed by `1`, etc., and negative
            values count backward from the innermost: `-1` is the innermost
            dimension, `-2` is the next level up, etc.
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level layout (built from `ak._v2.contents` classes).
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.

    Returns an array with an additional level of nesting. This is roughly the
    inverse of #ak.flatten, where `counts` were obtained by #ak.num (both with
    `axis=1`).

    For example,

        >>> original = ak.Array([[0, 1, 2], [], [3, 4], [5], [6, 7, 8, 9]])
        >>> counts = ak.num(original)
        >>> array = ak.flatten(original)
        >>> counts
        <Array [3, 0, 2, 1, 4] type='5 * int64'>
        >>> array
        <Array [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] type='10 * int64'>
        >>> ak.unflatten(array, counts)
        <Array [[0, 1, 2], [], ... [5], [6, 7, 8, 9]] type='5 * var * int64'>

    An inner dimension can be unflattened by setting the `axis` parameter, but
    operations like this constrain the `counts` more tightly.

    For example, we can subdivide an already divided list:

        >>> original = ak.Array([[1, 2, 3, 4], [], [5, 6, 7], [8, 9]])
        >>> print(ak.unflatten(original, [2, 2, 1, 2, 1, 1], axis=1))
        [[[1, 2], [3, 4]], [], [[5], [6, 7]], [[8], [9]]]

    But the counts have to add up to the lengths of those lists. We can't mix
    values from the first `[1, 2, 3, 4]` with values from the next `[5, 6, 7]`.

        >>> print(ak.unflatten(original, [2, 1, 2, 2, 1, 1], axis=1))
        Traceback (most recent call last):
        ...
        ValueError: structure imposed by 'counts' does not fit in the array or partition at axis=1

    An integer `counts` that is negative or larger than the length of the
    data being unflattened raises a ValueError as well.

    See also #ak.num and #ak.flatten.
    """
    # Run the implementation inside an error context that records the
    # operation name and its arguments.
    with ak._v2._util.OperationErrorContext(
        "ak._v2.unflatten",
        dict(
            array=array,
            counts=counts,
            axis=axis,
            highlevel=highlevel,
            behavior=behavior,
        ),
    ):
        return _impl(array, counts, axis, highlevel, behavior)


def _impl(array, counts, axis, highlevel, behavior):
    """
    Implementation of #ak.unflatten; see that function for the meaning of
    the arguments.

    `counts` is either an integer (regular-sized new level) or something
    convertible to a layout of integers (variable-length new level). In the
    second case the counts are turned into cumulative offsets, and those
    offsets are consumed while the new list level is built.

    Raises ValueError if `counts` is not an integer or a one-dimensional
    array of integers, if an integer `counts` is negative or larger than the
    data, or if the structure described by `counts` does not line up with
    the array at `axis`.
    """
    nplike = ak.nplike.of(array)

    layout = ak._v2.operations.convert.to_layout(
        array, allow_record=False, allow_other=False
    )

    def does_not_fit():
        # Single place that builds the "counts don't line up" error, which is
        # raised from three different checks below.
        return ak._v2._util.error(
            ValueError(
                "structure imposed by 'counts' does not fit in the array or partition "
                "at axis={}".format(axis)
            )
        )

    if isinstance(counts, (numbers.Integral, np.integer)):
        current_offsets = None
    else:
        counts = ak._v2.operations.convert.to_layout(
            counts, allow_record=False, allow_other=False
        )

        if counts.is_OptionType:
            # Missing counts: remember where they are (True = missing) and
            # treat them as zero-length lists when computing offsets.
            mask = counts.mask_as_bool(valid_when=False)
            counts = counts.to_numpy(allow_missing=True)
            counts = ak.nplike.numpy.ma.filled(counts, 0)
        elif counts.is_NumpyType or counts.is_UnknownType:
            counts = counts.to_numpy(allow_missing=False)
            mask = False
        else:
            # Without this branch `counts` would stay a layout and `mask`
            # would be unbound, failing later with an unrelated error.
            raise ak._v2._util.error(
                ValueError(
                    "counts must be an integer or a one-dimensional array of integers"
                )
            )

        if counts.ndim != 1:
            raise ak._v2._util.error(ValueError("counts must be one-dimensional"))
        if not issubclass(counts.dtype.type, np.integer):
            raise ak._v2._util.error(ValueError("counts must be integers"))

        # Offsets are [0, cumsum(counts)...]. They live in a one-element list
        # so that `doit` can replace the remaining offsets after using some.
        current_offsets = [nplike.empty(len(counts) + 1, np.int64)]
        current_offsets[0][0] = 0
        nplike.cumsum(counts, out=current_offsets[0][1:])

    def doit(layout):
        # Wrap `layout` in one new list level described by `counts`.
        if isinstance(counts, (numbers.Integral, np.integer)):
            if counts < 0 or counts > len(layout):
                raise ak._v2._util.error(
                    ValueError("too large counts for array or negative counts")
                )
            out = ak._v2.contents.RegularArray(layout, counts)

        else:
            # Find the last offset that is <= len(layout); it must be exactly
            # len(layout), otherwise the counts do not tile this layout.
            position = (
                nplike.searchsorted(
                    current_offsets[0], nplike.array([len(layout)]), side="right"
                )[0]
                - 1
            )
            if position >= len(current_offsets[0]) or current_offsets[0][
                position
            ] != len(layout):
                raise does_not_fit()

            # Use the offsets up to `position`; keep the rest, shifted so that
            # they start at zero again.
            offsets = current_offsets[0][: position + 1]
            current_offsets[0] = current_offsets[0][position:] - len(layout)

            out = ak._v2.contents.ListOffsetArray(ak._v2.index.Index64(offsets), layout)
            if not isinstance(mask, (bool, np.bool_)):
                # `mask` is an array only when `counts` had missing values;
                # a nonzero byte marks a missing list (valid_when=False).
                index = ak._v2.index.Index8(nplike.asarray(mask).astype(np.int8))
                out = ak._v2.contents.ByteMaskedArray(index, out, valid_when=False)

        return out

    if axis == 0 or layout.axis_wrap_if_negative(axis) == 0:
        out = doit(layout)

    else:
        # NOTE(review): `transform` is only called once, so wrapping this in a
        # function is superfluous but harmless. In v1 it was needed so that
        # the same procedure could be applied to each partition; there are no
        # PartitionedArrays anymore. Deliberately not refactored.

        def transform(layout, depth, posaxis):
            # Pack the current layout. This ensures that the `counts` array,
            # which is computed with these layouts applied, aligns with the
            # internal layout to be unflattened (#910)
            layout = layout.packed()

            posaxis = layout.axis_wrap_if_negative(posaxis)
            if posaxis == depth and layout.is_ListType:
                # We are one *above* the level where we want to apply this.
                listoffsetarray = layout.toListOffsetArray64(True)
                outeroffsets = nplike.asarray(listoffsetarray.offsets)

                content = doit(listoffsetarray.content[: outeroffsets[-1]])
                if isinstance(content, ak._v2.contents.ByteMaskedArray):
                    inneroffsets = nplike.asarray(content.content.offsets)
                elif isinstance(content, ak._v2.contents.RegularArray):
                    inneroffsets = nplike.asarray(
                        content.toListOffsetArray64(True).offsets
                    )
                else:
                    inneroffsets = nplike.asarray(content.offsets)

                # Every outer list boundary has to coincide with a boundary of
                # the newly created inner lists.
                positions = (
                    nplike.searchsorted(inneroffsets, outeroffsets, side="right") - 1
                )
                if not nplike.array_equal(inneroffsets[positions], outeroffsets):
                    raise does_not_fit()
                # The outer offsets must start at zero.
                positions[0] = 0

                return ak._v2.contents.ListOffsetArray(
                    ak._v2.index.Index64(positions), content
                )

            else:
                # NOTE(review): there is no recursion into child layouts here,
                # so an axis that does not hit a list node at depth 1 returns
                # the layout unchanged — confirm this is intended.
                return layout

        out = transform(layout, depth=1, posaxis=axis)

    # With array-like counts, a successful run consumes every offset and
    # leaves exactly [0]; anything else means the counts did not fit.
    if current_offsets is not None and not (
        len(current_offsets[0]) == 1 and current_offsets[0][0] == 0
    ):
        raise does_not_fit()

    return ak._v2._util.wrap(out, behavior, highlevel)
Loading