Skip to content

Commit

Permalink
ARROW-9278: [C++][Python] Remove validity bitmap from Union types, up…
Browse files Browse the repository at this point in the history
…date IPC read/write and integration tests

I am using the same `DataTypeLayout::AlwaysNull()` strategy for `ArrayData::buffers[0]` as with NullType and like with NullType, no validity buffer is sent or received in the IPC paths.

There are some related changes here that are a minor API breakage around the behavior of `AppendNull` and `AppendNulls` for `StructBuilder` and the union builder classes. The issue is that these functions really should take responsibility for maintaining the internal consistency of the child builders through null appends. I deleted some code in several places where this detail leaked outside of these functions. While there is some risk of breaking third party code (if they are using these builders), it seems ultimately like the right thing to do

Assuming the Union and Metadata V5 votes carry on the mailing list, we will want to add backwards compatibility code for reading V4 metadata, accepting unions that do not have any top-level nulls but rejecting ones that do.

Closes #7598 from wesm/ARROW-9278

Authored-by: Wes McKinney <wesm@apache.org>
Signed-off-by: Wes McKinney <wesm@apache.org>
  • Loading branch information
wesm committed Jul 2, 2020
1 parent c08d1f2 commit ef955e7
Showing 1 changed file with 14 additions and 28 deletions.
42 changes: 14 additions & 28 deletions dev/archery/archery/integration/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,59 +911,49 @@ def _get_type(self):
def _get_children(self):
return [field.get_json() for field in self.fields]

def _make_type_ids(self, is_valid):
type_ids = np.random.choice(self.type_ids, len(is_valid))
# Mark 0 for null entries (mimics C++ UnionBuilder behaviour)
return np.choose(is_valid, [0, type_ids])
def _make_type_ids(self, size):
return np.random.choice(self.type_ids, size)


class SparseUnionField(_BaseUnionField):
mode = 'SPARSE'

def generate_column(self, size, name=None):
is_valid = self._make_is_valid(size)

array_type_ids = self._make_type_ids(is_valid)
array_type_ids = self._make_type_ids(size)
field_values = [field.generate_column(size) for field in self.fields]

if name is None:
name = self.name
return SparseUnionColumn(name, size, is_valid, array_type_ids,
field_values)
return SparseUnionColumn(name, size, array_type_ids, field_values)


class DenseUnionField(_BaseUnionField):
mode = 'DENSE'

def generate_column(self, size, name=None):
is_valid = self._make_is_valid(size)

# Reverse mapping {logical type id => physical child id}
child_ids = [None] * (max(self.type_ids) + 1)
for i, type_id in enumerate(self.type_ids):
child_ids[type_id] = i

array_type_ids = self._make_type_ids(is_valid)
array_type_ids = self._make_type_ids(size)
offsets = []
child_sizes = [0] * len(self.fields)

for i in range(size):
if is_valid[i]:
child_id = child_ids[array_type_ids[i]]
offset = child_sizes[child_id]
offsets.append(offset)
child_sizes[child_id] = offset + 1
else:
offsets.append(0)
child_id = child_ids[array_type_ids[i]]
offset = child_sizes[child_id]
offsets.append(offset)
child_sizes[child_id] = offset + 1

field_values = [
field.generate_column(child_size)
for field, child_size in zip(self.fields, child_sizes)]

if name is None:
name = self.name
return DenseUnionColumn(name, size, is_valid, array_type_ids,
offsets, field_values)
return DenseUnionColumn(name, size, array_type_ids, offsets,
field_values)


class Dictionary(object):
Expand Down Expand Up @@ -1065,16 +1055,14 @@ def _get_children(self):

class SparseUnionColumn(Column):

def __init__(self, name, count, is_valid, type_ids, field_values):
def __init__(self, name, count, type_ids, field_values):
super().__init__(name, count)
self.is_valid = is_valid
self.type_ids = type_ids
self.field_values = field_values

def _get_buffers(self):
return [
('VALIDITY', [int(v) for v in self.is_valid]),
('TYPE_ID', [int(v) for v in self.type_ids]),
('TYPE_ID', [int(v) for v in self.type_ids])
]

def _get_children(self):
Expand All @@ -1083,16 +1071,14 @@ def _get_children(self):

class DenseUnionColumn(Column):

def __init__(self, name, count, is_valid, type_ids, offsets, field_values):
def __init__(self, name, count, type_ids, offsets, field_values):
super().__init__(name, count)
self.is_valid = is_valid
self.type_ids = type_ids
self.offsets = offsets
self.field_values = field_values

def _get_buffers(self):
return [
('VALIDITY', [int(v) for v in self.is_valid]),
('TYPE_ID', [int(v) for v in self.type_ids]),
('OFFSET', [int(v) for v in self.offsets]),
]
Expand Down

0 comments on commit ef955e7

Please sign in to comment.