From 064612159151eab1a21c9b183751f7dde1b3a8ed Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 14 Apr 2020 18:26:00 +0200 Subject: [PATCH 1/2] ARROW-590: [Integration][C++] Implement union types Implement integration tests for sparse and dense unions. Enable them for C++ (only). --- dev/archery/archery/integration/datagen.py | 229 +++++++++++++++++---- docs/source/format/Integration.rst | 36 ++-- 2 files changed, 206 insertions(+), 59 deletions(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 3c15abc2ab2..8713c2eae2f 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -48,11 +48,12 @@ def get_json(self): return OrderedDict(entries) - def _make_is_valid(self, size): + def _make_is_valid(self, size, null_probability=0.4): if self.nullable: - return np.random.randint(0, 2, size=size) + return (np.random.random_sample(size) > null_probability + ).astype(np.int8) else: - return np.ones(size) + return np.ones(size, dtype=np.int8) class Column(object): @@ -95,7 +96,7 @@ def _get_children(self): class PrimitiveColumn(Column): def __init__(self, name, count, is_valid, values): - super(PrimitiveColumn, self).__init__(name, count) + super().__init__(name, count) self.is_valid = is_valid self.values = values @@ -117,8 +118,8 @@ class NullColumn(Column): class NullField(PrimitiveField): def __init__(self, name, metadata=[]): - super(NullField, self).__init__(name, nullable=True, - metadata=metadata) + super().__init__(name, nullable=True, + metadata=metadata) def _get_type(self): return OrderedDict([('name', 'null')]) @@ -137,8 +138,8 @@ def __init__(self, name, is_signed, bit_width, *, nullable=True, metadata=[], min_value=TEST_INT_MIN, max_value=TEST_INT_MAX): - super(IntegerField, self).__init__(name, nullable=nullable, - metadata=metadata) + super().__init__(name, nullable=nullable, + metadata=metadata) self.is_signed = is_signed self.bit_width = bit_width self.min_value = min_value @@ -193,7 +194,7 @@ def __init__(self, name, unit, *, nullable=True, metadata=[]): bit_width = 32 if unit == self.DAY else 64 min_value, max_value = self._ranges[unit] - super(DateField, self).__init__( + super().__init__( name, True, bit_width, nullable=nullable, metadata=metadata, min_value=min_value, max_value=max_value @@ -234,9 +235,9 @@ class TimeField(IntegerField): def __init__(self, name, unit='s', *, nullable=True, metadata=[]): min_val, max_val = self._ranges[unit] - super(TimeField, self).__init__(name, True, self.BIT_WIDTHS[unit], - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) + super().__init__(name, True, self.BIT_WIDTHS[unit], + nullable=nullable, metadata=metadata, + min_value=min_val, max_value=max_val) self.unit = unit def _get_type(self): @@ -262,11 +263,11 @@ class TimestampField(IntegerField): def __init__(self, name, unit='s', tz=None, *, nullable=True, metadata=[]): min_val, max_val = self._ranges[unit] - super(TimestampField, self).__init__(name, True, 64, - nullable=nullable, - metadata=metadata, - min_value=min_val, - max_value=max_val) + super().__init__(name, True, 64, + nullable=nullable, + metadata=metadata, + min_value=min_val, + max_value=max_val) self.unit = unit self.tz = tz @@ -287,7 +288,7 @@ class DurationIntervalField(IntegerField): def __init__(self, name, unit='s', *, nullable=True, metadata=[]): min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max, - super(DurationIntervalField, self).__init__( + 
super().__init__( name, True, 64, nullable=nullable, metadata=metadata, min_value=min_val, max_value=max_val) @@ -305,7 +306,7 @@ def _get_type(self): class YearMonthIntervalField(IntegerField): def __init__(self, name, *, nullable=True, metadata=[]): min_val, max_val = [-10000*12, 10000*12] # +/- 10000 years. - super(YearMonthIntervalField, self).__init__( + super().__init__( name, True, 32, nullable=nullable, metadata=metadata, min_value=min_val, max_value=max_val) @@ -321,9 +322,9 @@ def _get_type(self): class DayTimeIntervalField(PrimitiveField): def __init__(self, name, *, nullable=True, metadata=[]): - super(DayTimeIntervalField, self).__init__(name, - nullable=True, - metadata=metadata) + super().__init__(name, + nullable=True, + metadata=metadata) @property def numpy_type(self): @@ -352,9 +353,9 @@ class FloatingPointField(PrimitiveField): def __init__(self, name, bit_width, *, nullable=True, metadata=[]): - super(FloatingPointField, self).__init__(name, - nullable=nullable, - metadata=metadata) + super().__init__(name, + nullable=nullable, + metadata=metadata) self.bit_width = bit_width self.precision = { @@ -404,8 +405,8 @@ def decimal_range_from_precision(precision): class DecimalField(PrimitiveField): def __init__(self, name, precision, scale, bit_width=128, *, nullable=True, metadata=[]): - super(DecimalField, self).__init__(name, nullable=True, - metadata=metadata) + super().__init__(name, nullable=True, + metadata=metadata) self.precision = precision self.scale = scale self.bit_width = bit_width @@ -434,7 +435,7 @@ def generate_column(self, size, name=None): class DecimalColumn(PrimitiveColumn): def __init__(self, name, count, is_valid, values, bit_width=128): - super(DecimalColumn, self).__init__(name, count, is_valid, values) + super().__init__(name, count, is_valid, values) self.bit_width = bit_width def _encode_value(self, x): @@ -463,8 +464,8 @@ class FixedSizeBinaryField(PrimitiveField): def __init__(self, name, byte_width, *, nullable=True, metadata=[]): - super(FixedSizeBinaryField, self).__init__(name, nullable=nullable, - metadata=metadata) + super().__init__(name, nullable=nullable, + metadata=metadata) self.byte_width = byte_width @property @@ -668,8 +669,8 @@ class ListField(Field): def __init__(self, name, value_field, *, nullable=True, metadata=[]): - super(ListField, self).__init__(name, nullable=nullable, - metadata=metadata) + super().__init__(name, nullable=nullable, + metadata=metadata) self.value_field = value_field @property @@ -747,8 +748,8 @@ class MapField(Field): def __init__(self, name, key_field, item_field, *, nullable=True, metadata=[], keys_sorted=False): - super(MapField, self).__init__(name, nullable=nullable, - metadata=metadata) + super().__init__(name, nullable=nullable, + metadata=metadata) assert not key_field.nullable self.key_field = key_field @@ -790,7 +791,7 @@ def generate_column(self, size, name=None): class MapColumn(Column): def __init__(self, name, count, is_valid, offsets, pairs): - super(MapColumn, self).__init__(name, count) + super().__init__(name, count) self.is_valid = is_valid self.offsets = offsets self.pairs = pairs @@ -809,8 +810,8 @@ class FixedSizeListField(Field): def __init__(self, name, value_field, list_size, *, nullable=True, metadata=[]): - super(FixedSizeListField, self).__init__(name, nullable=nullable, - metadata=metadata) + super().__init__(name, nullable=nullable, + metadata=metadata) self.value_field = value_field self.list_size = list_size @@ -835,7 +836,7 @@ def generate_column(self, size, name=None): 
class FixedSizeListColumn(Column): def __init__(self, name, count, is_valid, values): - super(FixedSizeListColumn, self).__init__(name, count) + super().__init__(name, count) self.is_valid = is_valid self.values = values @@ -852,8 +853,8 @@ class StructField(Field): def __init__(self, name, fields, *, nullable=True, metadata=[]): - super(StructField, self).__init__(name, nullable=nullable, - metadata=metadata) + super().__init__(name, nullable=nullable, + metadata=metadata) self.fields = fields def _get_type(self): @@ -873,6 +874,84 @@ def generate_column(self, size, name=None): return StructColumn(name, size, is_valid, field_values) +class _BaseUnionField(Field): + + def __init__(self, name, fields, type_ids=None, *, nullable=True, + metadata=[]): + super().__init__(name, nullable=nullable, metadata=metadata) + if type_ids is None: + type_ids = list(range(fields)) + else: + assert len(fields) == len(type_ids) + self.fields = fields + self.type_ids = type_ids + assert all(x >= 0 for x in self.type_ids) + + def _get_type(self): + return OrderedDict([ + ('name', 'union'), + ('mode', self.mode), + ('typeIds', self.type_ids), + ]) + + def _get_children(self): + return [field.get_json() for field in self.fields] + + def _make_type_ids(self, is_valid): + type_ids = np.random.choice(self.type_ids, len(is_valid)) + # Mark 0 for null entries (mimicks C++ UnionBuilder behaviour) + return np.choose(is_valid, [0, type_ids]) + + +class SparseUnionField(_BaseUnionField): + mode = 'SPARSE' + + def generate_column(self, size, name=None): + is_valid = self._make_is_valid(size) + + array_type_ids = self._make_type_ids(is_valid) + field_values = [field.generate_column(size) for field in self.fields] + + if name is None: + name = self.name + return SparseUnionColumn(name, size, is_valid, array_type_ids, + field_values) + + +class DenseUnionField(_BaseUnionField): + mode = 'DENSE' + + def generate_column(self, size, name=None): + is_valid = self._make_is_valid(size) + + # Reverse mapping {logical type id => physical child id} + child_ids = [None] * (max(self.type_ids) + 1) + for i, type_id in enumerate(self.type_ids): + child_ids[type_id] = i + + array_type_ids = self._make_type_ids(is_valid) + offsets = [] + child_sizes = [0] * len(self.fields) + + for i in range(size): + if is_valid[i]: + child_id = child_ids[array_type_ids[i]] + offset = child_sizes[child_id] + offsets.append(offset) + child_sizes[child_id] = offset + 1 + else: + offsets.append(0) + + field_values = [ + field.generate_column(child_size) + for field, child_size in zip(self.fields, child_sizes)] + + if name is None: + name = self.name + return DenseUnionColumn(name, size, is_valid, array_type_ids, + offsets, field_values) + + class Dictionary(object): def __init__(self, id_, field, size, name=None, ordered=False): @@ -896,8 +975,8 @@ class DictionaryField(Field): def __init__(self, name, index_field, dictionary, *, nullable=True, metadata=[]): - super(DictionaryField, self).__init__(name, nullable=nullable, - metadata=metadata) + super().__init__(name, nullable=nullable, + metadata=metadata) assert index_field.name == '' assert isinstance(index_field, IntegerField) assert isinstance(dictionary, Dictionary) @@ -928,7 +1007,7 @@ def generate_column(self, size, name=None): class StructColumn(Column): def __init__(self, name, count, is_valid, field_values): - super(StructColumn, self).__init__(name, count) + super().__init__(name, count) self.is_valid = is_valid self.field_values = field_values @@ -941,6 +1020,44 @@ def _get_children(self): 
return [field.get_json() for field in self.field_values] +class SparseUnionColumn(Column): + + def __init__(self, name, count, is_valid, type_ids, field_values): + super().__init__(name, count) + self.is_valid = is_valid + self.type_ids = type_ids + self.field_values = field_values + + def _get_buffers(self): + return [ + ('VALIDITY', [int(v) for v in self.is_valid]), + ('TYPE_ID', [int(v) for v in self.type_ids]), + ] + + def _get_children(self): + return [field.get_json() for field in self.field_values] + + +class DenseUnionColumn(Column): + + def __init__(self, name, count, is_valid, type_ids, offsets, field_values): + super().__init__(name, count) + self.is_valid = is_valid + self.type_ids = type_ids + self.offsets = offsets + self.field_values = field_values + + def _get_buffers(self): + return [ + ('VALIDITY', [int(v) for v in self.is_valid]), + ('TYPE_ID', [int(v) for v in self.type_ids]), + ('OFFSET', [int(v) for v in self.offsets]), + ] + + def _get_children(self): + return [field.get_json() for field in self.field_values] + + class RecordBatch(object): def __init__(self, count, columns): @@ -1223,6 +1340,27 @@ def generate_nested_large_offsets_case(): return _generate_file("nested_large_offsets", fields, batch_sizes) +def generate_unions_case(): + fields = [ + SparseUnionField('sparse', [get_field('f1', 'int32'), + get_field('f2', 'utf8')], + type_ids=[5, 7]), + DenseUnionField('dense', [get_field('f1', 'int16'), + get_field('f2', 'binary')], + type_ids=[10, 20]), + SparseUnionField('sparse', [get_field('f1', 'float32', nullable=False), + get_field('f2', 'bool')], + type_ids=[5, 7], nullable=False), + DenseUnionField('dense', [get_field('f1', 'uint8', nullable=False), + get_field('f2', 'uint16'), + NullField('f3')], + type_ids=[42, 43, 44], nullable=False), + ] + + batch_sizes = [0, 11] + return _generate_file("union", fields, batch_sizes) + + def generate_dictionary_case(): dict0 = Dictionary(0, StringField('dictionary1'), size=10, name='DICT0') dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1') @@ -1308,6 +1446,11 @@ def _temp_path(): .skip_category('Java') # TODO(ARROW-6111) .skip_category('JS'), + generate_unions_case() + .skip_category('Go') + .skip_category('Java') # TODO(ARROW-1692) + .skip_category('JS'), + generate_custom_metadata_case().skip_category('Go') .skip_category('Java') .skip_category('JS'), diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 04161b9cb56..8ba306a2664 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -256,7 +256,7 @@ Union: :: { "name" : "union", - "mode" : "Sparse|Dense", + "mode" : "SPARSE|DENSE", "typeIds" : [ /* integer */ ] } @@ -362,7 +362,7 @@ For ``FieldData`` inside of a ``DictionaryBatch``, the "name" field does not correspond to anything. Here ``$BUFFER_TYPE`` is one of ``VALIDITY``, ``OFFSET`` (for -variable-length types, such as strings and lists), ``TYPE`` (for unions), +variable-length types, such as strings and lists), ``TYPE_ID`` (for unions), or ``DATA``. ``BufferData`` is encoded based on the type of buffer: @@ -371,27 +371,31 @@ or ``DATA``. ``Field`` still has a ``VALIDITY`` array, even though all values are 1. 
* ``OFFSET``: a JSON array of integers for 32-bit offsets or string-formatted integers for 64-bit offsets -* ``TYPE``: a JSON array of integers +* ``TYPE_ID``: a JSON array of integers * ``DATA``: a JSON array of encoded values The value encoding for ``DATA`` is different depending on the logical type: -* For boolean type: an array of 1 (true) and 0 (false) -* For integer-based types (including timestamps): an array of integers -* For 64-bit integers: an array of integers formatted as JSON strings - to avoid loss of precision -* For floating point types: as is. Values are limited to 3 decimal places to - avoid loss of precision -* For Binary types, a hex-string is produced to encode a variable- or - fixed-size binary value - -For "list" type, ``BufferData`` has ``VALIDITY`` and ``OFFSET``, and the -rest of the data is inside "children". These child ``FieldData`` contain all -of the same attributes as non-child data, so in the example of a list of -``int32``, the child data has ``VALIDITY`` and ``DATA``. +* For boolean type: an array of 1 (true) and 0 (false). +* For integer-based types (including timestamps): an array of JSON numbers. +* For 64-bit integers: an array of integers formatted as JSON strings, + so as to avoid loss of precision. +* For floating point types: an array of JSON numbers. Values are limited + to 3 decimal places to avoid loss of precision. +* For binary types, an array of uppercase hex-encoded strings, so as + to represent arbitrary binary data. +* For UTF-8 string types, an array of JSON strings. + +For "list" and "largelist" types, ``BufferData`` has ``VALIDITY`` and +``OFFSET``, and the rest of the data is inside "children". These child +``FieldData`` contain all of the same attributes as non-child data, so in +the example of a list of ``int32``, the child data has ``VALIDITY`` and +``DATA``. + For "fixedsizelist", there is no ``OFFSET`` member because the offsets are implied by the field's "listSize". + Note that the "count" for these child data may not match the parent "count". For example, if a ``RecordBatch`` has 7 rows and contains a ``FixedSizeList`` of ``listSize`` 4, then the data inside the "children" of that ``FieldData`` From aebb4914cb001811f8a187dd4b7ba3a4066bfd7a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 15 Apr 2020 20:43:39 +0200 Subject: [PATCH 2/2] Update dev/archery/archery/integration/datagen.py Co-Authored-By: Benjamin Kietzman --- dev/archery/archery/integration/datagen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 8713c2eae2f..6e01bbfb344 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -899,7 +899,7 @@ def _get_children(self): def _make_type_ids(self, is_valid): type_ids = np.random.choice(self.type_ids, len(is_valid)) - # Mark 0 for null entries (mimicks C++ UnionBuilder behaviour) + # Mark 0 for null entries (mimics C++ UnionBuilder behaviour) return np.choose(is_valid, [0, type_ids])
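For reference, the ``Integration.rst`` rules updated above, together with ``SparseUnionColumn._get_buffers``, imply that a sparse union column such as the ``sparse`` field built by ``generate_unions_case()`` (``typeIds`` ``[5, 7]``, children ``f1: int32`` and ``f2: utf8``) is serialized as ``FieldData`` with ``VALIDITY`` and ``TYPE_ID`` buffers plus full-length children. The following hand-written sketch is only illustrative — it is not actual generator output, and the counts and values are made up — but it shows the expected shape::

    {
      "name": "sparse",
      "count": 3,
      "VALIDITY": [1, 1, 1],
      "TYPE_ID": [5, 7, 5],
      "children": [
        {"name": "f1", "count": 3, "VALIDITY": [1, 1, 1],
         "DATA": [12, 0, -7]},
        {"name": "f2", "count": 3, "VALIDITY": [1, 1, 1],
         "OFFSET": [0, 2, 4, 4],
         "DATA": ["ab", "cd", ""]}
      ]
    }

In the sparse case both children have the same length as the union, so slots not selected by ``TYPE_ID`` (here ``f2`` at indices 0 and 2) simply carry unused values. A dense union additionally carries an ``OFFSET`` buffer at the union level, with each offset indexing into the child selected by the corresponding ``TYPE_ID``, and each child only as long as the number of slots that select it — this is what ``DenseUnionField.generate_column`` computes via its per-child running ``child_sizes``.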