Skip to content

Commit

Permalink
apacheGH-31621: [JS] Fix Union null bitmaps (apache#37122)
Browse files Browse the repository at this point in the history
This PR fixes `Union` null handling, and re-enables the disabled `SparseUnion` test.

The format doc [says](https://arrow.apache.org/docs/format/Columnar.html#union-layout):
> Unlike other data types, unions do not have their own validity bitmap.

Therefore we need to remove null masks from union types and allow them to delegate validity to their children.

Also fixes apache#37063 for good measure.

* Closes: apache#31621
* Closes: apache#37063
* Closes apache#24123
* Closes apache#17168

Authored-by: ptaylor <paul.e.taylor@me.com>
Signed-off-by: Dominik Moritz <domoritz@gmail.com>
  • Loading branch information
trxcllnt authored and loicalleyne committed Nov 13, 2023
1 parent cb00caa commit abe3d11
Show file tree
Hide file tree
Showing 33 changed files with 394 additions and 235 deletions.
39 changes: 16 additions & 23 deletions dev/archery/archery/integration/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1556,18 +1556,18 @@ def generate_nested_large_offsets_case():

def generate_unions_case():
fields = [
SparseUnionField('sparse', [get_field('f1', 'int32'),
get_field('f2', 'utf8')],
SparseUnionField('sparse_1', [get_field('f1', 'int32'),
get_field('f2', 'utf8')],
type_ids=[5, 7]),
DenseUnionField('dense', [get_field('f1', 'int16'),
get_field('f2', 'binary')],
DenseUnionField('dense_1', [get_field('f1', 'int16'),
get_field('f2', 'binary')],
type_ids=[10, 20]),
SparseUnionField('sparse', [get_field('f1', 'float32', nullable=False),
get_field('f2', 'bool')],
SparseUnionField('sparse_2', [get_field('f1', 'float32', nullable=False),
get_field('f2', 'bool')],
type_ids=[5, 7], nullable=False),
DenseUnionField('dense', [get_field('f1', 'uint8', nullable=False),
get_field('f2', 'uint16'),
NullField('f3')],
DenseUnionField('dense_2', [get_field('f1', 'uint8', nullable=False),
get_field('f2', 'uint16'),
NullField('f3')],
type_ids=[42, 43, 44], nullable=False),
]

Expand Down Expand Up @@ -1669,11 +1669,9 @@ def _temp_path():
.skip_category('C#')
.skip_category('JS'),

generate_null_case([10, 0])
.skip_category('JS'), # TODO(ARROW-7900)
generate_null_case([10, 0]),

generate_null_trivial_case([0, 0])
.skip_category('JS'), # TODO(ARROW-7900)
generate_null_trivial_case([0, 0]),

generate_decimal128_case(),

Expand All @@ -1699,8 +1697,7 @@ def _temp_path():

generate_non_canonical_map_case()
.skip_category('C#')
.skip_category('Java') # TODO(ARROW-8715)
.skip_category('JS'), # TODO(ARROW-8716)
.skip_category('Java'), # TODO(ARROW-8715)

generate_nested_case(),

Expand All @@ -1711,12 +1708,10 @@ def _temp_path():
.skip_category('JS'),

generate_unions_case()
.skip_category('C#')
.skip_category('JS'),
.skip_category('C#'),

generate_custom_metadata_case()
.skip_category('C#')
.skip_category('JS'),
.skip_category('C#'),

generate_duplicate_fieldnames_case()
.skip_category('C#')
Expand All @@ -1731,8 +1726,7 @@ def _temp_path():

generate_nested_dictionary_case()
.skip_category('C#')
.skip_category('Java') # TODO(ARROW-7779)
.skip_category('JS'),
.skip_category('Java'), # TODO(ARROW-7779)

generate_run_end_encoded_case()
.skip_category('C#')
Expand All @@ -1741,8 +1735,7 @@ def _temp_path():
.skip_category('Rust'),

generate_extension_case()
.skip_category('C#')
.skip_category('JS'),
.skip_category('C#'),
]

generated_paths = []
Expand Down
74 changes: 60 additions & 14 deletions dev/archery/archery/integration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def _gold_tests(self, gold_dir):
skip = set()
if name == 'union' and prefix == '0.17.1':
skip.add("Java")
skip.add("JS")
if prefix == '1.0.0-bigendian' or prefix == '1.0.0-littleendian':
skip.add("C#")
skip.add("Java")
Expand Down Expand Up @@ -500,33 +501,78 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True,


def write_js_test_json(directory):
datagen.generate_primitive_case([], name='primitive_no_batches').write(
os.path.join(directory, 'primitive-no-batches.json')
)
datagen.generate_primitive_case([17, 20], name='primitive').write(
os.path.join(directory, 'primitive.json')
)
datagen.generate_primitive_case([0, 0, 0], name='primitive_zerolength').write(
os.path.join(directory, 'primitive-empty.json')
)
# datagen.generate_primitive_large_offsets_case([17, 20]).write(
# os.path.join(directory, 'primitive-large-offsets.json')
# )
datagen.generate_null_case([10, 0]).write(
os.path.join(directory, 'null.json')
)
datagen.generate_null_trivial_case([0, 0]).write(
os.path.join(directory, 'null-trivial.json')
)
datagen.generate_decimal128_case().write(
os.path.join(directory, 'decimal128.json')
)
# datagen.generate_decimal256_case().write(
# os.path.join(directory, 'decimal256.json')
# )
datagen.generate_datetime_case().write(
os.path.join(directory, 'datetime.json')
)
# datagen.generate_duration_case().write(
# os.path.join(directory, 'duration.json')
# )
# datagen.generate_interval_case().write(
# os.path.join(directory, 'interval.json')
# )
# datagen.generate_month_day_nano_interval_case().write(
# os.path.join(directory, 'month_day_nano_interval.json')
# )
datagen.generate_map_case().write(
os.path.join(directory, 'map.json')
)
datagen.generate_non_canonical_map_case().write(
os.path.join(directory, 'non_canonical_map.json')
)
datagen.generate_nested_case().write(
os.path.join(directory, 'nested.json')
)
datagen.generate_decimal128_case().write(
os.path.join(directory, 'decimal.json')
datagen.generate_recursive_nested_case().write(
os.path.join(directory, 'recursive-nested.json')
)
datagen.generate_decimal256_case().write(
os.path.join(directory, 'decimal256.json')
# datagen.generate_nested_large_offsets_case().write(
# os.path.join(directory, 'nested-large-offsets.json')
# )
datagen.generate_unions_case().write(
os.path.join(directory, 'unions.json')
)
datagen.generate_datetime_case().write(
os.path.join(directory, 'datetime.json')
datagen.generate_custom_metadata_case().write(
os.path.join(directory, 'custom-metadata.json')
)
# datagen.generate_duplicate_fieldnames_case().write(
# os.path.join(directory, 'duplicate-fieldnames.json')
# )
datagen.generate_dictionary_case().write(
os.path.join(directory, 'dictionary.json')
)
datagen.generate_dictionary_unsigned_case().write(
os.path.join(directory, 'dictionary_unsigned.json')
os.path.join(directory, 'dictionary-unsigned.json')
)
datagen.generate_primitive_case([]).write(
os.path.join(directory, 'primitive_no_batches.json')
datagen.generate_nested_dictionary_case().write(
os.path.join(directory, 'dictionary-nested.json')
)
datagen.generate_primitive_case([7, 10]).write(
os.path.join(directory, 'primitive.json')
)
datagen.generate_primitive_case([0, 0, 0]).write(
os.path.join(directory, 'primitive-empty.json')
# datagen.generate_run_end_encoded_case().write(
# os.path.join(directory, 'run_end_encoded.json')
# )
datagen.generate_extension_case().write(
os.path.join(directory, 'extension.json')
)
8 changes: 4 additions & 4 deletions docs/source/status.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Data Types
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Float32/64 |||||||||
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Decimal128 |||| |||| |
| Decimal128 |||| |||| |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Decimal256 |||| |||| |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
Expand Down Expand Up @@ -92,7 +92,7 @@ Data Types
| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift |
| (special) | | | | | | | | |
+===================+=======+=======+=======+============+=======+=======+=======+=======+
| Dictionary || ✓ (2) ||(2) | ✓ (2) | ✓ (2) || |
| Dictionary || ✓ (2) || | ✓ (2) | ✓ (2) || |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Extension |||| | ||| |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
Expand Down Expand Up @@ -125,7 +125,7 @@ IPC Format
+-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Replacement dictionaries |||| | | || |
+-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Delta dictionaries | ✓ (1) | | ✓ (1) | || || |
| Delta dictionaries | ✓ (1) | | ✓ (1) | || || |
+-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Tensors || | | | | | | |
+-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+
Expand All @@ -135,7 +135,7 @@ IPC Format
+-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Endianness conversion | ✓ (2) | | ✓ (2) | | | | | |
+-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Custom schema metadata |||| |||| |
| Custom schema metadata |||| |||| |
+-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+

Notes:
Expand Down
6 changes: 3 additions & 3 deletions js/.eslintrc.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ module.exports = {
"unicorn/text-encoding-identifier-case": "off",
"unicorn/prefer-top-level-await": "off",

"unicorn/consistent-destructuring": "warn",
"unicorn/no-array-reduce": ["warn", { "allowSimpleOperations": true }],
"unicorn/no-await-expression-member": "warn",
"unicorn/consistent-destructuring": "off",
"unicorn/no-array-reduce": "off",
"unicorn/no-await-expression-member": "off",
"unicorn/no-useless-undefined": "warn",
"unicorn/consistent-function-scoping": "warn",
"unicorn/prefer-math-trunc": "warn",
Expand Down
37 changes: 26 additions & 11 deletions js/.vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
"type": "node",
"request": "launch",
"name": "Debug Gulp Build",
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/node_modules/gulp/bin/gulp.js",
"args": [
"build",
Expand All @@ -71,7 +72,7 @@
"type": "node",
"request": "launch",
"name": "Debug Unit Tests",
"cwd": "${workspaceRoot}",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"program": "${workspaceFolder}/node_modules/.bin/jest",
"skipFiles": [
Expand All @@ -96,47 +97,58 @@
"type": "node",
"request": "launch",
"name": "Debug Integration Tests",
"cwd": "${workspaceRoot}",
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/bin/integration.js",
"skipFiles": [
"<node_internals>/**/*.js",
"${workspaceFolder}/node_modules/**/*.js"
],
"env": {
"NODE_NO_WARNINGS": "1",
"ARROW_JS_DEBUG": "src",
"TS_NODE_CACHE": "false"
},
"runtimeArgs": [
"-r",
"ts-node/register"
],
"args": [
"--mode",
"VALIDATE"
"VALIDATE",
"-j", "test/data/json/unions.json",
"-a", "./test/data/cpp/stream/struct_example.arrow"
]
},
{
"type": "node",
"request": "launch",
"name": "Debug Bundle",
"cwd": "${workspaceFolder}",
"program": "${input:BUNDLE_FILE}",
"request": "launch",
"skipFiles": [
"<node_internals>/**"
],
"type": "node"
]
},
{
"type": "node",
"request": "launch",
"name": "Debug Benchmarks",
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/perf/index.ts",
"request": "launch",
"skipFiles": [
"<node_internals>/**",
"${workspaceFolder}/node_modules/**/*.js"
],
"runtimeArgs": [
"--loader",
"ts-node/esm/transpile-only"
],
"type": "node"
]
},
{
"type": "node",
"request": "launch",
"name": "Debug bin/arrow2csv",
"cwd": "${workspaceFolder}",
"env": {
"ARROW_JS_DEBUG": "src",
"TS_NODE_CACHE": "false"
Expand All @@ -160,6 +172,7 @@
"type": "node",
"request": "launch",
"name": "Debug bin/file-to-stream",
"cwd": "${workspaceFolder}",
"env": {
"ARROW_JS_DEBUG": "src",
"TS_NODE_CACHE": "false"
Expand All @@ -182,6 +195,7 @@
"type": "node",
"request": "launch",
"name": "Debug bin/stream-to-file",
"cwd": "${workspaceFolder}",
"env": {
"ARROW_JS_DEBUG": "src",
"TS_NODE_CACHE": "false"
Expand All @@ -204,6 +218,7 @@
"type": "node",
"request": "launch",
"name": "Debug bin/json-to-arrow",
"cwd": "${workspaceFolder}",
"env": {
"ARROW_JS_DEBUG": "src",
"TS_NODE_CACHE": "false"
Expand All @@ -230,6 +245,7 @@
"type": "node",
"request": "launch",
"name": "Debug bin/print-buffer-alignment",
"cwd": "${workspaceFolder}",
"env": {
"ARROW_JS_DEBUG": "src",
"TS_NODE_CACHE": "false"
Expand All @@ -251,10 +267,9 @@
"type": "node",
"name": "vscode-jest-tests",
"request": "launch",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"internalConsoleOptions": "neverOpen",
"disableOptimisticBPs": true,
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/node_modules/.bin/jest",
"runtimeArgs": [
"--experimental-vm-modules"
Expand Down
6 changes: 5 additions & 1 deletion js/bin/arrow2csv.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,9 @@ const arrow2csv = Path.join(here, `src/bin/arrow2csv.ts`);
const env = { ...process.env, TS_NODE_TRANSPILE_ONLY: `true` };

require('child_process').spawn(`node`, [
`--loader`, 'ts-node/esm/transpile-only', arrow2csv, ...process.argv.slice(2)
`-r`,
`ts-node/register`,
`--loader`,
`ts-node/esm/transpile-only`,
arrow2csv, ...process.argv.slice(2)
], { cwd: here, env, stdio: `inherit` });
6 changes: 3 additions & 3 deletions js/bin/integration.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@

const fs = require('fs');
const Path = require('path');
const { promisify } = require('util');
const glob = promisify(require('glob').glob);
const { glob } = require('glob');
const { zip } = require('ix/iterable/zip');
const { parse: bignumJSONParse } = require('json-bignum');
const argv = require(`command-line-args`)(cliOpts(), { partial: true });
const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : '.cjs';
const {
Table,
RecordBatchReader,
RecordBatchStreamWriter,
util: { createElementComparator }
} = require('../targets/apache-arrow/');
} = require(`../index${extension}`);

const exists = async (p) => {
try {
Expand Down
Loading

0 comments on commit abe3d11

Please sign in to comment.