diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 99aeda52488db..f924c8a73cb8a 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1556,18 +1556,18 @@ def generate_nested_large_offsets_case(): def generate_unions_case(): fields = [ - SparseUnionField('sparse', [get_field('f1', 'int32'), - get_field('f2', 'utf8')], + SparseUnionField('sparse_1', [get_field('f1', 'int32'), + get_field('f2', 'utf8')], type_ids=[5, 7]), - DenseUnionField('dense', [get_field('f1', 'int16'), - get_field('f2', 'binary')], + DenseUnionField('dense_1', [get_field('f1', 'int16'), + get_field('f2', 'binary')], type_ids=[10, 20]), - SparseUnionField('sparse', [get_field('f1', 'float32', nullable=False), - get_field('f2', 'bool')], + SparseUnionField('sparse_2', [get_field('f1', 'float32', nullable=False), + get_field('f2', 'bool')], type_ids=[5, 7], nullable=False), - DenseUnionField('dense', [get_field('f1', 'uint8', nullable=False), - get_field('f2', 'uint16'), - NullField('f3')], + DenseUnionField('dense_2', [get_field('f1', 'uint8', nullable=False), + get_field('f2', 'uint16'), + NullField('f3')], type_ids=[42, 43, 44], nullable=False), ] @@ -1669,11 +1669,9 @@ def _temp_path(): .skip_category('C#') .skip_category('JS'), - generate_null_case([10, 0]) - .skip_category('JS'), # TODO(ARROW-7900) + generate_null_case([10, 0]), - generate_null_trivial_case([0, 0]) - .skip_category('JS'), # TODO(ARROW-7900) + generate_null_trivial_case([0, 0]), generate_decimal128_case(), @@ -1699,8 +1697,7 @@ def _temp_path(): generate_non_canonical_map_case() .skip_category('C#') - .skip_category('Java') # TODO(ARROW-8715) - .skip_category('JS'), # TODO(ARROW-8716) + .skip_category('Java'), # TODO(ARROW-8715) generate_nested_case(), @@ -1711,12 +1708,10 @@ def _temp_path(): .skip_category('JS'), generate_unions_case() - .skip_category('C#') - .skip_category('JS'), + .skip_category('C#'), generate_custom_metadata_case() - .skip_category('C#') - .skip_category('JS'), + .skip_category('C#'), generate_duplicate_fieldnames_case() .skip_category('C#') @@ -1731,8 +1726,7 @@ def _temp_path(): generate_nested_dictionary_case() .skip_category('C#') - .skip_category('Java') # TODO(ARROW-7779) - .skip_category('JS'), + .skip_category('Java'), # TODO(ARROW-7779) generate_run_end_encoded_case() .skip_category('C#') @@ -1741,8 +1735,7 @@ def _temp_path(): .skip_category('Rust'), generate_extension_case() - .skip_category('C#') - .skip_category('JS'), + .skip_category('C#'), ] generated_paths = [] diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 966da84e655b2..0ee9ab814e5e6 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -132,6 +132,7 @@ def _gold_tests(self, gold_dir): skip = set() if name == 'union' and prefix == '0.17.1': skip.add("Java") + skip.add("JS") if prefix == '1.0.0-bigendian' or prefix == '1.0.0-littleendian': skip.add("C#") skip.add("Java") @@ -500,33 +501,78 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True, def write_js_test_json(directory): + datagen.generate_primitive_case([], name='primitive_no_batches').write( + os.path.join(directory, 'primitive-no-batches.json') + ) + datagen.generate_primitive_case([17, 20], name='primitive').write( + os.path.join(directory, 'primitive.json') + ) + datagen.generate_primitive_case([0, 0, 0], name='primitive_zerolength').write( + os.path.join(directory, 
'primitive-empty.json') + ) + # datagen.generate_primitive_large_offsets_case([17, 20]).write( + # os.path.join(directory, 'primitive-large-offsets.json') + # ) + datagen.generate_null_case([10, 0]).write( + os.path.join(directory, 'null.json') + ) + datagen.generate_null_trivial_case([0, 0]).write( + os.path.join(directory, 'null-trivial.json') + ) + datagen.generate_decimal128_case().write( + os.path.join(directory, 'decimal128.json') + ) + # datagen.generate_decimal256_case().write( + # os.path.join(directory, 'decimal256.json') + # ) + datagen.generate_datetime_case().write( + os.path.join(directory, 'datetime.json') + ) + # datagen.generate_duration_case().write( + # os.path.join(directory, 'duration.json') + # ) + # datagen.generate_interval_case().write( + # os.path.join(directory, 'interval.json') + # ) + # datagen.generate_month_day_nano_interval_case().write( + # os.path.join(directory, 'month_day_nano_interval.json') + # ) datagen.generate_map_case().write( os.path.join(directory, 'map.json') ) + datagen.generate_non_canonical_map_case().write( + os.path.join(directory, 'non_canonical_map.json') + ) datagen.generate_nested_case().write( os.path.join(directory, 'nested.json') ) - datagen.generate_decimal128_case().write( - os.path.join(directory, 'decimal.json') + datagen.generate_recursive_nested_case().write( + os.path.join(directory, 'recursive-nested.json') ) - datagen.generate_decimal256_case().write( - os.path.join(directory, 'decimal256.json') + # datagen.generate_nested_large_offsets_case().write( + # os.path.join(directory, 'nested-large-offsets.json') + # ) + datagen.generate_unions_case().write( + os.path.join(directory, 'unions.json') ) - datagen.generate_datetime_case().write( - os.path.join(directory, 'datetime.json') + datagen.generate_custom_metadata_case().write( + os.path.join(directory, 'custom-metadata.json') ) + # datagen.generate_duplicate_fieldnames_case().write( + # os.path.join(directory, 'duplicate-fieldnames.json') + # ) datagen.generate_dictionary_case().write( os.path.join(directory, 'dictionary.json') ) datagen.generate_dictionary_unsigned_case().write( - os.path.join(directory, 'dictionary_unsigned.json') + os.path.join(directory, 'dictionary-unsigned.json') ) - datagen.generate_primitive_case([]).write( - os.path.join(directory, 'primitive_no_batches.json') + datagen.generate_nested_dictionary_case().write( + os.path.join(directory, 'dictionary-nested.json') ) - datagen.generate_primitive_case([7, 10]).write( - os.path.join(directory, 'primitive.json') - ) - datagen.generate_primitive_case([0, 0, 0]).write( - os.path.join(directory, 'primitive-empty.json') + # datagen.generate_run_end_encoded_case().write( + # os.path.join(directory, 'run_end_encoded.json') + # ) + datagen.generate_extension_case().write( + os.path.join(directory, 'extension.json') ) diff --git a/docs/source/status.rst b/docs/source/status.rst index e96fd1a6107a5..36c29fcdc4da6 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -44,7 +44,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Float32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Decimal128 | ✓ | ✓ | ✓ | | ✓ | ✓ | ✓ | | +| Decimal128 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Decimal256 | ✓ | ✓ | ✓ | | ✓ | ✓ | ✓ | | 
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -92,7 +92,7 @@ Data Types | Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift | | (special) | | | | | | | | | +===================+=======+=======+=======+============+=======+=======+=======+=======+ -| Dictionary | ✓ | ✓ (2) | ✓ | ✓ (2) | ✓ (2) | ✓ (2) | ✓ | | +| Dictionary | ✓ | ✓ (2) | ✓ | ✓ | ✓ (2) | ✓ (2) | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Extension | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -125,7 +125,7 @@ IPC Format +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Replacement dictionaries | ✓ | ✓ | ✓ | | | | ✓ | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Delta dictionaries | ✓ (1) | | ✓ (1) | | ✓ | | ✓ | | +| Delta dictionaries | ✓ (1) | | ✓ (1) | ✓ | ✓ | | ✓ | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Tensors | ✓ | | | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -135,7 +135,7 @@ IPC Format +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Endianness conversion | ✓ (2) | | ✓ (2) | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Custom schema metadata | ✓ | ✓ | ✓ | | ✓ | ✓ | ✓ | | +| Custom schema metadata | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ Notes: diff --git a/js/.eslintrc.cjs b/js/.eslintrc.cjs index c5cf17c59352f..b629b862190f4 100644 --- a/js/.eslintrc.cjs +++ b/js/.eslintrc.cjs @@ -102,9 +102,9 @@ module.exports = { "unicorn/text-encoding-identifier-case": "off", "unicorn/prefer-top-level-await": "off", - "unicorn/consistent-destructuring": "warn", - "unicorn/no-array-reduce": ["warn", { "allowSimpleOperations": true }], - "unicorn/no-await-expression-member": "warn", + "unicorn/consistent-destructuring": "off", + "unicorn/no-array-reduce": "off", + "unicorn/no-await-expression-member": "off", "unicorn/no-useless-undefined": "warn", "unicorn/consistent-function-scoping": "warn", "unicorn/prefer-math-trunc": "warn", diff --git a/js/.vscode/launch.json b/js/.vscode/launch.json index e09d0a39e1ff1..7d169ccb26274 100644 --- a/js/.vscode/launch.json +++ b/js/.vscode/launch.json @@ -59,6 +59,7 @@ "type": "node", "request": "launch", "name": "Debug Gulp Build", + "cwd": "${workspaceFolder}", "program": "${workspaceFolder}/node_modules/gulp/bin/gulp.js", "args": [ "build", @@ -71,7 +72,7 @@ "type": "node", "request": "launch", "name": "Debug Unit Tests", - "cwd": "${workspaceRoot}", + "cwd": "${workspaceFolder}", "console": "integratedTerminal", "program": "${workspaceFolder}/node_modules/.bin/jest", "skipFiles": [ @@ -96,7 +97,7 @@ "type": "node", "request": "launch", "name": "Debug Integration Tests", - "cwd": "${workspaceRoot}", + "cwd": "${workspaceFolder}", "program": "${workspaceFolder}/bin/integration.js", "skipFiles": [ "/**/*.js", @@ -104,25 +105,36 @@ ], "env": { "NODE_NO_WARNINGS": "1", + "ARROW_JS_DEBUG": "src", + "TS_NODE_CACHE": "false" }, + "runtimeArgs": [ + "-r", + "ts-node/register" + ], "args": [ "--mode", - "VALIDATE" + "VALIDATE", + "-j", 
"test/data/json/unions.json", + "-a", "./test/data/cpp/stream/struct_example.arrow" ] }, { + "type": "node", + "request": "launch", "name": "Debug Bundle", + "cwd": "${workspaceFolder}", "program": "${input:BUNDLE_FILE}", - "request": "launch", "skipFiles": [ "/**" - ], - "type": "node" + ] }, { + "type": "node", + "request": "launch", "name": "Debug Benchmarks", + "cwd": "${workspaceFolder}", "program": "${workspaceFolder}/perf/index.ts", - "request": "launch", "skipFiles": [ "/**", "${workspaceFolder}/node_modules/**/*.js" @@ -130,13 +142,13 @@ "runtimeArgs": [ "--loader", "ts-node/esm/transpile-only" - ], - "type": "node" + ] }, { "type": "node", "request": "launch", "name": "Debug bin/arrow2csv", + "cwd": "${workspaceFolder}", "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" @@ -160,6 +172,7 @@ "type": "node", "request": "launch", "name": "Debug bin/file-to-stream", + "cwd": "${workspaceFolder}", "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" @@ -182,6 +195,7 @@ "type": "node", "request": "launch", "name": "Debug bin/stream-to-file", + "cwd": "${workspaceFolder}", "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" @@ -204,6 +218,7 @@ "type": "node", "request": "launch", "name": "Debug bin/json-to-arrow", + "cwd": "${workspaceFolder}", "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" @@ -230,6 +245,7 @@ "type": "node", "request": "launch", "name": "Debug bin/print-buffer-alignment", + "cwd": "${workspaceFolder}", "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" @@ -251,10 +267,9 @@ "type": "node", "name": "vscode-jest-tests", "request": "launch", + "cwd": "${workspaceFolder}", "console": "integratedTerminal", "internalConsoleOptions": "neverOpen", - "disableOptimisticBPs": true, - "cwd": "${workspaceFolder}", "program": "${workspaceFolder}/node_modules/.bin/jest", "runtimeArgs": [ "--experimental-vm-modules" diff --git a/js/bin/arrow2csv.js b/js/bin/arrow2csv.js index 51984a7971df1..f7cb483ff4ce3 100755 --- a/js/bin/arrow2csv.js +++ b/js/bin/arrow2csv.js @@ -23,5 +23,9 @@ const arrow2csv = Path.join(here, `src/bin/arrow2csv.ts`); const env = { ...process.env, TS_NODE_TRANSPILE_ONLY: `true` }; require('child_process').spawn(`node`, [ - `--loader`, 'ts-node/esm/transpile-only', arrow2csv, ...process.argv.slice(2) + `-r`, + `ts-node/register`, + `--loader`, + `ts-node/esm/transpile-only`, + arrow2csv, ...process.argv.slice(2) ], { cwd: here, env, stdio: `inherit` }); diff --git a/js/bin/integration.js b/js/bin/integration.js index 1b7d6ce2476be..583b7a2161d7d 100755 --- a/js/bin/integration.js +++ b/js/bin/integration.js @@ -21,17 +21,17 @@ const fs = require('fs'); const Path = require('path'); -const { promisify } = require('util'); -const glob = promisify(require('glob').glob); +const { glob } = require('glob'); const { zip } = require('ix/iterable/zip'); const { parse: bignumJSONParse } = require('json-bignum'); const argv = require(`command-line-args`)(cliOpts(), { partial: true }); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? 
'.ts' : '.cjs'; const { Table, RecordBatchReader, RecordBatchStreamWriter, util: { createElementComparator } -} = require('../targets/apache-arrow/'); +} = require(`../index${extension}`); const exists = async (p) => { try { diff --git a/js/bin/json-to-arrow.js b/js/bin/json-to-arrow.js index ee7a952befddc..20442ac062358 100755 --- a/js/bin/json-to-arrow.js +++ b/js/bin/json-to-arrow.js @@ -53,10 +53,10 @@ const arrowPaths = [...(argv.arrow || [])]; })); })() -.then((x) => +x || 0, (e) => { - e && process.stderr.write(`${e}`); - return process.exitCode || 1; -}).then((code = 0) => process.exit(code)); + .then((x) => x ?? 0, (e) => { + e && process.stderr.write(`${e}`); + return process.exitCode || 1; + }).then((code = 0) => process.exit(code)); function cliOpts() { return [ diff --git a/js/bin/print-buffer-alignment.js b/js/bin/print-buffer-alignment.js index 5df83eb83f0fa..0dd46c2da1c5d 100755 --- a/js/bin/print-buffer-alignment.js +++ b/js/bin/print-buffer-alignment.js @@ -22,7 +22,9 @@ const fs = require('fs'); const path = require('path'); const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : '.cjs'; -const { VectorLoader } = require(`../targets/apache-arrow/visitor/vectorloader`); +const { VectorLoader } = process.env.ARROW_JS_DEBUG + ? require(`../src/visitor/vectorloader${extension}`) + : require(`../targets/apache-arrow/visitor/vectorloader`); const { RecordBatch, AsyncMessageReader, makeData, Struct, Schema, Field } = require(`../index${extension}`); (async () => { diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js index a35cb9dd1ed1b..5d190be22d5a0 100644 --- a/js/gulp/test-task.js +++ b/js/gulp/test-task.js @@ -20,8 +20,7 @@ import path from 'path'; import { mkdirp } from 'mkdirp'; import { argv } from './argv.js'; import { promisify } from 'util'; -import { globSync } from 'glob'; -const glob = promisify(globSync); +import { glob } from 'glob'; import child_process from 'child_process'; import { memoizeTask } from './memoize-task.js'; import fs from 'fs'; @@ -36,7 +35,7 @@ import { createRequire } from 'module'; const require = createRequire(import.meta.url); -const jestArgv = [`--reporters=jest-silent-reporter`]; +const jestArgv = []; const testFiles = [ `test/unit/`, // `test/unit/bit-tests.ts`, @@ -53,6 +52,8 @@ const testFiles = [ if (argv.verbose) { jestArgv.push(`--verbose`); +} else { + jestArgv.push(`--reporters=jest-silent-reporter`); } if (targetAndModuleCombinations.length > 1) { @@ -120,9 +121,8 @@ async function createTestJSON() { await exec(`python3 -B -c '\ import sys\n\ sys.path.append("${ARROW_ARCHERY_DIR}")\n\ -sys.argv.append("--write_generated_json=${jsonFilesDir}")\n\ -from archery.cli import integration\n\ -integration()'`); +from archery.integration.runner import write_js_test_json\n\ +write_js_test_json("${jsonFilesDir}")'`); } export async function createTestData() { diff --git a/js/src/builder/union.ts b/js/src/builder/union.ts index 1e668f9423e7f..ac8a13191a549 100644 --- a/js/src/builder/union.ts +++ b/js/src/builder/union.ts @@ -47,9 +47,7 @@ export abstract class UnionBuilder extends Builder if (childTypeId === undefined) { childTypeId = this._valueToChildTypeId(this, value, index); } - if (this.setValid(index, this.isValid(value))) { - this.setValue(index, value, childTypeId); - } + this.setValue(index, value, childTypeId); return this; } diff --git a/js/src/data.ts b/js/src/data.ts index b6f53b6d0e131..dc423cdb01e1c 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -16,7 +16,7 @@ // under the License. 
import { Vector } from './vector.js'; -import { BufferType, Type } from './enum.js'; +import { BufferType, Type, UnionMode } from './enum.js'; import { DataType, strideForType } from './type.js'; import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; @@ -56,7 +56,6 @@ export class Data { declare public readonly length: number; declare public readonly offset: number; declare public readonly stride: number; - declare public readonly nullable: boolean; declare public readonly children: Data[]; /** @@ -70,10 +69,26 @@ export class Data { declare public readonly valueOffsets: Buffers[BufferType.OFFSET]; public get typeId(): T['TType'] { return this.type.typeId; } + public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; } + public get buffers() { return [this.valueOffsets, this.values, this.nullBitmap, this.typeIds] as Buffers; } + + public get nullable(): boolean { + if (this._nullCount !== 0) { + const { type } = this; + if (DataType.isSparseUnion(type)) { + return this.children.some((child) => child.nullable); + } else if (DataType.isDenseUnion(type)) { + return this.children.some((child) => child.nullable); + } + return this.nullBitmap && this.nullBitmap.byteLength > 0; + } + return true; + } + public get byteLength(): number { let byteLength = 0; const { valueOffsets, values, nullBitmap, typeIds } = this; @@ -86,7 +101,10 @@ export class Data { protected _nullCount: number | kUnknownNullCount; - public get nullCount() { + public get nullCount(): number { + if (DataType.isUnion(this.type)) { + return this.children.reduce((nullCount, child) => nullCount + child.nullCount, 0); + } let nullCount = this._nullCount; let nullBitmap: Uint8Array | undefined; if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) { @@ -118,10 +136,16 @@ export class Data { (buffer = (buffers as Buffers)[3]) && (this.typeIds = buffer); } } - this.nullable = this._nullCount !== 0 && this.nullBitmap && this.nullBitmap.byteLength > 0; } - public getValid(index: number) { + public getValid(index: number): boolean { + const { type } = this; + if (DataType.isUnion(type)) { + const union = (type as Union); + const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; + const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + return child.getValid(indexInChild); + } if (this.nullable && this.nullCount > 0) { const pos = this.offset + index; const val = this.nullBitmap[pos >> 3]; @@ -130,22 +154,45 @@ export class Data { return true; } - public setValid(index: number, value: boolean) { - // Don't interact w/ nullBitmap if not nullable - if (!this.nullable) { return value; } - // If no null bitmap, initialize one on the fly - if (!this.nullBitmap || this.nullBitmap.byteLength <= (index >> 3)) { - const { nullBitmap } = this._changeLengthAndBackfillNullBitmap(this.length); - Object.assign(this, { nullBitmap, _nullCount: 0 }); + public setValid(index: number, value: boolean): boolean { + let prev: boolean; + const { type } = this; + if (DataType.isUnion(type)) { + const union = (type as Union); + const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; + const indexInChild = union.mode === UnionMode.Dense ? 
this.valueOffsets[index] : index; + prev = child.getValid(indexInChild); + child.setValid(indexInChild, value); + } else { + let { nullBitmap } = this; + const { offset, length } = this; + const idx = offset + index; + const mask = 1 << (idx % 8); + const byteOffset = idx >> 3; + + // If no null bitmap, initialize one on the fly + if (!nullBitmap || nullBitmap.byteLength <= byteOffset) { + nullBitmap = new Uint8Array((((offset + length) + 63) & ~63) >> 3).fill(255); + // if we have a nullBitmap, truncate + slice and set it over the pre-filled 1s + if (this.nullCount > 0) { + nullBitmap.set(truncateBitmap(offset, length, this.nullBitmap), 0); + } + Object.assign(this, { nullBitmap, _nullCount: -1 }); + } + + const byte = nullBitmap[byteOffset]; + + prev = (byte & mask) !== 0; + value ? + (nullBitmap[byteOffset] = byte | mask) : + (nullBitmap[byteOffset] = byte & ~mask); } - const { nullBitmap, offset } = this; - const pos = (offset + index) >> 3; - const bit = (offset + index) % 8; - const val = (nullBitmap[pos] >> bit) & 1; - // If `val` is truthy and the current bit is 0, flip it to 1 and increment `_nullCount`. - // If `val` is falsey and the current bit is 1, flip it to 0 and decrement `_nullCount`. - value ? val === 0 && ((nullBitmap[pos] |= (1 << bit)), (this._nullCount = this.nullCount + 1)) - : val === 1 && ((nullBitmap[pos] &= ~(1 << bit)), (this._nullCount = this.nullCount - 1)); + + if (prev !== !!value) { + // Update `_nullCount` if the new value is different from the old value. + this._nullCount = this.nullCount + (value ? -1 : 1); + } + return value; } @@ -228,7 +275,7 @@ class MakeDataVisitor extends Visitor { ['offset']: offset = 0, ['length']: length = 0, } = props; - return new Data(type, offset, length, 0); + return new Data(type, offset, length, length); } public visitBool(props: BoolDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; @@ -320,14 +367,13 @@ class MakeDataVisitor extends Visitor { } public visitUnion(props: UnionDataProps) { const { ['type']: type, ['offset']: offset = 0, ['children']: children = [] } = props; - const nullBitmap = toUint8Array(props['nullBitmap']); const typeIds = toArrayBufferView(type.ArrayType, props['typeIds']); - const { ['length']: length = typeIds.length, ['nullCount']: nullCount = props['nullBitmap'] ? 
-1 : 0, } = props; + const { ['length']: length = typeIds.length, ['nullCount']: nullCount = -1, } = props; if (DataType.isSparseUnion(type)) { - return new Data(type, offset, length, nullCount, [undefined, undefined, nullBitmap, typeIds], children); + return new Data(type, offset, length, nullCount, [undefined, undefined, undefined, typeIds], children); } const valueOffsets = toInt32Array(props['valueOffsets']); - return new Data(type, offset, length, nullCount, [valueOffsets, undefined, nullBitmap, typeIds], children); + return new Data(type, offset, length, nullCount, [valueOffsets, undefined, undefined, typeIds], children); } public visitDictionary(props: DictionaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; @@ -385,8 +431,8 @@ interface ListDataProps extends DataProps_ { valueOffsets: Va interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } interface Map_DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } -interface SparseUnionDataProps extends DataProps_ { typeIds: TypeIdsBuffer; children: Data[] } -interface DenseUnionDataProps extends DataProps_ { typeIds: TypeIdsBuffer; children: Data[]; valueOffsets: ValueOffsetsBuffer } +interface SparseUnionDataProps extends DataProps_ { nullBitmap: never; typeIds: TypeIdsBuffer; children: Data[] } +interface DenseUnionDataProps extends DataProps_ { nullBitmap: never; typeIds: TypeIdsBuffer; children: Data[]; valueOffsets: ValueOffsetsBuffer } interface UnionDataProps extends DataProps_ { typeIds: TypeIdsBuffer; children: Data[]; valueOffsets?: ValueOffsetsBuffer } export type DataProps = ( @@ -413,6 +459,8 @@ export type DataProps = ( /* */ DataProps_ ); +const makeDataVisitor = new MakeDataVisitor(); + export function makeData(props: NullDataProps): Data; export function makeData(props: IntDataProps): Data; export function makeData(props: DictionaryDataProps): Data; @@ -435,5 +483,5 @@ export function makeData(props: DenseUnionDataProps): D export function makeData(props: UnionDataProps): Data; export function makeData(props: DataProps_): Data; export function makeData(props: any) { - return new MakeDataVisitor().visit(props); + return makeDataVisitor.visit(props); } diff --git a/js/src/ipc/message.ts b/js/src/ipc/message.ts index 678e6e5f26e12..3dc8625233f95 100644 --- a/js/src/ipc/message.ts +++ b/js/src/ipc/message.ts @@ -201,7 +201,7 @@ export class JSONMessageReader extends MessageReader { return (xs || []).reduce((buffers, column: any) => [ ...buffers, ...(column['VALIDITY'] && [column['VALIDITY']] || []), - ...(column['TYPE'] && [column['TYPE']] || []), + ...(column['TYPE_ID'] && [column['TYPE_ID']] || []), ...(column['OFFSET'] && [column['OFFSET']] || []), ...(column['DATA'] && [column['DATA']] || []), ...flattenDataSources(column['children']) diff --git a/js/src/ipc/metadata/file.ts b/js/src/ipc/metadata/file.ts index 036cee18e4cea..77c76286b7491 100644 --- a/js/src/ipc/metadata/file.ts +++ b/js/src/ipc/metadata/file.ts @@ -38,7 +38,7 @@ class Footer_ { public static decode(buf: ArrayBufferViewInput) { buf = new ByteBuffer(toUint8Array(buf)); const footer = _Footer.getRootAsFooter(buf); - const schema = Schema.decode(footer.schema()!); + const schema = Schema.decode(footer.schema()!, new Map(), footer.version()); return new OffHeapFooter(schema, footer) as Footer_; } @@ -63,7 +63,7 @@ class Footer_ { _Footer.startFooter(b); _Footer.addSchema(b, schemaOffset); - _Footer.addVersion(b, 
MetadataVersion.V4); + _Footer.addVersion(b, MetadataVersion.V5); _Footer.addRecordBatches(b, recordBatchesOffset); _Footer.addDictionaries(b, dictionaryBatchesOffset); _Footer.finishFooterBuffer(b, _Footer.endFooter(b)); @@ -77,7 +77,7 @@ class Footer_ { public get numDictionaries() { return this._dictionaryBatches.length; } constructor(public schema: Schema, - public version: MetadataVersion = MetadataVersion.V4, + public version: MetadataVersion = MetadataVersion.V5, recordBatches?: FileBlock[], dictionaryBatches?: FileBlock[]) { recordBatches && (this._recordBatches = recordBatches); dictionaryBatches && (this._dictionaryBatches = dictionaryBatches); diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts index 0fc1ca0801926..e5995110f084b 100644 --- a/js/src/ipc/metadata/json.ts +++ b/js/src/ipc/metadata/json.ts @@ -32,7 +32,7 @@ import { TimeUnit, Precision, IntervalUnit, UnionMode, DateUnit } from '../../en export function schemaFromJSON(_schema: any, dictionaries: Map = new Map()) { return new Schema( schemaFieldsFromJSON(_schema, dictionaries), - customMetadataFromJSON(_schema['customMetadata']), + customMetadataFromJSON(_schema['metadata']), dictionaries ); } @@ -81,7 +81,7 @@ function buffersFromJSON(xs: any[], buffers: BufferRegion[] = []): BufferRegion[ for (let i = -1, n = (xs || []).length; ++i < n;) { const column = xs[i]; column['VALIDITY'] && buffers.push(new BufferRegion(buffers.length, column['VALIDITY'].length)); - column['TYPE'] && buffers.push(new BufferRegion(buffers.length, column['TYPE'].length)); + column['TYPE_ID'] && buffers.push(new BufferRegion(buffers.length, column['TYPE_ID'].length)); column['OFFSET'] && buffers.push(new BufferRegion(buffers.length, column['OFFSET'].length)); column['DATA'] && buffers.push(new BufferRegion(buffers.length, column['DATA'].length)); buffers = buffersFromJSON(column['children'], buffers); @@ -107,7 +107,7 @@ export function fieldFromJSON(_field: any, dictionaries?: Map) // If no dictionary encoding if (!dictionaries || !(dictMeta = _field['dictionary'])) { type = typeFromJSON(_field, fieldChildrenFromJSON(_field, dictionaries)); - field = new Field(_field['name'], type, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + field = new Field(_field['name'], type, _field['nullable'], customMetadataFromJSON(_field['metadata'])); } // If dictionary encoded and the first time we've seen this dictionary id, decode // the data type and child fields, then wrap in a Dictionary type and insert the @@ -117,7 +117,7 @@ export function fieldFromJSON(_field: any, dictionaries?: Map) keys = (keys = dictMeta['indexType']) ? indexTypeFromJSON(keys) as TKeys : new Int32(); dictionaries.set(id, type = typeFromJSON(_field, fieldChildrenFromJSON(_field, dictionaries))); dictType = new Dictionary(type, keys, id, dictMeta['isOrdered']); - field = new Field(_field['name'], dictType, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + field = new Field(_field['name'], dictType, _field['nullable'], customMetadataFromJSON(_field['metadata'])); } // If dictionary encoded, and have already seen this dictionary Id in the schema, then reuse the // data type and wrap in a new Dictionary type and field. @@ -125,14 +125,14 @@ export function fieldFromJSON(_field: any, dictionaries?: Map) // a dictionary index defaults to signed 32 bit int if unspecified keys = (keys = dictMeta['indexType']) ? 
indexTypeFromJSON(keys) as TKeys : new Int32(); dictType = new Dictionary(dictionaries.get(id)!, keys, id, dictMeta['isOrdered']); - field = new Field(_field['name'], dictType, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + field = new Field(_field['name'], dictType, _field['nullable'], customMetadataFromJSON(_field['metadata'])); } return field || null; } /** @ignore */ -function customMetadataFromJSON(_metadata?: Record) { - return new Map(Object.entries(_metadata || {})); +function customMetadataFromJSON(metadata: { key: string; value: string }[] = []) { + return new Map(metadata.map(({ key, value }) => [key, value])); } /** @ignore */ @@ -187,7 +187,9 @@ function typeFromJSON(f: any, children?: Field[]): DataType { } case 'union': { const t = f['type']; - return new Union(UnionMode[t['mode']] as any, (t['typeIds'] || []), children || []); + const [m, ...ms] = (t['mode'] + '').toLowerCase(); + const mode = (m.toUpperCase() + ms.join('')) as keyof typeof UnionMode; + return new Union(UnionMode[mode] as any, (t['typeIds'] || []), children || []); } case 'fixedsizebinary': { const t = f['type']; diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts index c5a1c9c147e8a..6465d3d064720 100644 --- a/js/src/ipc/metadata/message.ts +++ b/js/src/ipc/metadata/message.ts @@ -68,7 +68,7 @@ export class Message { /** @nocollapse */ public static fromJSON(msg: any, headerType: T): Message { - const message = new Message(0, MetadataVersion.V4, headerType); + const message = new Message(0, MetadataVersion.V5, headerType); message._createHeader = messageHeaderFromJSON(msg, headerType); return message; } @@ -97,7 +97,7 @@ export class Message { headerOffset = DictionaryBatch.encode(b, message.header() as DictionaryBatch); } _Message.startMessage(b); - _Message.addVersion(b, MetadataVersion.V4); + _Message.addVersion(b, MetadataVersion.V5); _Message.addHeader(b, headerOffset); _Message.addHeaderType(b, message.headerType); _Message.addBodyLength(b, BigInt(message.bodyLength)); @@ -108,13 +108,13 @@ export class Message { /** @nocollapse */ public static from(header: Schema | RecordBatch | DictionaryBatch, bodyLength = 0) { if (header instanceof Schema) { - return new Message(0, MetadataVersion.V4, MessageHeader.Schema, header); + return new Message(0, MetadataVersion.V5, MessageHeader.Schema, header); } if (header instanceof RecordBatch) { - return new Message(bodyLength, MetadataVersion.V4, MessageHeader.RecordBatch, header); + return new Message(bodyLength, MetadataVersion.V5, MessageHeader.RecordBatch, header); } if (header instanceof DictionaryBatch) { - return new Message(bodyLength, MetadataVersion.V4, MessageHeader.DictionaryBatch, header); + return new Message(bodyLength, MetadataVersion.V5, MessageHeader.DictionaryBatch, header); } throw new Error(`Unrecognized Message header: ${header}`); } @@ -225,7 +225,7 @@ function messageHeaderFromJSON(message: any, type: MessageHeader) { function decodeMessageHeader(message: _Message, type: MessageHeader) { return (() => { switch (type) { - case MessageHeader.Schema: return Schema.decode(message.header(new _Schema())!); + case MessageHeader.Schema: return Schema.decode(message.header(new _Schema())!, new Map(), message.version()); case MessageHeader.RecordBatch: return RecordBatch.decode(message.header(new _RecordBatch())!, message.version()); case MessageHeader.DictionaryBatch: return DictionaryBatch.decode(message.header(new _DictionaryBatch())!, message.version()); } @@ -290,13 +290,13 @@ declare module 
'./message' { } /** @ignore */ -function decodeSchema(_schema: _Schema, dictionaries: Map = new Map()) { +function decodeSchema(_schema: _Schema, dictionaries: Map = new Map(), version = MetadataVersion.V5) { const fields = decodeSchemaFields(_schema, dictionaries); - return new Schema(fields, decodeCustomMetadata(_schema), dictionaries); + return new Schema(fields, decodeCustomMetadata(_schema), dictionaries, version); } /** @ignore */ -function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V4) { +function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V5) { if (batch.compression() !== null) { throw new Error('Record batch compression not implemented'); } @@ -304,7 +304,7 @@ function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V4) { } /** @ignore */ -function decodeDictionaryBatch(batch: _DictionaryBatch, version = MetadataVersion.V4) { +function decodeDictionaryBatch(batch: _DictionaryBatch, version = MetadataVersion.V5) { return new DictionaryBatch(RecordBatch.decode(batch.data()!, version), batch.id(), batch.isDelta()); } diff --git a/js/src/ipc/reader.ts b/js/src/ipc/reader.ts index 77496e799b871..b1ad5248d6158 100644 --- a/js/src/ipc/reader.ts +++ b/js/src/ipc/reader.ts @@ -371,7 +371,7 @@ abstract class RecordBatchReaderImpl implements RecordB return dictionary.memoize(); } protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { - return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries).visitMany(types); + return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); } } @@ -678,7 +678,7 @@ class RecordBatchJSONReaderImpl extends RecordBatchStre super(source, dictionaries); } protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { - return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries).visitMany(types); + return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); } } diff --git a/js/src/ipc/writer.ts b/js/src/ipc/writer.ts index 5f5cc37562bb0..54b4b0249e420 100644 --- a/js/src/ipc/writer.ts +++ b/js/src/ipc/writer.ts @@ -342,7 +342,7 @@ export class RecordBatchFileWriter extends RecordBatchW protected _writeFooter(schema: Schema) { const buffer = Footer.encode(new Footer( - schema, MetadataVersion.V4, + schema, MetadataVersion.V5, this._recordBatchBlocks, this._dictionaryBlocks )); return super diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts index eed9de66e8ec2..59505a70a3c81 100644 --- a/js/src/recordbatch.ts +++ b/js/src/recordbatch.ts @@ -315,23 +315,25 @@ function ensureSameLengthData( } /** @ignore */ -function collectDictionaries(fields: Field[], children: Data[], dictionaries = new Map()): Map { - for (let i = -1, n = fields.length; ++i < n;) { - const field = fields[i]; - const type = field.type; - const data = children[i]; - if (DataType.isDictionary(type)) { - if (!dictionaries.has(type.id)) { - if (data.dictionary) { - dictionaries.set(type.id, data.dictionary); +function collectDictionaries(fields: Field[], children: readonly Data[], dictionaries = new Map()): Map { + if ((fields?.length ?? 
0) > 0 && (fields?.length === children?.length)) { + for (let i = -1, n = fields.length; ++i < n;) { + const { type } = fields[i]; + const data = children[i]; + for (const next of [data, ...(data?.dictionary?.data || [])]) { + collectDictionaries(type.children, next?.children, dictionaries); + } + if (DataType.isDictionary(type)) { + const { id } = type; + if (!dictionaries.has(id)) { + if (data?.dictionary) { + dictionaries.set(id, data.dictionary); + } + } else if (dictionaries.get(id) !== data.dictionary) { + throw new Error(`Cannot create Schema containing two different dictionaries with the same Id`); } - } else if (dictionaries.get(type.id) !== data.dictionary) { - throw new Error(`Cannot create Schema containing two different dictionaries with the same Id`); } } - if (type.children && type.children.length > 0) { - collectDictionaries(type.children, data.children, dictionaries); - } } return dictionaries; } diff --git a/js/src/schema.ts b/js/src/schema.ts index 858bab915b010..a5b1dcb56665c 100644 --- a/js/src/schema.ts +++ b/js/src/schema.ts @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +import { MetadataVersion } from './enum.js'; import { DataType, TypeMap } from './type.js'; export class Schema { @@ -22,17 +23,20 @@ export class Schema { public readonly fields: Field[]; public readonly metadata: Map; public readonly dictionaries: Map; + public readonly metadataVersion: MetadataVersion; constructor( fields: Field[] = [], metadata?: Map | null, - dictionaries?: Map | null) { + dictionaries?: Map | null, + metadataVersion = MetadataVersion.V5) { this.fields = (fields || []) as Field[]; this.metadata = metadata || new Map(); if (!dictionaries) { dictionaries = generateDictionaryMap(fields); } this.dictionaries = dictionaries; + this.metadataVersion = metadataVersion; } public get [Symbol.toStringTag]() { return 'Schema'; } diff --git a/js/src/type.ts b/js/src/type.ts index e3b9c02b5e4f2..1dc90c47cbd10 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -585,10 +585,25 @@ export interface Map_ extends DataType }> { - constructor(child: Field>, keysSorted = false) { + constructor(entries: Field>, keysSorted = false) { super(); - this.children = [child]; + this.children = [entries]; this.keysSorted = keysSorted; + // ARROW-8716 + // https://github.com/apache/arrow/issues/17168 + if (entries) { + (entries as any)['name'] = 'entries'; + if ((entries as any)?.type?.children) { + const key = (entries as any)?.type?.children[0]; + if (key) { + key['name'] = 'key'; + } + const val = (entries as any)?.type?.children[1]; + if (val) { + val['name'] = 'value'; + } + } + } } public declare readonly keysSorted: boolean; public declare readonly children: Field>[]; diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts index c3d90ecadab11..dd8edf11f9258 100644 --- a/js/src/util/buffer.ts +++ b/js/src/util/buffer.ts @@ -212,12 +212,12 @@ export function rebaseValueOffsets(offset: number, length: number, valueOffsets: // If we have a non-zero offset, create a new offsets array with the values // shifted by the start offset, such that the new start offset is 0 if (offset !== 0) { - valueOffsets = valueOffsets.slice(0, length + 1); - for (let i = -1; ++i <= length;) { + valueOffsets = valueOffsets.slice(0, length); + for (let i = -1, n = valueOffsets.length; ++i < n;) { valueOffsets[i] += offset; } } - return valueOffsets; + return valueOffsets.subarray(0, length); } /** @ignore */ diff --git a/js/src/vector.ts b/js/src/vector.ts index 
48d0a1244e815..318ce06e5c3c0 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -99,8 +99,6 @@ export class Vector { } declare protected _offsets: number[] | Uint32Array; - declare protected _nullCount: number; - declare protected _byteLength: number; /** * The {@link DataType `DataType`} of this Vector. @@ -131,20 +129,14 @@ export class Vector { * The aggregate size (in bytes) of this Vector's buffers and/or child Vectors. */ public get byteLength() { - if (this._byteLength === -1) { - this._byteLength = this.data.reduce((byteLength, data) => byteLength + data.byteLength, 0); - } - return this._byteLength; + return this.data.reduce((byteLength, data) => byteLength + data.byteLength, 0); } /** * The number of null elements in this Vector. */ public get nullCount() { - if (this._nullCount === -1) { - this._nullCount = computeChunkNullCounts(this.data); - } - return this._nullCount; + return computeChunkNullCounts(this.data); } /** @@ -195,7 +187,10 @@ export class Vector { // @ts-ignore public indexOf(element: T['TValue'], offset?: number): number { return -1; } - public includes(element: T['TValue'], offset?: number): boolean { return this.indexOf(element, offset) > 0; } + public includes(element: T['TValue'], offset?: number): boolean { + // eslint-disable-next-line unicorn/prefer-includes + return this.indexOf(element, offset) > -1; + } /** * Get the size in bytes of an element by index. @@ -352,8 +347,6 @@ export class Vector { (proto as any).length = 0; (proto as any).stride = 1; (proto as any).numChildren = 0; - (proto as any)._nullCount = -1; - (proto as any)._byteLength = -1; (proto as any)._offsets = new Uint32Array([0]); (proto as any)[Symbol.isConcatSpreadable] = true; diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 93b4c0c495ca9..654134c6dff04 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -111,7 +111,19 @@ function indexOfNull(data: Data, fromIndex?: number): num /** @ignore */ function indexOfValue(data: Data, searchElement?: T['TValue'] | null, fromIndex?: number): number { if (searchElement === undefined) { return -1; } - if (searchElement === null) { return indexOfNull(data, fromIndex); } + if (searchElement === null) { + switch (data.typeId) { + // Unions don't have a nullBitmap of its own, so compare the `searchElement` to `get()`. + case Type.Union: + break; + // Dictionaries do have a nullBitmap, but their dictionary could also have null elements. 
+ case Type.Dictionary: + break; + // All other types can iterate the null bitmap + default: + return indexOfNull(data, fromIndex); + } + } const get = getVisitor.getVisitFn(data); const compare = createElementComparator(searchElement); for (let i = (fromIndex || 0) - 1, n = data.length; ++i < n;) { diff --git a/js/src/visitor/jsontypeassembler.ts b/js/src/visitor/jsontypeassembler.ts index 9aae1c296893d..d83edfc24fbd8 100644 --- a/js/src/visitor/jsontypeassembler.ts +++ b/js/src/visitor/jsontypeassembler.ts @@ -72,7 +72,7 @@ export class JSONTypeAssembler extends Visitor { public visitUnion({ typeId, mode, typeIds }: T) { return { 'name': ArrowType[typeId].toLowerCase(), - 'mode': UnionMode[mode], + 'mode': UnionMode[mode].toUpperCase(), 'typeIds': [...typeIds] }; } diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 270e31a2c6ab7..7a617f4afe2c4 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -35,7 +35,7 @@ export interface JSONVectorAssembler extends Visitor { visit(field: Field, node: Data): Record; visitMany(fields: Field[], nodes: readonly Data[]): Record[]; - getVisitFn(node: Vector | Data): (data: Data) => { name: string; count: number; VALIDITY: (0 | 1)[]; DATA?: any[]; OFFSET?: number[]; TYPE?: number[]; children?: any[] }; + getVisitFn(node: Vector | Data): (data: Data) => { name: string; count: number; VALIDITY: (0 | 1)[]; DATA?: any[]; OFFSET?: number[]; TYPE_ID?: number[]; children?: any[] }; visitNull(data: Data): Record; visitBool(data: Data): { DATA: boolean[] }; @@ -50,7 +50,7 @@ export interface JSONVectorAssembler extends Visitor { visitDecimal(data: Data): { DATA: string[] }; visitList(data: Data): { children: any[]; OFFSET: number[] }; visitStruct(data: Data): { children: any[] }; - visitUnion(data: Data): { children: any[]; TYPE: number[] }; + visitUnion(data: Data): { children: any[]; TYPE_ID: number[] }; visitInterval(data: Data): { DATA: number[] }; visitFixedSizeList(data: Data): { children: any[] }; visitMap(data: Data): { children: any[] }; @@ -75,7 +75,8 @@ export class JSONVectorAssembler extends Visitor { return { 'name': name, 'count': length, - 'VALIDITY': DataType.isNull(type) ? undefined + 'VALIDITY': (DataType.isNull(type) || DataType.isUnion(type)) + ? undefined : nullCount <= 0 ? Array.from({ length }, () => 1) : [...new BitIterator(nullBitmap, offset, length, null, getBit)], ...super.visit(data.clone(type, offset, length, 0, buffers)) @@ -137,7 +138,7 @@ export class JSONVectorAssembler extends Visitor { } public visitUnion(data: Data) { return { - 'TYPE': [...data.typeIds], + 'TYPE_ID': [...data.typeIds], 'OFFSET': data.type.mode === UnionMode.Dense ? [...data.valueOffsets] : undefined, 'children': this.visitMany(data.type.children, data.children) }; diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index a9415b0e0c1dd..dbf778c4c3631 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -77,18 +77,23 @@ export class VectorAssembler extends Visitor { } const { type } = data; if (!DataType.isDictionary(type)) { - const { length, nullCount } = data; + const { length } = data; if (length > 2147483647) { /* istanbul ignore next */ throw new RangeError('Cannot write arrays larger than 2^31 - 1 in length'); } - if (!DataType.isNull(type)) { - addBuffer.call(this, nullCount <= 0 - ? 
new Uint8Array(0) // placeholder validity buffer - : truncateBitmap(data.offset, length, data.nullBitmap) - ); + if (DataType.isUnion(type)) { + this.nodes.push(new FieldNode(length, 0)); + } else { + const { nullCount } = data; + if (!DataType.isNull(type)) { + addBuffer.call(this, nullCount <= 0 + ? new Uint8Array(0) // placeholder validity buffer + : truncateBitmap(data.offset, length, data.nullBitmap) + ); + } + this.nodes.push(new FieldNode(length, nullCount)); } - this.nodes.push(new FieldNode(length, nullCount)); } return super.visit(data); } @@ -141,31 +146,30 @@ function assembleUnion(this: VectorAssembler, data: Data) { // A sliced Dense Union is an unpleasant case. Because the offsets are different for // each child vector, we need to "rebase" the valueOffsets for each child // Union typeIds are not necessary 0-indexed - const maxChildTypeId = typeIds.reduce((x, y) => Math.max(x, y), typeIds[0]); - const childLengths = new Int32Array(maxChildTypeId + 1); - // Set all to -1 to indicate that we haven't observed a first occurrence of a particular child yet - const childOffsets = new Int32Array(maxChildTypeId + 1).fill(-1); const shiftedOffsets = new Int32Array(length); + const childOffsets = Object.create(null) as Record; + const childLengths = Object.create(null) as Record; // If we have a non-zero offset, then the value offsets do not start at // zero. We must a) create a new offsets array with shifted offsets and // b) slice the values array accordingly - const unshiftedOffsets = rebaseValueOffsets(-valueOffsets[0], length, valueOffsets); for (let typeId, shift, index = -1; ++index < length;) { - if ((shift = childOffsets[typeId = typeIds[index]]) === -1) { - shift = childOffsets[typeId] = unshiftedOffsets[typeId]; + if ((typeId = typeIds[index]) === undefined) { + continue; + } + if ((shift = childOffsets[typeId]) === undefined) { + shift = childOffsets[typeId] = valueOffsets[index]; } - shiftedOffsets[index] = unshiftedOffsets[index] - shift; - ++childLengths[typeId]; + shiftedOffsets[index] = valueOffsets[index] - shift; + childLengths[typeId] = (childLengths[typeId] ?? 
0) + 1; } addBuffer.call(this, shiftedOffsets); // Slice and visit children accordingly - for (let child: Data | null, childIndex = -1, numChildren = type.children.length; ++childIndex < numChildren;) { - if (child = data.children[childIndex]) { - const typeId = type.typeIds[childIndex]; - const childLength = Math.min(length, childLengths[typeId]); - this.visit(child.slice(childOffsets[typeId], childLength)); - } - } + this.visitMany(data.children.map((child, childIndex) => { + const typeId = type.typeIds[childIndex]; + const childOffset = childOffsets[typeId]; + const childLength = childLengths[typeId]; + return child.slice(childOffset, Math.min(length, childLength)); + })); } } return this; @@ -198,12 +202,11 @@ function assembleFlatVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; - const firstOffset = valueOffsets[0]; - const lastOffset = valueOffsets[length]; - const byteLength = Math.min(lastOffset - firstOffset, values.byteLength - firstOffset); + const { [0]: begin, [length]: end } = valueOffsets; + const byteLength = Math.min(end - begin, values.byteLength - begin); // Push in the order FlatList types read their buffers - addBuffer.call(this, rebaseValueOffsets(-valueOffsets[0], length, valueOffsets)); // valueOffsets buffer first - addBuffer.call(this, values.subarray(firstOffset, firstOffset + byteLength)); // sliced values buffer second + addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // valueOffsets buffer first + addBuffer.call(this, values.subarray(begin, begin + byteLength)); // sliced values buffer second return this; } @@ -212,7 +215,10 @@ function assembleListVector(this: VectorA const { length, valueOffsets } = data; // If we have valueOffsets (MapVector, ListVector), push that buffer first if (valueOffsets) { - addBuffer.call(this, rebaseValueOffsets(valueOffsets[0], length, valueOffsets)); + const { [0]: begin, [length]: end } = valueOffsets; + addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); + // Then insert the List's values child + return this.visit(data.children[0].slice(begin, end - begin)); } // Then insert the List's values child return this.visit(data.children[0]); diff --git a/js/src/visitor/vectorloader.ts b/js/src/visitor/vectorloader.ts index 63431463a9623..cb4bc2829274f 100644 --- a/js/src/visitor/vectorloader.ts +++ b/js/src/visitor/vectorloader.ts @@ -24,7 +24,7 @@ import { Visitor } from '../visitor.js'; import { packBools } from '../util/bit.js'; import { encodeUtf8 } from '../util/utf8.js'; import { Int64, Int128 } from '../util/int.js'; -import { UnionMode, DateUnit } from '../enum.js'; +import { UnionMode, DateUnit, MetadataVersion } from '../enum.js'; import { toArrayBufferView } from '../util/buffer.js'; import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; @@ -42,12 +42,14 @@ export class VectorLoader extends Visitor { private buffers: BufferRegion[]; private buffersIndex = -1; private dictionaries: Map>; - constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>) { + private readonly metadataVersion: MetadataVersion; + constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5) { super(); this.bytes = bytes; this.nodes = nodes; this.buffers = buffers; this.dictionaries = dictionaries; + this.metadataVersion = metadataVersion; } public visit(node: Field | T): Data { @@ -93,14 +95,19 @@ export class VectorLoader extends 
Visitor { public visitStruct(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), children: this.visitMany(type.children) }); } - public visitUnion(type: T) { - return type.mode === UnionMode.Sparse ? this.visitSparseUnion(type as type.SparseUnion) : this.visitDenseUnion(type as type.DenseUnion); + public visitUnion(type: T, { length, nullCount } = this.nextFieldNode()) { + if (this.metadataVersion < MetadataVersion.V5) { + this.readNullBitmap(type, nullCount); + } + return type.mode === UnionMode.Sparse + ? this.visitSparseUnion(type as type.SparseUnion, { length, nullCount }) + : this.visitDenseUnion(type as type.DenseUnion, { length, nullCount }); } public visitDenseUnion(type: T, { length, nullCount } = this.nextFieldNode()) { - return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), typeIds: this.readTypeIds(type), valueOffsets: this.readOffsets(type), children: this.visitMany(type.children) }); + return makeData({ type, length, nullCount, typeIds: this.readTypeIds(type), valueOffsets: this.readOffsets(type), children: this.visitMany(type.children) }); } public visitSparseUnion(type: T, { length, nullCount } = this.nextFieldNode()) { - return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), typeIds: this.readTypeIds(type), children: this.visitMany(type.children) }); + return makeData({ type, length, nullCount, typeIds: this.readTypeIds(type), children: this.visitMany(type.children) }); } public visitDictionary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), data: this.readData(type.indices), dictionary: this.readDictionary(type) }); @@ -133,8 +140,8 @@ export class VectorLoader extends Visitor { /** @ignore */ export class JSONVectorLoader extends VectorLoader { private sources: any[][]; - constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>) { - super(new Uint8Array(0), nodes, buffers, dictionaries); + constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion) { + super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion); this.sources = sources; } protected readNullBitmap(_type: T, nullCount: number, { offset } = this.nextBufferRange()) { diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index a84344e1a0395..a03b22c54c770 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -304,7 +304,7 @@ function generateFloat(this: TestDataVectorGenerator, type: T, function generateUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, undefined, undefined, nullCount != 0); + const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); const values: string[] = new Array(valueOffsets.length - 1).fill(null); [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? 
o - valueOffsets[i] : null) @@ -326,7 +326,7 @@ function generateUtf8(this: TestDataVectorGenerator, type: T, le function generateBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, undefined, undefined, nullCount != 0); + const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); const values = [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) .map((length) => length == null ? null : randomBytes(length)); @@ -425,7 +425,7 @@ function generateList(this: TestDataVectorGenerator, type: T, le const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, childVec.length, stride); + const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues = child.values(); const values: (T['valueType'] | null)[] = [...valueOffsets.slice(1)] @@ -486,7 +486,8 @@ function generateUnion(this: TestDataVectorGenerator, type: T, if (!children) { if (type.mode === UnionMode.Sparse) { - children = type.children.map((f) => this.visit(f.type, length, nullCount)); + const childNullCount = nullCount && Math.trunc(length / nullCount); + children = type.children.map((f) => this.visit(f.type, length, childNullCount)); } else { const childLength = Math.ceil(length / numChildren); const childNullCount = Math.trunc(nullCount / childLength); @@ -498,7 +499,6 @@ function generateUnion(this: TestDataVectorGenerator, type: T, const typeIdsBuffer = new Int8Array(length); const vecs = children.flatMap(({ vector }) => vector.data); const cols = children.map(({ values }) => values); - const nullBitmap = createBitmap(length, nullCount); const typeIdToChildIndex = typeIds.reduce((typeIdToChildIndex, typeId, idx) => { return (typeIdToChildIndex[typeId] = idx) && typeIdToChildIndex || typeIdToChildIndex; }, Object.create(null) as { [key: number]: number }); @@ -507,37 +507,31 @@ function generateUnion(this: TestDataVectorGenerator, type: T, const values = memoize(() => { const values = [] as any[]; const childValues = cols.map((x) => x()); - iterateBitmap(length, nullBitmap, (i, valid) => { - values[i] = !valid ? null : childValues[typeIdToChildIndex[typeIdsBuffer[i]]][i]; - }); + for (let i = -1; ++i < length;) { + values[i] = childValues[typeIdToChildIndex[typeIdsBuffer[i]]][i]; + } return values; }); - iterateBitmap(length, nullBitmap, (i, valid) => { - typeIdsBuffer[i] = !valid ? 0 : typeIds[Math.trunc(rand() * numChildren)]; - }); - return { values, vector: new Vector([makeData({ type: type as SparseUnion, length, nullCount, nullBitmap, typeIds: typeIdsBuffer, children: vecs })]) } as GeneratedVector; + for (let i = -1; ++i < length;) { + typeIdsBuffer[i] = typeIds[Math.trunc(rand() * numChildren)]; + } + return { values, vector: new Vector([makeData({ type: type as SparseUnion, length, nullCount: -1, typeIds: typeIdsBuffer, children: vecs })]) } as GeneratedVector; } const valueOffsets = new Int32Array(length); const values = memoize(() => { const values = [] as any[]; const childValues = cols.map((x) => x()); - iterateBitmap(length, nullBitmap, (i, valid) => { - values[i] = !valid ? 
diff --git a/js/test/unit/generated-data-tests.ts b/js/test/unit/generated-data-tests.ts
index 948b7af70659e..90cf0d598aa6f 100644
--- a/js/test/unit/generated-data-tests.ts
+++ b/js/test/unit/generated-data-tests.ts
@@ -54,7 +54,7 @@ describe('Generated Test Data', () => {
     describe('List', () => { validateVector(generate.list()); });
     describe('Struct', () => { validateVector(generate.struct()); });
     describe('DenseUnion', () => { validateVector(generate.denseUnion()); });
-    // describe('SparseUnion', () => { validateVector(generate.sparseUnion()); });
+    describe('SparseUnion', () => { validateVector(generate.sparseUnion()); });
     describe('Dictionary', () => { validateVector(generate.dictionary()); });
     describe('IntervalDayTime', () => { validateVector(generate.intervalDayTime()); });
     describe('IntervalYearMonth', () => { validateVector(generate.intervalYearMonth()); });
diff --git a/js/test/unit/generated-data-validators.ts b/js/test/unit/generated-data-validators.ts
index 2a73a39cebd0a..52f642d2a6e89 100644
--- a/js/test/unit/generated-data-validators.ts
+++ b/js/test/unit/generated-data-validators.ts
@@ -162,6 +162,8 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) {
             actual = vector.indexOf(value);
             expected = values.findIndex(compare(value));
             expect(actual).toBe(expected);
+            // eslint-disable-next-line jest/prefer-to-contain
+            expect(vector.includes(value)).toBe(true);
         }
         // I would be pretty surprised if randomatic ever generates these values
         expect(vector.indexOf('purple elephants')).toBe(-1);
diff --git a/js/test/unit/visitor-tests.ts b/js/test/unit/visitor-tests.ts
index cadacf21eed39..645fcc60f8d90 100644
--- a/js/test/unit/visitor-tests.ts
+++ b/js/test/unit/visitor-tests.ts
@@ -110,7 +110,9 @@ describe('Visitor', () => {
     test(`visits Dictionary types`, () => validateBasicVisitor(new Dictionary(null as any, null as any)));
     test(`visits Interval types`, () => validateBasicVisitor(new Interval(0)));
    test(`visits FixedSizeList types`, () => validateBasicVisitor(new FixedSizeList(2, null as any)));
-    test(`visits Map types`, () => validateBasicVisitor(new Map_(new Field('', new Struct<{ key: Int; value: Int }>([] as any[])))));
+    test(`visits Map types`, () => validateBasicVisitor(new Map_(new Field('', new Struct<{ key: Utf8; value: Int }>([
+        new Field('key', new Utf8()), new Field('value', new Int8())
+    ] as any[])))));
     function validateBasicVisitor<T extends DataType>(type: T) {
         const visitor = new BasicVisitor();
         const result = visitor.visit(type);
@@ -156,7 +158,9 @@ describe('Visitor', () => {
     test(`visits IntervalDayTime types`, () => validateFeatureVisitor(new IntervalDayTime()));
     test(`visits IntervalYearMonth types`, () => validateFeatureVisitor(new IntervalYearMonth()));
     test(`visits FixedSizeList types`, () => validateFeatureVisitor(new FixedSizeList(2, null as any)));
-    test(`visits Map types`, () => validateFeatureVisitor(new Map_(new Field('', new Struct<{ key: Int; value: Int }>([] as any[])))));
+    test(`visits Map types`, () => validateFeatureVisitor(new Map_(new Field('', new Struct<{ key: Utf8; value: Int }>([
+        new Field('key', new Utf8()), new Field('value', new Int8())
+    ] as any[])))));
     function validateFeatureVisitor<T extends DataType>(type: T) {
         const visitor = new FeatureVisitor();
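
Note (annotation, not part of the patch): a Map_ type's single child must be a struct with key and value fields, so the empty field list the old tests passed was never a well-formed Map. A minimal well-formed construction, assuming the same apache-arrow constructors the diff uses; the canonical Map layout also marks the key field non-nullable:

import { Field, Int8, Map_, Struct, Utf8 } from 'apache-arrow';

// One struct child whose fields are `key` and `value`; the third Field
// argument marks the key field non-nullable.
const mapType = new Map_(new Field('entries', new Struct<{ key: Utf8; value: Int8 }>([
    new Field('key', new Utf8(), false),
    new Field('value', new Int8())
])));
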
diff --git a/js/tsconfig.json b/js/tsconfig.json
index 81324ed2c9eb0..fa352302eaadc 100644
--- a/js/tsconfig.json
+++ b/js/tsconfig.json
@@ -17,5 +17,16 @@
       "apache-arrow/*": ["src/*"]
     }
   },
-  "include": ["src/**/*.ts", "test/**/*.ts", "perf/**/*.ts"]
+  "include": ["src/**/*.ts", "test/**/*.ts", "perf/**/*.ts"],
+  "ts-node": {
+    "transpileOnly": true,
+    "experimentalResolver": true,
+    "compilerOptions": {
+      "module": "CommonJS"
+    },
+    "moduleTypes": {
+      "index.ts": "cjs",
+      "src/**/*": "cjs",
+    }
+  }
 }
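
Note (annotation, not part of the patch): in the new ts-node block, transpileOnly skips type-checking at load time for faster startup, and the module and moduleTypes overrides force the listed sources to be emitted as CommonJS so Node can execute the TypeScript entry points directly. Illustrative effect, under those assumptions:

// With the "module": "CommonJS" override in effect, an ESM-style import in a
// test or script file...
import { tableFromArrays } from 'apache-arrow';

// ...is transpiled by ts-node to a require() call at runtime, so the file
// runs under plain Node without a separate build step.
const table = tableFromArrays({ n: [1, 2, 3] });
console.log(table.numRows); // 3
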