diff --git a/js/src/table.ts b/js/src/table.ts index 554844be2c8c4..d4fe5a93223d8 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -54,14 +54,17 @@ function columnsFromBatches(batches: Vector[][]) { export class Table implements DataFrame { static from(sources?: Iterable | object | string) { - let batches: Vector[][] = [[]]; + let batches: Vector[][] = []; if (sources) { - batches = Array.from(read(sources)); + batches = []; + for (let batch of read(sources)) { + batches.push(batch); + } } return new Table({ batches }); } static async fromAsync(sources?: AsyncIterable) { - let batches: Vector[][] = [[]]; + let batches: Vector[][] = []; if (sources) { batches = []; for await (let batch of readAsync(sources)) { @@ -119,18 +122,17 @@ export class Table implements DataFrame { count_by = new Col(count_by); } - // the last batch will have the most complete dictionary, use it's data - // vector as our count by keys + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary count_by.bind(this.batches[this.batches.length - 1]); if (!(count_by.vector instanceof DictionaryVector)) { - throw new Error("countBy currently only supports dictionary-encoded columns"); + throw new Error('countBy currently only supports dictionary-encoded columns'); } - let keys: Vector = (count_by.vector as DictionaryVector).data; + let data: Vector = (count_by.vector as DictionaryVector).data; // TODO: Adjust array byte width based on overall length // (e.g. if this.length <= 255 use Uint8Array, etc...) - let counts: Uint32Array = new Uint32Array(keys.length); - + let counts: Uint32Array = new Uint32Array(data.length); for (let batch = -1; ++batch < this.lengths.length;) { const length = this.lengths[batch]; @@ -138,15 +140,16 @@ export class Table implements DataFrame { // load batches const columns = this.batches[batch]; count_by.bind(columns); + const keys: Vector = (count_by.vector as DictionaryVector).keys; // yield all indices for (let idx = -1; ++idx < length;) { - let key = (count_by.vector as DictionaryVector).getKey(idx) + let key = keys.get(idx); if (key !== null) { counts[key]++; } } } - return new CountByResult(keys, new Uint32Vector({data: counts})) + return new CountByResult(data, new Uint32Vector({data: counts})); } *[Symbol.iterator]() { for (let batch = -1; ++batch < this.lengths.length;) { @@ -220,16 +223,17 @@ class FilteredDataFrame implements DataFrame { count_by = new Col(count_by); } - // the last batch will have the most complete dictionary, use it's data - // vector as our count by keys + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary count_by.bind(this.parent.batches[this.parent.batches.length - 1]); if (!(count_by.vector instanceof DictionaryVector)) { - throw new Error("countBy currently only supports dictionary-encoded columns"); + throw new Error('countBy currently only supports dictionary-encoded columns'); } - let keys: Vector = (count_by.vector as DictionaryVector).data; - let counts: Uint32Array = new Uint32Array(keys.length); - + const data: Vector = (count_by.vector as DictionaryVector).data; + // TODO: Adjust array byte width based on overall length + // (e.g. if this.length <= 255 use Uint8Array, etc...) + const counts: Uint32Array = new Uint32Array(data.length); for (let batch = -1; ++batch < this.parent.lengths.length;) { const length = this.parent.lengths[batch]; @@ -238,28 +242,29 @@ class FilteredDataFrame implements DataFrame { const columns = this.parent.batches[batch]; const predicate = this.predicate.bind(columns); count_by.bind(columns); + const keys: Vector = (count_by.vector as DictionaryVector).keys; // yield all indices for (let idx = -1; ++idx < length;) { - let key = (count_by.vector as DictionaryVector).getKey(idx) + let key = keys.get(idx); if (key !== null && predicate(idx, columns)) { counts[key]++; } } } - return new CountByResult(keys, new Uint32Vector({data: counts})) + return new CountByResult(data, new Uint32Vector({data: counts})); } } export class CountByResult extends Table implements DataFrame { - constructor(readonly keys: Vector, readonly counts: Vector) { - super({batches: [[keys, counts]]}); + constructor(readonly values: Vector, readonly counts: Vector) { + super({batches: [[values, counts]]}); } asJSON(): Object { let result: {[key: string]: number|null} = {}; for (let i = -1; ++i < this.length;) { - result[this.keys.get(i)] = this.counts.get(i); + result[this.values.get(i)] = this.counts.get(i); } return result; diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 9b19c9e56a243..2b818d7ff70ea 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -26,90 +26,90 @@ const { describe(`Table`, () => { describe(`single record batch`, () => { const table = Table.from({ - "schema": { - "fields": [ + 'schema': { + 'fields': [ { - "name": "f32", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" + 'name': 'f32', + 'type': { + 'name': 'floatingpoint', + 'precision': 'SINGLE' }, - "nullable": false, - "children": [], + 'nullable': false, + 'children': [], }, { - "name": "i32", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 + 'name': 'i32', + 'type': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 32 }, - "nullable": false, - "children": [], + 'nullable': false, + 'children': [], }, { - "name": "dictionary", - "type": { - "name": "utf8" + 'name': 'dictionary', + 'type': { + 'name': 'utf8' }, - "nullable": false, - "children": [], - "dictionary": { - "id": 0, - "indexType": { - "name": "int", - "isSigned": true, - "bitWidth": 8 + 'nullable': false, + 'children': [], + 'dictionary': { + 'id': 0, + 'indexType': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 8 }, - "isOrdered": false + 'isOrdered': false } } ] }, - "dictionaries": [{ - "id": 0, - "data": { - "count": 3, - "columns": [ + 'dictionaries': [{ + 'id': 0, + 'data': { + 'count': 3, + 'columns': [ { - "name": "DICT0", - "count": 3, - "VALIDITY": [], - "OFFSET": [ + 'name': 'DICT0', + 'count': 3, + 'VALIDITY': [], + 'OFFSET': [ 0, 1, 2, 3 ], - "DATA": [ - "a", - "b", - "c", + 'DATA': [ + 'a', + 'b', + 'c', ] } ] } }], - "batches": [{ - "count": 7, - "columns": [ + 'batches': [{ + 'count': 7, + 'columns': [ { - "name": "f32", - "count": 7, - "VALIDITY": [], - "DATA": [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3] + 'name': 'f32', + 'count': 7, + 'VALIDITY': [], + 'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3] }, { - "name": "i32", - "count": 7, - "VALIDITY": [], - "DATA": [-1, 1, -1, 1, -1, 1, -1] + 'name': 'i32', + 'count': 7, + 'VALIDITY': [], + 'DATA': [-1, 1, -1, 1, -1, 1, -1] }, { - "name": "dictionary", - "count": 7, - "VALIDITY": [], - "DATA": [0, 1, 2, 0, 1, 2, 0] + 'name': 'dictionary', + 'count': 7, + 'VALIDITY': [], + 'DATA': [0, 1, 2, 0, 1, 2, 0] } ] }] @@ -125,7 +125,7 @@ describe(`Table`, () => { [new Float32Array([ 0.1])[0], -1, 'b'], [new Float32Array([ 0.2])[0], 1, 'c'], [new Float32Array([ 0.3])[0], -1, 'a'] - ] + ]; test(`has the correct length`, () => { expect(table.length).toEqual(values.length); }); @@ -143,7 +143,7 @@ describe(`Table`, () => { test(`scans expected values`, () => { let expected_idx = 0; table.scan((idx, cols) => { - expect(cols.map((c)=>c.get(idx))).toEqual(values[expected_idx++]); + expect(cols.map((c) => c.get(idx))).toEqual(values[expected_idx++]); }); }); test(`count() returns the correct length`, () => { @@ -179,137 +179,140 @@ describe(`Table`, () => { 'c': 1, }); }); + test(`countBy on non dictionary column throws error`, () => { + expect(() => { table.countBy('i32'); }).toThrow(); + }); }); describe(`multiple record batches`, () => { const table = Table.from({ - "schema": { - "fields": [ + 'schema': { + 'fields': [ { - "name": "f32", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" + 'name': 'f32', + 'type': { + 'name': 'floatingpoint', + 'precision': 'SINGLE' }, - "nullable": false, - "children": [], + 'nullable': false, + 'children': [], }, { - "name": "i32", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 + 'name': 'i32', + 'type': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 32 }, - "nullable": false, - "children": [], + 'nullable': false, + 'children': [], }, { - "name": "dictionary", - "type": { - "name": "utf8" + 'name': 'dictionary', + 'type': { + 'name': 'utf8' }, - "nullable": false, - "children": [], - "dictionary": { - "id": 0, - "indexType": { - "name": "int", - "isSigned": true, - "bitWidth": 8 + 'nullable': false, + 'children': [], + 'dictionary': { + 'id': 0, + 'indexType': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 8 }, - "isOrdered": false + 'isOrdered': false } } ] }, - "dictionaries": [{ - "id": 0, - "data": { - "count": 3, - "columns": [ + 'dictionaries': [{ + 'id': 0, + 'data': { + 'count': 3, + 'columns': [ { - "name": "DICT0", - "count": 3, - "VALIDITY": [], - "OFFSET": [ + 'name': 'DICT0', + 'count': 3, + 'VALIDITY': [], + 'OFFSET': [ 0, 1, 2, 3 ], - "DATA": [ - "a", - "b", - "c", + 'DATA': [ + 'a', + 'b', + 'c', ] } ] } }], - "batches": [{ - "count": 3, - "columns": [ + 'batches': [{ + 'count': 3, + 'columns': [ { - "name": "f32", - "count": 3, - "VALIDITY": [], - "DATA": [-0.3, -0.2, -0.1] + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-0.3, -0.2, -0.1] }, { - "name": "i32", - "count": 3, - "VALIDITY": [], - "DATA": [-1, 1, -1] + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-1, 1, -1] }, { - "name": "dictionary", - "count": 3, - "VALIDITY": [], - "DATA": [0, 1, 2] + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] } ] }, { - "count": 3, - "columns": [ + 'count': 3, + 'columns': [ { - "name": "f32", - "count": 3, - "VALIDITY": [], - "DATA": [0, 0.1, 0.2] + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 0.1, 0.2] }, { - "name": "i32", - "count": 3, - "VALIDITY": [], - "DATA": [1, -1, 1] + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [1, -1, 1] }, { - "name": "dictionary", - "count": 3, - "VALIDITY": [], - "DATA": [0, 1, 2] + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] } ] }, { - "count": 3, - "columns": [ + 'count': 3, + 'columns': [ { - "name": "f32", - "count": 3, - "VALIDITY": [], - "DATA": [0.3, 0.2, 0.1] + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0.3, 0.2, 0.1] }, { - "name": "i32", - "count": 3, - "VALIDITY": [], - "DATA": [-1, 1, -1] + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-1, 1, -1] }, { - "name": "dictionary", - "count": 3, - "VALIDITY": [], - "DATA": [0, 1, 2] + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] } ] }] @@ -327,7 +330,7 @@ describe(`Table`, () => { [new Float32Array([ 0.3])[0], -1, 'a'], [new Float32Array([ 0.2])[0], 1, 'b'], [new Float32Array([ 0.1])[0], -1, 'c'], - ] + ]; test(`has the correct length`, () => { expect(table.length).toEqual(values.length); }); @@ -345,7 +348,7 @@ describe(`Table`, () => { test(`scans expected values`, () => { let expected_idx = 0; table.scan((idx, cols) => { - expect(cols.map((c)=>c.get(idx))).toEqual(values[expected_idx++]); + expect(cols.map((c) => c.get(idx))).toEqual(values[expected_idx++]); }); }); test(`count() returns the correct length`, () => { @@ -381,5 +384,8 @@ describe(`Table`, () => { 'c': 1, }); }); + test(`countBy on non dictionary column throws error`, () => { + expect(() => { table.countBy('i32'); }).toThrow(); + }); }); });