diff --git a/js/perf/index.js b/js/perf/index.js
index 0be4db3084dbf..d31b6430ec871 100644
--- a/js/perf/index.js
+++ b/js/perf/index.js
@@ -41,17 +41,23 @@ for (let { name, buffers} of config) {
     suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
 }
 
-for (let {name, buffers, tests} of require('./table_config')) {
+for (let {name, buffers, countBys, counts} of require('./table_config')) {
+    const table = Table.from(buffers);
+
+    const dfCountBySuite = new Benchmark.Suite(`DataFrame Count By "${name}"`, { async: true });
+    for (const countBy of countBys) {
+        dfCountBySuite.add(createDataFrameCountByTest(table, countBy));
+    }
+
     const dfFilterCountSuite = new Benchmark.Suite(`DataFrame Filter-Scan Count "${name}"`, { async: true });
     const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true });
-    const table = Table.from(buffers);
 
-    for (test of tests) {
+    for (const test of counts) {
         dfFilterCountSuite.add(createDataFrameFilterCountTest(table, test.col, test.test, test.value))
         dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value))
     }
 
-    suites.push(dfFilterCountSuite, dfDirectCountSuite)
+    suites.push(dfCountBySuite, dfFilterCountSuite, dfDirectCountSuite);
 }
 
 console.log('Running apache-arrow performance tests...\n');
@@ -167,6 +173,22 @@ function createDataFrameDirectCountTest(table, column, test, value) {
     };
 }
 
+// Builds one Benchmark.js test case that calls `table.countBy` on the named
+// column. `table` is the Table under test and `column` is the column name;
+// the returned object carries the test's display name and its `fn` body.
+// NOTE(review): relies on `col` being in scope in this file - confirm.
+function createDataFrameCountByTest(table, column) {
+    const colidx = table.columns.findIndex((c)=>c.name === column);
+
+    return {
+        async: true,
+        name: `name: '${column}', length: ${table.length}, type: ${table.columns[colidx].type}`,
+        fn() {
+            table.countBy(col(column));
+        }
+    };
+}
+
 function createDataFrameFilterCountTest(table, column, test, value) {
     let colidx = table.columns.findIndex((c)=>c.name === column);
     let df;
diff --git a/js/perf/table_config.js b/js/perf/table_config.js
index 3c045e4571e44..e3c332c870f38 100644
--- a/js/perf/table_config.js
+++ b/js/perf/table_config.js
@@ -22,7 +22,10 @@ const glob = require('glob');
 const config = [];
 const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`));
 
-tests = {
+const countBys = {
+    "tracks": ['origin', 'destination']
+};
+const counts = {
     "tracks": [
         {col: 'lat', test: 'gteq', value: 0 },
         {col: 'lng', test: 'gteq', value: 0 },
@@ -32,11 +35,12 @@ tests = {
 
 for (const filename of filenames) {
     const { name } = path.parse(filename);
-    if (name in tests) {
+    if (name in counts) {
         config.push({
             name,
             buffers: [fs.readFileSync(filename)],
-            tests: tests[name]
+            countBys: countBys[name] || [],
+            counts: counts[name],
         });
     }
 }
diff --git a/js/src/table.ts b/js/src/table.ts
index 620a4a701c80f..6f312746f2c71 100644
--- a/js/src/table.ts
+++ b/js/src/table.ts
@@ -16,8 +16,10 @@
 // under the License.
 
 import { Vector } from './vector/vector';
+import { DictionaryVector } from './vector/dictionary';
+import { Uint32Vector } from './vector/numeric';
 import { read, readAsync } from './reader/arrow';
-import { Predicate } from './predicate';
+import { Col, Predicate } from './predicate';
 
 export type NextFunc = (idx: number, cols: Vector[]) => void;
 
@@ -40,6 +42,7 @@ export interface DataFrame {
     filter(predicate: Predicate): DataFrame;
     scan(next: NextFunc): void;
     count(): number;
+    countBy(col: (Col|string)): Table;
 }
 
 function columnsFromBatches(batches: Vector[][]) {
@@ -111,6 +114,44 @@ export class Table implements DataFrame {
     count(): number {
         return this.lengths.reduce((acc, val) => acc + val);
     }
+    // Counts rows per dictionary key of a dictionary-encoded column, given
+    // either a Col or a column name. Returns a Table whose single batch holds
+    // the dictionary's data vector and a Uint32Vector of per-key counts.
+    // Throws an Error when the bound column is not a DictionaryVector.
+    countBy(count_by: (Col|string)): Table {
+        if (typeof count_by === 'string') {
+            count_by = new Col(count_by);
+        }
+
+        // the last batch will have the most complete dictionary, use its data
+        // vector as our count by keys
+        count_by.bind(this.batches[this.batches.length - 1]);
+        if (!(count_by.vector instanceof DictionaryVector)) {
+            throw new Error("countBy currently only supports dictionary-encoded columns");
+        }
+
+        const keys: Vector = (count_by.vector as DictionaryVector).data;
+        // TODO: Adjust array byte width based on overall length
+        // (e.g. if this.length <= 255 use Uint8Array, etc...)
+        const counts: Uint32Array = new Uint32Array(keys.length);
+
+
+        for (let batch = -1; ++batch < this.lengths.length;) {
+            const length = this.lengths[batch];
+
+            // load batches
+            const columns = this.batches[batch];
+            count_by.bind(columns);
+
+            // tally every row whose dictionary key is non-null
+            for (let idx = -1; ++idx < length;) {
+                const key = (count_by.vector as DictionaryVector).getKey(idx);
+                if (key !== null) { counts[key]++; }
+            }
+        }
+
+        return new Table({batches: [[keys, new Uint32Vector({data: counts})]]});
+    }
     *[Symbol.iterator]() {
         for (let batch = -1; ++batch < this.lengths.length;) {
             const length = this.lengths[batch];
@@ -177,4 +218,42 @@ class FilteredDataFrame implements DataFrame {
             this.predicate.and(predicate)
         );
     }
+
+    // Same as Table.countBy, but a row is counted only when this frame's
+    // predicate accepts it. Takes a Col or a column name and returns a Table
+    // holding the dictionary's data vector and a Uint32Vector of counts.
+    // Throws an Error when the bound column is not a DictionaryVector.
+    countBy(count_by: (Col|string)): Table {
+        if (typeof count_by === 'string') {
+            count_by = new Col(count_by);
+        }
+
+        // the last batch will have the most complete dictionary, use its data
+        // vector as our count by keys
+        count_by.bind(this.parent.batches[this.parent.batches.length - 1]);
+        if (!(count_by.vector instanceof DictionaryVector)) {
+            throw new Error("countBy currently only supports dictionary-encoded columns");
+        }
+
+        const keys: Vector = (count_by.vector as DictionaryVector).data;
+        const counts: Uint32Array = new Uint32Array(keys.length);
+
+
+        for (let batch = -1; ++batch < this.parent.lengths.length;) {
+            const length = this.parent.lengths[batch];
+
+            // load batches
+            const columns = this.parent.batches[batch];
+            const predicate = this.predicate.bind(columns);
+            count_by.bind(columns);
+
+            // tally every non-null key whose row passes the predicate
+            for (let idx = -1; ++idx < length;) {
+                const key = (count_by.vector as DictionaryVector).getKey(idx);
+                if (key !== null && predicate(idx, columns)) { counts[key]++; }
+            }
+        }
+
+        return new Table({batches: [[keys, new Uint32Vector({data: counts})]]});
+    }
 }