From 99e58da5d58b8587657413b4bcca14a773d6de36 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Thu, 11 Jan 2018 18:35:05 -0500 Subject: [PATCH 01/19] ARROW-1979: [JS] Fix JS builds hanging in es2015 Also fixes [ARROW-1903](https://issues.apache.org/jira/browse/ARROW-1903) Author: Paul Taylor Closes #1471 from trxcllnt/fix-js-es2015-builds and squashes the following commits: 62db3381 [Paul Taylor] update dependencies and add es6+ umd targets to jest transform ignore patterns to fix ci 6ff18e94 [Paul Taylor] ship es2015 commonJS in main package to avoid confusion 74e828af [Paul Taylor] fix typings issues (ARROW-1903) --- js/README.md | 2 +- js/gulp/arrow-task.js | 8 +++--- js/gulp/package-task.js | 6 ++--- js/gulp/test-task.js | 2 +- js/package.json | 38 +++++++++++++-------------- js/src/text-encoding-utf-8.d.ts | 4 --- js/src/vector/numeric.ts | 7 ++--- js/src/vector/virtual.ts | 2 +- js/test/Arrow.ts | 2 +- js/test/integration/validate-tests.ts | 2 +- 10 files changed, 35 insertions(+), 38 deletions(-) delete mode 100644 js/src/text-encoding-utf-8.d.ts diff --git a/js/README.md b/js/README.md index e58d335cd0d42..b427923e37ea1 100644 --- a/js/README.md +++ b/js/README.md @@ -178,7 +178,7 @@ The base `apache-arrow` package includes all the compilation targets for conveni The targets are also published under the `@apache-arrow` namespace: ```sh -npm install apache-arrow # <-- combined es5/CommonJS + UMD, es2015/ESModules + UMD, and TypeScript package +npm install apache-arrow # <-- combined es5/UMD, es2015/CommonJS/ESModules/UMD, and TypeScript package npm install @apache-arrow/ts # standalone TypeScript package npm install @apache-arrow/es5-cjs # standalone es5/CommonJS package npm install @apache-arrow/es5-esm # standalone es5/ESModules package diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js index cc33ee14497b6..d1e8046e67ab9 100644 --- a/js/gulp/arrow-task.js +++ b/js/gulp/arrow-task.js @@ -28,8 +28,8 @@ const { Observable, ReplaySubject } = require('rxjs'); const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, format) { const out = targetDir(target); - const srcGlob = `src/**/*.ts`; - const es5Glob = `${targetDir(`es5`, `cjs`)}/**/*.js`; + const dtsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.ts`; + const cjsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.js`; const esmGlob = `${targetDir(`es2015`, `esm`)}/**/*.js`; const es5UmdGlob = `${targetDir(`es5`, `umd`)}/**/*.js`; const es5UmdMaps = `${targetDir(`es5`, `umd`)}/**/*.map`; @@ -38,8 +38,8 @@ const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, forma const ch_ext = (ext) => gulpRename((p) => { p.extname = ext; }); const append = (ap) => gulpRename((p) => { p.basename += ap; }); return Observable.forkJoin( - observableFromStreams(gulp.src(srcGlob), gulp.dest(out)), // copy src ts files - observableFromStreams(gulp.src(es5Glob), gulp.dest(out)), // copy es5 cjs files + observableFromStreams(gulp.src(dtsGlob), gulp.dest(out)), // copy d.ts files + observableFromStreams(gulp.src(cjsGlob), gulp.dest(out)), // copy es2015 cjs files observableFromStreams(gulp.src(esmGlob), ch_ext(`.mjs`), gulp.dest(out)), // copy es2015 esm files and rename to `.mjs` observableFromStreams(gulp.src(es5UmdGlob), append(`.es5.min`), gulp.dest(out)), // copy es5 umd files and add `.min` observableFromStreams(gulp.src(es5UmdMaps), gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js index fc959643503bd..2976d0ad45d09 100644 --- a/js/gulp/package-task.js +++ b/js/gulp/package-task.js @@ -46,8 +46,8 @@ const createMainPackageJson = (target, format) => (orig) => ({ name: npmPkgName, main: mainExport, module: `${mainExport}.mjs`, - browser: `${mainExport}.es5.min.js`, - [`browser:es2015`]: `${mainExport}.es2015.min.js`, + dist: `${mainExport}.es5.min.js`, + [`dist:es2015`]: `${mainExport}.es2015.min.js`, [`@std/esm`]: { esm: `mjs` } }); @@ -67,7 +67,7 @@ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), { name: `${npmOrgName}/${packageName(target, format)}`, version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, - browser: undefined, [`browser:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined } + dist: undefined, [`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined } ) ) ); diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js index f21aaf2364d03..ab280b092635c 100644 --- a/js/gulp/test-task.js +++ b/js/gulp/test-task.js @@ -34,7 +34,7 @@ argv.update && jestArgv.push(`-u`); argv.verbose && jestArgv.push(`--verbose`); argv.coverage && jestArgv.push(`--coverage`); -const debugArgv = [`--runInBand`, `--env`, `jest-environment-node-debug`]; +const debugArgv = [`--runInBand`, `--env`, `node-debug`]; const jest = require.resolve(path.join(`..`, `node_modules`, `.bin`, `jest`)); const testOptions = { env: { ...process.env }, diff --git a/js/package.json b/js/package.json index 3903d1eedc442..d68e7a6279e61 100644 --- a/js/package.json +++ b/js/package.json @@ -49,10 +49,8 @@ "gulpfile.js", "npm-release.sh" ], - "peerDependencies": { - "command-line-usage": "4.0.1" - }, "dependencies": { + "@types/text-encoding-utf-8": "1.0.1", "command-line-args": "4.0.7", "command-line-usage": "4.0.2", "flatbuffers": "trxcllnt/flatbuffers-esm", @@ -61,45 +59,44 @@ "tslib": "1.8.1" }, "devDependencies": { - "@std/esm": "0.18.0", + "@std/esm": "0.19.1", "@types/flatbuffers": "1.6.5", "@types/glob": "5.0.34", - "@types/jest": "21.1.8", - "@types/node": "8.5.0", - "@types/text-encoding": "0.0.32", + "@types/jest": "22.0.1", + "@types/node": "9.3.0", "ast-types": "0.10.1", "benchmark": "2.1.4", "coveralls": "3.0.0", "del": "3.0.0", - "esdoc": "1.0.3", + "esdoc": "1.0.4", "esdoc-standard-plugin": "1.0.0", "glob": "7.1.2", - "google-closure-compiler": "20171203.0.0", + "google-closure-compiler": "20180101.0.0", "gulp": "github:gulpjs/gulp#6d71a658c61edb3090221579d8f97dbe086ba2ed", "gulp-json-transform": "0.4.5", "gulp-rename": "1.2.2", - "gulp-sourcemaps": "2.6.1", + "gulp-sourcemaps": "2.6.3", "gulp-transform-js-ast": "1.0.2", "gulp-typescript": "3.2.3", "ix": "2.3.4", - "jest": "21.2.1", + "jest": "22.0.5", "jest-environment-node-debug": "2.0.0", "json": "9.0.6", - "lerna": "2.5.1", + "lerna": "2.6.0", "lint-staged": "6.0.0", - "merge2": "1.2.0", + "merge2": "1.2.1", "mkdirp": "0.5.1", "npm-run-all": "4.1.2", "pump": "1.0.2", "rimraf": "2.6.2", - "rxjs": "5.5.5", + "rxjs": "5.5.6", "shx": "0.2.2", "source-map-loader": "0.2.3", "trash": "4.2.1", - "ts-jest": "21.2.4", - "tslint": "5.8.0", + "ts-jest": "22.0.1", + "tslint": "5.9.1", "typescript": "2.6.2", - "uglifyjs-webpack-plugin": "1.1.2", + "uglifyjs-webpack-plugin": "1.1.6", "webpack": "3.10.0", "xml2js": "0.4.19" }, @@ -134,9 +131,12 @@ "/node_modules/" ], "transform": { - ".(ts|tsx)": "/node_modules/ts-jest/preprocessor.js", - ".(js|jsx)": "/node_modules/babel-jest/build/index.js" + ".(ts|tsx)": "./node_modules/ts-jest/preprocessor.js", + ".(js|jsx)": "./node_modules/babel-jest/build/index.js" }, + "transformIgnorePatterns": [ + "/node_modules/", "/(es2015|esnext)\/umd/" + ], "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$" } } diff --git a/js/src/text-encoding-utf-8.d.ts b/js/src/text-encoding-utf-8.d.ts deleted file mode 100644 index 68ba4dfd9a346..0000000000000 --- a/js/src/text-encoding-utf-8.d.ts +++ /dev/null @@ -1,4 +0,0 @@ -declare module 'text-encoding-utf-8' { - import * as TextEncoding from 'text-encoding'; - export = TextEncoding; -} diff --git a/js/src/vector/numeric.ts b/js/src/vector/numeric.ts index fe4767809f465..830d6082bcc4a 100644 --- a/js/src/vector/numeric.ts +++ b/js/src/vector/numeric.ts @@ -34,10 +34,10 @@ export class NumericVector extends Vector { concat(...vectors: Vector[]): Vector { return new VirtualVector(this.data.constructor as TypedArrayConstructor, this, ...vectors); } - slice(start?: number, end?: number) { + slice(start?: number, end?: number): R { const { data, stride } = this, from = start! | 0; const to = end === undefined ? data.length : Math.max(end | 0, from); - return data.subarray(Math.min(from, to) * stride | 0, to * stride | 0); + return data.subarray(Math.min(from, to) * stride | 0, to * stride | 0) as any as R; } } @@ -49,7 +49,8 @@ export class FixedWidthNumericVector extends Numer export class BoolVector extends NumericVector { static pack(values: Iterable) { - let xs = [], n, i = 0; + let n = 0, i = 0; + let xs: number[] = []; let bit = 0, byte = 0; for (const value of values) { value && (byte |= 1 << bit); diff --git a/js/src/vector/virtual.ts b/js/src/vector/virtual.ts index 6ec3a8eef9f4d..42db78706db51 100644 --- a/js/src/vector/virtual.ts +++ b/js/src/vector/virtual.ts @@ -93,7 +93,7 @@ export class VirtualVector implements Vector { // this is a significant improvement as we avoid the memcpy 🎉 if ((source.length / vector.stride | 0) < total) { let vectorsLength = vectors.length; - let count = 0, length = 0, sources = []; + let count = 0, length = 0, sources = [] as any[]; do { sources.push(source); length += source.length; diff --git a/js/test/Arrow.ts b/js/test/Arrow.ts index 87641e52bf3f8..f2c4e930f92e4 100644 --- a/js/test/Arrow.ts +++ b/js/test/Arrow.ts @@ -16,7 +16,7 @@ // under the License. /* tslint:disable */ -// Dynamically load an Ix target build based on command line arguments +// Dynamically load an Arrow target build based on command line arguments const path = require('path'); const target = process.env.TEST_TARGET!; diff --git a/js/test/integration/validate-tests.ts b/js/test/integration/validate-tests.ts index c8778ba2b33c2..c612d62ad0c04 100644 --- a/js/test/integration/validate-tests.ts +++ b/js/test/integration/validate-tests.ts @@ -37,7 +37,7 @@ const arrowBuffers: Uint8Array[] = [fs.readFileSync(arrowPath)]; import Arrow from '../Arrow'; import { zip } from 'ix/iterable/zip'; -import { toArray } from 'ix/iterable/toArray'; +import { toArray } from 'ix/iterable/toarray'; const { Table, read } = Arrow; From a1edac2095b43fa93cfdff99f1aee900f68af4cf Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Fri, 5 Jan 2018 11:47:42 -0500 Subject: [PATCH 02/19] Add perf tests for table scans --- js/generate.py | 36 +++++++++++++++++++ js/perf/index.js | 78 +++++++++++++++++++++++++++++++++++++++++ js/perf/table_config.js | 36 +++++++++++++++++++ 3 files changed, 150 insertions(+) create mode 100644 js/generate.py create mode 100644 js/perf/table_config.js diff --git a/js/generate.py b/js/generate.py new file mode 100644 index 0000000000000..bf663fb0b1f9f --- /dev/null +++ b/js/generate.py @@ -0,0 +1,36 @@ +import pyarrow as pa +import random +import numpy as np +import pandas as pd + + +cities = [u'Charlottesville', u'New York', u'San Francisco', u'Seattle', u'Terre Haute', u'Washington, DC'] + +def generate_batch(batch_len): + return pa.RecordBatch.from_arrays([ + pa.Array.from_pandas(pd.Series(np.random.uniform(-90,90,batch_len), dtype="float32")), + pa.Array.from_pandas(pd.Series(np.random.uniform(-180,180,batch_len), dtype="float32")), + pa.Array.from_pandas(pd.Categorical((random.choice(cities) for i in range(batch_len)), cities)), + pa.Array.from_pandas(pd.Categorical((random.choice(cities) for i in range(batch_len)), cities)) + ], ['lat', 'lng', 'origin', 'destination']) + +def write_record_batches(fd, batch_len, num_batches): + writer = pa.ipc.RecordBatchStreamWriter(fd, generate_batch(1).schema) + for batch in range(num_batches): + writer.write_batch(generate_batch(batch_len)) + + writer.close() + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('filename', help='number of batches') + parser.add_argument('-n', '--num-batches', help='number of batches', type=int, default=10) + parser.add_argument('-b', '--batch-size', help='size of each batch', type=int, default=100000) + + args = parser.parse_args() + + print "Writing {} {}-element batches to '{}'".format(args.num_batches, args.batch_size, args.filename) + with open(args.filename, 'w') as fd: + write_record_batches(fd, args.batch_size, args.num_batches) diff --git a/js/perf/index.js b/js/perf/index.js index 9eac40e64ac71..03501913d8155 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -41,6 +41,21 @@ for (let { name, buffers} of config) { suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); } +for (let {name, buffers, tests} of require('./table_config')) { + const tableIterateSuite = new Benchmark.Suite(`Table Iterate ${name}`, { async: true }); + const tableCountBySuite = new Benchmark.Suite(`Table Count By ${name}`, { async: true }); + const vectorCountBySuite = new Benchmark.Suite(`Vector Count By ${name}`, { async: true }); + const table = Table.from(buffers); + + tableIterateSuite.add(createTableIterateTest(table)); + for (test of tests) { + tableCountBySuite.add(createTableCountByTest(table, test.col, test.test, test.value)) + vectorCountBySuite.add(createVectorCountByTest(table.columns[test.col], test.test, test.value)) + } + + suites.push(tableIterateSuite, tableCountBySuite, vectorCountBySuite) +} + console.log('Running apache-arrow performance tests...\n'); run(); @@ -109,3 +124,66 @@ function createGetByIndexTest(vector) { } }; } + +function createVectorCountByTest(vector, test, value) { + let op; + if (test == 'gteq') { + op = function () { + sum = 0; + for (cell of vector) { + sum += (cell >= value) + } + } + } else if (test == 'eq') { + op = function () { + sum = 0; + for (cell of vector) { + sum += (cell == value) + } + } + } else { + throw new Error(`Unrecognized test "$test"`); + } + + return { + async: true, + name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}, test: ${test}, value: ${value}`, + fn: op + }; +} + +function createTableIterateTest(table) { + let row; + return { + async: true, + name: `length: ${table.length}`, + fn() { for (row of table) {} } + }; +} + +function createTableCountByTest(table, column, test, value) { + let op; + if (test == 'gteq') { + op = function () { + sum = 0; + for (row of table) { + sum += (row.get(column) >= value) + } + } + } else if (test == 'eq') { + op = function() { + sum = 0; + for (row of table) { + sum += (row.get(column) == value) + } + } + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, + fn: op + }; +} diff --git a/js/perf/table_config.js b/js/perf/table_config.js new file mode 100644 index 0000000000000..7bface6d2cdde --- /dev/null +++ b/js/perf/table_config.js @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const fs = require('fs'); +const path = require('path'); +const glob = require('glob'); + +const config = []; +const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`)); + +tests = [ + {col: 0, test: 'gteq', value: 0 }, + {col: 1, test: 'gteq', value: 0 }, + {col: 2, test: 'eq', value: 'Seattle'}, +] + +for (const filename of filenames) { + const { name } = path.parse(filename); + config.push({ name, buffers: [fs.readFileSync(filename)], tests }); +} + +module.exports = config; From 30f0330997602a9e817b536b2bfd0f8dbaf2aa4d Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Tue, 9 Jan 2018 17:24:25 -0500 Subject: [PATCH 03/19] Add basic DataFrame impl ... ... and a bunch of performance tests for various scanning approaches --- js/perf/index.js | 171 ++++++++++++++++++++++++++++------ js/perf/table_config.js | 2 +- js/src/Arrow.ts | 6 ++ js/src/dataframe/dataframe.ts | 109 ++++++++++++++++++++++ js/src/vector/virtual.ts | 3 + 5 files changed, 262 insertions(+), 29 deletions(-) create mode 100644 js/src/dataframe/dataframe.ts diff --git a/js/perf/index.js b/js/perf/index.js index 03501913d8155..95396a986de01 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,44 +16,52 @@ // under the License. // Use the ES5 UMD target as perf baseline -// const { Table, readVectors } = require('../targets/es5/umd'); -// const { Table, readVectors } = require('../targets/es5/cjs'); -const { Table, readVectors } = require('../targets/es2015/umd'); -// const { Table, readVectors } = require('../targets/es2015/cjs'); +// const { DataFrame, Table, readVectors } = require('../targets/es5/umd'); +// const { DataFrame, Table, readVectors } = require('../targets/es5/cjs'); +// const { DataFrame, Table, readVectors } = require('../targets/es2015/umd'); +const { DataFrame, Table, readVectors } = require('../targets/es2015/cjs'); const config = require('./config'); const Benchmark = require('benchmark'); const suites = []; -for (let { name, buffers} of config) { - const parseSuite = new Benchmark.Suite(`Parse ${name}`, { async: true }); - const sliceSuite = new Benchmark.Suite(`Slice ${name} vectors`, { async: true }); - const iterateSuite = new Benchmark.Suite(`Iterate ${name} vectors`, { async: true }); - const getByIndexSuite = new Benchmark.Suite(`Get ${name} values by index`, { async: true }); - parseSuite.add(createFromTableTest(name, buffers)); - parseSuite.add(createReadVectorsTest(name, buffers)); - for (const vector of Table.from(buffers).columns) { - sliceSuite.add(createSliceTest(vector)); - iterateSuite.add(createIterateTest(vector)); - getByIndexSuite.add(createGetByIndexTest(vector)); - } - suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); -} +//for (let { name, buffers} of config) { +// const parseSuite = new Benchmark.Suite(`Parse "${name}"`, { async: true }); +// const sliceSuite = new Benchmark.Suite(`Slice "${name}" vectors`, { async: true }); +// const iterateSuite = new Benchmark.Suite(`Iterate "${name}" vectors`, { async: true }); +// const getByIndexSuite = new Benchmark.Suite(`Get "${name}" values by index`, { async: true }); +// parseSuite.add(createFromTableTest(name, buffers)); +// parseSuite.add(createReadVectorsTest(name, buffers)); +// for (const vector of Table.from(buffers).columns) { +// sliceSuite.add(createSliceTest(vector)); +// iterateSuite.add(createIterateTest(vector)); +// getByIndexSuite.add(createGetByIndexTest(vector)); +// } +// suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); +//} for (let {name, buffers, tests} of require('./table_config')) { - const tableIterateSuite = new Benchmark.Suite(`Table Iterate ${name}`, { async: true }); - const tableCountBySuite = new Benchmark.Suite(`Table Count By ${name}`, { async: true }); - const vectorCountBySuite = new Benchmark.Suite(`Vector Count By ${name}`, { async: true }); + const tableIteratorSuite = new Benchmark.Suite(`Table Iterator "${name}"`, { async: true }); + const tableCountSuite = new Benchmark.Suite(`Table Count "${name}"`, { async: true }); + const dfIteratorSuite = new Benchmark.Suite(`DataFrame Iterator "${name}"`, { async: true }); + const dfIteratorCountSuite = new Benchmark.Suite(`DataFrame Iterator Count "${name}"`, { async: true }); + const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true }); + const dfScanCountSuite = new Benchmark.Suite(`DataFrame Scan Count "${name}"`, { async: true }); + const vectorCountSuite = new Benchmark.Suite(`Vector Count "${name}"`, { async: true }); const table = Table.from(buffers); - tableIterateSuite.add(createTableIterateTest(table)); + tableIteratorSuite.add(createTableIteratorTest(table)); + dfIteratorSuite.add(createDataFrameIteratorTest(table)); for (test of tests) { - tableCountBySuite.add(createTableCountByTest(table, test.col, test.test, test.value)) - vectorCountBySuite.add(createVectorCountByTest(table.columns[test.col], test.test, test.value)) + tableCountSuite.add(createTableCountTest(table, test.col, test.test, test.value)) + dfIteratorCountSuite.add(createDataFrameIteratorCountTest(table, test.col, test.test, test.value)) + dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value)) + dfScanCountSuite.add(createDataFrameScanCountTest(table, test.col, test.test, test.value)) + vectorCountSuite.add(createVectorCountTest(table.columns[test.col], test.test, test.value)) } - suites.push(tableIterateSuite, tableCountBySuite, vectorCountBySuite) + suites.push(tableIteratorSuite, tableCountSuite, dfIteratorSuite, dfIteratorCountSuite, dfDirectCountSuite, dfScanCountSuite, vectorCountSuite) } console.log('Running apache-arrow performance tests...\n'); @@ -125,7 +133,7 @@ function createGetByIndexTest(vector) { }; } -function createVectorCountByTest(vector, test, value) { +function createVectorCountTest(vector, test, value) { let op; if (test == 'gteq') { op = function () { @@ -152,7 +160,7 @@ function createVectorCountByTest(vector, test, value) { }; } -function createTableIterateTest(table) { +function createTableIteratorTest(table) { let row; return { async: true, @@ -161,7 +169,7 @@ function createTableIterateTest(table) { }; } -function createTableCountByTest(table, column, test, value) { +function createTableCountTest(table, column, test, value) { let op; if (test == 'gteq') { op = function () { @@ -187,3 +195,110 @@ function createTableCountByTest(table, column, test, value) { fn: op }; } + +function createDataFrameIteratorTest(table) { + let df = DataFrame.from(table); + let idx; + return { + async: true, + name: `length: ${table.length}`, + fn() { for (idx of table) {} } + }; +} + +function createDataFrameDirectCountTest(table, column, test, value) { + let df = DataFrame.from(table); + + if (test == 'gteq') { + op = function () { + sum = 0; + for (let batch = -1; ++batch < df.lengths.length;) { + const length = df.lengths[batch]; + + // load batches + const columns = df.getBatch(batch); + + // yield all indices + for (let idx = -1; ++idx < length;) { + sum += (columns[column].get(idx) >= value); + } + } + } + } else if (test == 'eq') { + op = function() { + sum = 0; + for (let batch = -1; ++batch < df.lengths.length;) { + const length = df.lengths[batch]; + + // load batches + const columns = df.getBatch(batch); + + // yield all indices + for (let idx = -1; ++idx < length;) { + sum += (columns[column].get(idx) == value); + } + } + } + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, + fn: op + }; +} + +function createDataFrameScanCountTest(table, column, test, value) { + let df = DataFrame.from(table); + + if (test == 'gteq') { + op = function () { + sum = 0; + df.scan((idx, cols)=>{sum += cols[column].get(idx) >= value}); + } + } else if (test == 'eq') { + op = function() { + sum = 0; + df.scan((idx, cols)=>{sum += cols[column].get(idx) == value}); + console.log(sum); + } + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, + fn: op + }; +} + +function createDataFrameIteratorCountTest(table, column, test, value) { + let df = DataFrame.from(table); + + if (test == 'gteq') { + op = function () { + sum = 0; + for (idx of df) { + sum += (df.columns[column].get(idx) >= value); + } + } + } else if (test == 'eq') { + op = function() { + sum = 0; + for (idx of df) { + sum += (df.columns[column].get(idx) == value); + } + } + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, + fn: op + }; +} diff --git a/js/perf/table_config.js b/js/perf/table_config.js index 7bface6d2cdde..06c9198353b80 100644 --- a/js/perf/table_config.js +++ b/js/perf/table_config.js @@ -25,7 +25,7 @@ const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.a tests = [ {col: 0, test: 'gteq', value: 0 }, {col: 1, test: 'gteq', value: 0 }, - {col: 2, test: 'eq', value: 'Seattle'}, + //{col: 2, test: 'eq', value: 'Seattle'}, ] for (const filename of filenames) { diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 3a8943434eece..a52deeb4992c0 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -45,12 +45,15 @@ import { TimestampVector, } from './vector/numeric'; +import { DataFrame } from './dataframe/dataframe'; + // closure compiler always erases static method names: // https://github.com/google/closure-compiler/issues/1776 // set them via string indexers to save them from the mangler Table['from'] = Table.from; Table['fromAsync'] = Table.fromAsync; BoolVector['pack'] = BoolVector.pack; +DataFrame['from'] = DataFrame.from; export { read, readAsync }; export { Table, Vector, StructRow }; @@ -84,6 +87,8 @@ export { FixedSizeListVector, }; +export { DataFrame } from './dataframe/dataframe'; + /* These exports are needed for the closure umd targets */ try { const Arrow = eval('exports'); @@ -93,6 +98,7 @@ try { Arrow['readAsync'] = readAsync; Arrow['Table'] = Table; Arrow['Vector'] = Vector; + Arrow['DataFrame'] = DataFrame; Arrow['StructRow'] = StructRow; Arrow['BoolVector'] = BoolVector; Arrow['ListVector'] = ListVector; diff --git a/js/src/dataframe/dataframe.ts b/js/src/dataframe/dataframe.ts new file mode 100644 index 0000000000000..ed58f174aa425 --- /dev/null +++ b/js/src/dataframe/dataframe.ts @@ -0,0 +1,109 @@ +import { Vector } from "../vector/vector"; +import { StructVector } from "../vector/struct"; +import { VirtualVector } from "../vector/virtual"; + +export abstract class DataFrame { + public abstract columns: Vector[]; + public abstract getBatch(batch: number): Vector[]; + public abstract scan(next: (idx: number, cols: Vector[])=>void): void; + static from(table: Vector): DataFrame { + // There are two types of Vectors we might want to make into + // a ChunkedDataFrame: + // 1) a StructVector of all VirtualVectors + // 2) a VirtualVector of all StructVectors + if (table instanceof StructVector) { + if (table.columns.every((col) => col instanceof VirtualVector)) { + // ChunkedDataFrame case (1) + return new ChunkedDataFrame(table.columns as VirtualVector[]); + } else { + return new SimpleDataFrame(table.columns) + } + } else if (table instanceof VirtualVector && + table.vectors.every((vec) => vec instanceof StructVector)) { + const structs = table.vectors as StructVector[]; + const rest: StructVector[] = structs.slice(1); + const virtuals: VirtualVector[] = structs[0].columns.map((vec, col_idx) => { + return vec.concat(...rest.map((vec) => vec.columns[col_idx])); + }) as VirtualVector[]; + // ChunkedDataFrame case (2) + return new ChunkedDataFrame(virtuals); + } else { + return new SimpleDataFrame([table]); + } + } +} + +class SimpleDataFrame extends DataFrame { + readonly lengths: Uint32Array; + constructor(public columns: Vector[]) { + super(); + if (!this.columns.slice(1).every((v) => v.length === this.columns[0].length)) { + throw new Error("Attempted to create a DataFrame with un-aligned vectors"); + } + this.lengths = new Uint32Array([0, this.columns[0].length]); + } + + public getBatch() { + return this.columns; + } + + public scan(next: (idx: number, cols: Vector[])=>void) { + for (let idx = -1; ++idx < this.lengths[1];) { + next(idx, this.columns) + } + } + + *[Symbol.iterator]() { + for (let idx = -1; ++idx < this.lengths[1];) { + yield idx; + } + } +} + +class ChunkedDataFrame extends DataFrame { + public columns: Vector[]; + readonly lengths: Uint32Array; + constructor(private virtuals: VirtualVector[]) { + super(); + const offsets = virtuals[0].offsets; + if (!this.virtuals.slice(1).every((v) => v.aligned(virtuals[0]))) { + throw new Error("Attempted to create a DataFrame with un-aligned vectors"); + } + this.lengths = new Uint32Array(offsets.length); + offsets.forEach((offset, i) => { + this.lengths[i] = offsets[i+1] - offset;; + }); + } + + getBatch(batch: number): Vector[] { + return this.virtuals.map((virt) => virt.vectors[batch]); + } + + scan(next: (idx: number, cols: Vector[])=>void) { + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; + + // load batches + const columns = this.getBatch(batch); + + // yield all indices + for (let idx = -1; ++idx < length;) { + next(idx, columns) + } + } + } + + *[Symbol.iterator]() { + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; + + // load batches + this.columns = this.getBatch(batch); + + // yield all indices + for (let idx = -1; ++idx < length;) { + yield idx; + } + } + } +} diff --git a/js/src/vector/virtual.ts b/js/src/vector/virtual.ts index 42db78706db51..9dec75254595f 100644 --- a/js/src/vector/virtual.ts +++ b/js/src/vector/virtual.ts @@ -115,6 +115,9 @@ export class VirtualVector implements Vector { } return new ArrayType(0); } + aligned(other: VirtualVector): boolean { + return this.offsets.every((offset, i) => offset === other.offsets[i]); + } } function arraySet(source: T[], target: T[], index: number) { From 796f45dda65f049dc5f75c0564130dca4d733e71 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Wed, 10 Jan 2018 11:44:45 -0500 Subject: [PATCH 04/19] add DataFrame filter and count ops --- js/perf/index.js | 23 +++++++- js/src/dataframe/dataframe.ts | 98 +++++++++++++++++++++++++++++------ 2 files changed, 105 insertions(+), 16 deletions(-) diff --git a/js/perf/index.js b/js/perf/index.js index 95396a986de01..74dbd872d8a6e 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -48,6 +48,7 @@ for (let {name, buffers, tests} of require('./table_config')) { const dfIteratorCountSuite = new Benchmark.Suite(`DataFrame Iterator Count "${name}"`, { async: true }); const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true }); const dfScanCountSuite = new Benchmark.Suite(`DataFrame Scan Count "${name}"`, { async: true }); + const dfFilterCountSuite = new Benchmark.Suite(`DataFrame Filter Scan Count "${name}"`, { async: true }); const vectorCountSuite = new Benchmark.Suite(`Vector Count "${name}"`, { async: true }); const table = Table.from(buffers); @@ -58,10 +59,11 @@ for (let {name, buffers, tests} of require('./table_config')) { dfIteratorCountSuite.add(createDataFrameIteratorCountTest(table, test.col, test.test, test.value)) dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value)) dfScanCountSuite.add(createDataFrameScanCountTest(table, test.col, test.test, test.value)) + dfFilterCountSuite.add(createDataFrameFilterCountTest(table, test.col, test.test, test.value)) vectorCountSuite.add(createVectorCountTest(table.columns[test.col], test.test, test.value)) } - suites.push(tableIteratorSuite, tableCountSuite, dfIteratorSuite, dfIteratorCountSuite, dfDirectCountSuite, dfScanCountSuite, vectorCountSuite) + suites.push(tableIteratorSuite, tableCountSuite, dfIteratorSuite, dfIteratorCountSuite, dfDirectCountSuite, dfScanCountSuite, dfFilterCountSuite, vectorCountSuite) } console.log('Running apache-arrow performance tests...\n'); @@ -275,6 +277,25 @@ function createDataFrameScanCountTest(table, column, test, value) { }; } +function createDataFrameFilterCountTest(table, column, test, value) { + let df = DataFrame.from(table); + if (test == 'gteq') { + df = df.filter((idx, cols)=>cols[column].get(idx) >= value); + } else if (test == 'eq') { + df = df.filter((idx, cols)=>cols[column].get(idx) == value); + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, + fn() { + df.count(); + } + }; +} + function createDataFrameIteratorCountTest(table, column, test, value) { let df = DataFrame.from(table); diff --git a/js/src/dataframe/dataframe.ts b/js/src/dataframe/dataframe.ts index ed58f174aa425..dc3dd78156b9e 100644 --- a/js/src/dataframe/dataframe.ts +++ b/js/src/dataframe/dataframe.ts @@ -2,10 +2,18 @@ import { Vector } from "../vector/vector"; import { StructVector } from "../vector/struct"; import { VirtualVector } from "../vector/virtual"; +export type NextFunc = (idx: number, cols: Vector[]) => void; +export type PredicateFunc = (idx: number, cols: Vector[]) => boolean; + export abstract class DataFrame { + constructor(readonly lengths: Uint32Array) {} public abstract columns: Vector[]; public abstract getBatch(batch: number): Vector[]; - public abstract scan(next: (idx: number, cols: Vector[])=>void): void; + public abstract scan(next: NextFunc): void; + public filter(predicate: PredicateFunc): DataFrame { + return new FilteredDataFrame(this, predicate); + } + static from(table: Vector): DataFrame { // There are two types of Vectors we might want to make into // a ChunkedDataFrame: @@ -31,23 +39,26 @@ export abstract class DataFrame { return new SimpleDataFrame([table]); } } + + count(): number { + return this.lengths.reduce((acc, val) => acc + val); + } } class SimpleDataFrame extends DataFrame { readonly lengths: Uint32Array; constructor(public columns: Vector[]) { - super(); + super(new Uint32Array([0, columns[0].length])); if (!this.columns.slice(1).every((v) => v.length === this.columns[0].length)) { throw new Error("Attempted to create a DataFrame with un-aligned vectors"); } - this.lengths = new Uint32Array([0, this.columns[0].length]); } public getBatch() { return this.columns; } - public scan(next: (idx: number, cols: Vector[])=>void) { + public scan(next: NextFunc) { for (let idx = -1; ++idx < this.lengths[1];) { next(idx, this.columns) } @@ -62,24 +73,16 @@ class SimpleDataFrame extends DataFrame { class ChunkedDataFrame extends DataFrame { public columns: Vector[]; - readonly lengths: Uint32Array; constructor(private virtuals: VirtualVector[]) { - super(); - const offsets = virtuals[0].offsets; - if (!this.virtuals.slice(1).every((v) => v.aligned(virtuals[0]))) { - throw new Error("Attempted to create a DataFrame with un-aligned vectors"); - } - this.lengths = new Uint32Array(offsets.length); - offsets.forEach((offset, i) => { - this.lengths[i] = offsets[i+1] - offset;; - }); + super(ChunkedDataFrame.getLengths(virtuals)); + this.virtuals = virtuals; } getBatch(batch: number): Vector[] { return this.virtuals.map((virt) => virt.vectors[batch]); } - scan(next: (idx: number, cols: Vector[])=>void) { + scan(next: NextFunc) { for (let batch = -1; ++batch < this.lengths.length;) { const length = this.lengths[batch]; @@ -106,4 +109,69 @@ class ChunkedDataFrame extends DataFrame { } } } + + private static getLengths(virtuals: VirtualVector[]): Uint32Array { + if (!virtuals.slice(1).every((v) => v.aligned(virtuals[0]))) { + throw new Error("Attempted to create a DataFrame with un-aligned vectors"); + } + return new Uint32Array(virtuals[0].vectors.map((v)=>v.length)); + } +} + +class FilteredDataFrame extends DataFrame { + public columns: Vector[]; + constructor (readonly parent: DataFrame, private predicate: PredicateFunc) { + super(parent.lengths); + } + + getBatch(batch: number): Vector[] { + return this.parent.getBatch(batch); + }; + + scan(next: NextFunc) { + // inlined version of this: + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) next(idx, columns); + // }); + for (let batch = -1; ++batch < this.parent.lengths.length;) { + const length = this.parent.lengths[batch]; + + // load batches + const columns = this.parent.getBatch(batch); + + // yield all indices + for (let idx = -1; ++idx < length;) { + if (this.predicate(idx, columns)) next(idx, columns); + } + } + } + + count(): number { + // inlined version of this: + // let sum = 0; + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) ++sum; + // }); + // return sum; + let sum = 0; + for (let batch = -1; ++batch < this.parent.lengths.length;) { + const length = this.parent.lengths[batch]; + + // load batches + const columns = this.parent.getBatch(batch); + + // yield all indices + for (let idx = -1; ++idx < length;) { + if (this.predicate(idx, columns)) ++sum; + } + } + return sum; + } + + filter(predicate: PredicateFunc): DataFrame { + return new FilteredDataFrame( + this.parent, + (idx, cols) => this.predicate(idx, cols) && predicate(idx, cols) + ); + } } From 4d9e8c0667a44f6e00bb882ff543eb290b0d7644 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Wed, 10 Jan 2018 13:36:29 -0500 Subject: [PATCH 05/19] Add concept of predicates for filtering dataframes --- js/perf/index.js | 12 +-- js/src/Arrow.ts | 5 + js/src/dataframe/dataframe.ts | 17 ++-- js/src/dataframe/predicate.ts | 171 ++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+), 13 deletions(-) create mode 100644 js/src/dataframe/predicate.ts diff --git a/js/perf/index.js b/js/perf/index.js index 74dbd872d8a6e..b5789e8b34c07 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,10 +16,10 @@ // under the License. // Use the ES5 UMD target as perf baseline -// const { DataFrame, Table, readVectors } = require('../targets/es5/umd'); -// const { DataFrame, Table, readVectors } = require('../targets/es5/cjs'); -// const { DataFrame, Table, readVectors } = require('../targets/es2015/umd'); -const { DataFrame, Table, readVectors } = require('../targets/es2015/cjs'); +// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es5/umd'); +// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es5/cjs'); +// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es2015/umd'); +const { lit, col, DataFrame, Table, readVectors } = require('../targets/es2015/cjs'); const config = require('./config'); const Benchmark = require('benchmark'); @@ -280,9 +280,9 @@ function createDataFrameScanCountTest(table, column, test, value) { function createDataFrameFilterCountTest(table, column, test, value) { let df = DataFrame.from(table); if (test == 'gteq') { - df = df.filter((idx, cols)=>cols[column].get(idx) >= value); + df = df.filter(col(table.columns[column].name).gteq(value)); } else if (test == 'eq') { - df = df.filter((idx, cols)=>cols[column].get(idx) == value); + df = df.filter(col(table.columns[column].name).eq(value)); } else { throw new Error(`Unrecognized test "${test}"`); } diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index a52deeb4992c0..ce7235b8b13d4 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -46,6 +46,7 @@ import { } from './vector/numeric'; import { DataFrame } from './dataframe/dataframe'; +import { lit, col } from './dataframe/predicate'; // closure compiler always erases static method names: // https://github.com/google/closure-compiler/issues/1776 @@ -88,12 +89,16 @@ export { }; export { DataFrame } from './dataframe/dataframe'; +export { lit, col } from './dataframe/predicate'; + /* These exports are needed for the closure umd targets */ try { const Arrow = eval('exports'); if (typeof Arrow === 'object') { // string indexers tell closure compiler not to rename these properties + Arrow['lit'] = lit; + Arrow['col'] = col; Arrow['read'] = read; Arrow['readAsync'] = readAsync; Arrow['Table'] = Table; diff --git a/js/src/dataframe/dataframe.ts b/js/src/dataframe/dataframe.ts index dc3dd78156b9e..0dbb19bbbdd4e 100644 --- a/js/src/dataframe/dataframe.ts +++ b/js/src/dataframe/dataframe.ts @@ -2,15 +2,16 @@ import { Vector } from "../vector/vector"; import { StructVector } from "../vector/struct"; import { VirtualVector } from "../vector/virtual"; +import { Predicate } from "./predicate" + export type NextFunc = (idx: number, cols: Vector[]) => void; -export type PredicateFunc = (idx: number, cols: Vector[]) => boolean; export abstract class DataFrame { constructor(readonly lengths: Uint32Array) {} public abstract columns: Vector[]; public abstract getBatch(batch: number): Vector[]; public abstract scan(next: NextFunc): void; - public filter(predicate: PredicateFunc): DataFrame { + public filter(predicate: Predicate): DataFrame { return new FilteredDataFrame(this, predicate); } @@ -120,7 +121,7 @@ class ChunkedDataFrame extends DataFrame { class FilteredDataFrame extends DataFrame { public columns: Vector[]; - constructor (readonly parent: DataFrame, private predicate: PredicateFunc) { + constructor (readonly parent: DataFrame, private predicate: Predicate) { super(parent.lengths); } @@ -138,10 +139,11 @@ class FilteredDataFrame extends DataFrame { // load batches const columns = this.parent.getBatch(batch); + const predicate = this.predicate.bind(columns); // yield all indices for (let idx = -1; ++idx < length;) { - if (this.predicate(idx, columns)) next(idx, columns); + if (predicate(idx, columns)) next(idx, columns); } } } @@ -159,19 +161,20 @@ class FilteredDataFrame extends DataFrame { // load batches const columns = this.parent.getBatch(batch); + const predicate = this.predicate.bind(columns); // yield all indices for (let idx = -1; ++idx < length;) { - if (this.predicate(idx, columns)) ++sum; + if (predicate(idx, columns)) ++sum; } } return sum; } - filter(predicate: PredicateFunc): DataFrame { + filter(predicate: Predicate): DataFrame { return new FilteredDataFrame( this.parent, - (idx, cols) => this.predicate(idx, cols) && predicate(idx, cols) + this.predicate.and(predicate) ); } } diff --git a/js/src/dataframe/predicate.ts b/js/src/dataframe/predicate.ts new file mode 100644 index 0000000000000..4438c0adbaa98 --- /dev/null +++ b/js/src/dataframe/predicate.ts @@ -0,0 +1,171 @@ +import { Vector } from "../vector/vector"; + +export type ValueFunc = (idx: number, cols: Vector[]) => T|null; +export type PredicateFunc = (idx: number, cols: Vector[]) => boolean; + +export abstract class Value { + eq(other: Value|T): Predicate { + if (!(other instanceof Value)) other = new Literal(other); + return new Equals(this, other); + } + lteq(other: Value|T): Predicate { + if (!(other instanceof Value)) other = new Literal(other); + return new LTeq(this, other); + } + gteq(other: Value|T): Predicate { + if (!(other instanceof Value)) other = new Literal(other); + return new GTeq(this, other); + } +} + +class Literal extends Value { + constructor(public v: T) { super(); } +} + +class Col extends Value { + vector: Vector; + colidx: number; + + constructor(public name: string) { super(); } + bind(cols: Vector[]) { + if (!this.colidx) { + // Assume column index doesn't change between calls to bind + //this.colidx = cols.findIndex(v => v.name.indexOf(this.name) != -1); + this.colidx = -1; + for (let idx = -1; ++idx < cols.length;) { + if (cols[idx].name === this.name) { + this.colidx = idx; + break; + } + } + if (this.colidx < 0) throw new Error(`Failed to bind Col "${this.name}"`) + } + this.vector = cols[this.colidx] + return this.vector.get.bind(this.vector); + } + + emitString() { return `cols[${this.colidx}].get(idx)`; } +} + +export abstract class Predicate { + abstract bind(cols: Vector[]): PredicateFunc; + and(expr: Predicate): Predicate { return new And(this, expr); } + or(expr: Predicate): Predicate { return new Or(this, expr); } + ands(): Predicate[] { return [this]; } +} + +abstract class ComparisonPredicate extends Predicate { + constructor(public readonly left: Value, public readonly right: Value) { + super(); + } + + bind(cols: Vector[]) { + if (this.left instanceof Literal) { + if (this.right instanceof Literal) { + return this._bindLitLit(cols, this.left, this.right); + } else { // right is a Col + + return this._bindColLit(cols, this.right as Col, this.left); + } + } else { // left is a Col + if (this.right instanceof Literal) { + return this._bindColLit(cols, this.left as Col, this.right); + } else { // right is a Col + return this._bindColCol(cols, this.left as Col, this.right as Col); + } + } + } + + protected abstract _bindLitLit(cols: Vector[], left: Literal, right: Literal): PredicateFunc; + protected abstract _bindColCol(cols: Vector[], left: Col , right: Col ): PredicateFunc; + protected abstract _bindColLit(cols: Vector[], col: Col , lit: Literal ): PredicateFunc; +} + +abstract class CombinationPredicate extends Predicate { + constructor(public readonly left: Predicate, public readonly right: Predicate) { + super(); + } +} + +class And extends CombinationPredicate { + bind(cols: Vector[]) { + const left = this.left.bind(cols); + const right = this.right.bind(cols); + return (idx: number, cols: Vector[]) => left(idx, cols) && right(idx, cols); + } + ands() : Predicate[] { return this.left.ands().concat(this.right.ands()); } +} + +class Or extends CombinationPredicate { + bind(cols: Vector[]) { + const left = this.left.bind(cols); + const right = this.right.bind(cols); + return (idx: number, cols: Vector[]) => left(idx, cols) || right(idx, cols); + } +} + +class Equals extends ComparisonPredicate { + protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v == right.v; + return () => rtrn; + } + + protected _bindColCol(cols: Vector[], left: Col , right: Col ): PredicateFunc { + const left_func = left.bind(cols); + const right_func = right.bind(cols); + return (idx: number, cols: Vector[]) => left_func(idx, cols) == right_func(idx, cols); + } + + protected _bindColLit(cols: Vector[], col: Col , lit: Literal ): PredicateFunc { + const col_func = col.bind(cols); + return (idx: number, cols: Vector[]) => col_func(idx, cols) == lit.v; + } +} + +class LTeq extends ComparisonPredicate { + protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v <= right.v; + return () => rtrn; + } + + protected _bindColCol(cols: Vector[], left: Col , right: Col ): PredicateFunc { + const left_func = left.bind(cols); + const right_func = right.bind(cols); + return (idx: number, cols: Vector[]) => left_func(idx, cols) <= right_func(idx, cols); + } + + protected _bindColLit(cols: Vector[], col: Col , lit: Literal ): PredicateFunc { + const col_func = col.bind(cols); + return (idx: number, cols: Vector[]) => col_func(idx, cols) <= lit.v; + } +} + +class GTeq extends ComparisonPredicate { + protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v >= right.v; + return () => rtrn; + } + + protected _bindColCol(cols: Vector[], left: Col, right: Col): PredicateFunc { + const left_func = left.bind(cols); + const right_func = right.bind(cols); + return (idx: number, cols: Vector[]) => left_func(idx, cols) >= right_func(idx, cols); + } + + protected _bindColLit(cols: Vector[], col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(cols); + return (idx: number, cols: Vector[]) => col_func(idx, cols) >= lit.v; + } + //eval(idx: number, cols: Vector[]) { + // return this.left.eval(idx, cols) >= this.right.eval(idx, cols); + //} + //emitString() { + // return `${this.left.emitString()} >= ${this.right.emitString()}` + //} + //createDictionaryEval(schema, lit: Literal, col: Col): (idx: number, cols: Vector[]) => boolean { + // return this.eval; + //} +} + +export function lit(n: number): Value { return new Literal(n); } +export function col(n: string): Value { return new Col(n); } From aa999f87f16456145c29ac7c3ea21e13a9bdf11b Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Wed, 10 Jan 2018 14:38:49 -0500 Subject: [PATCH 06/19] Add DictionaryVector optimization for equals predicate --- js/perf/table_config.js | 2 +- js/src/dataframe/predicate.ts | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/js/perf/table_config.js b/js/perf/table_config.js index 06c9198353b80..7bface6d2cdde 100644 --- a/js/perf/table_config.js +++ b/js/perf/table_config.js @@ -25,7 +25,7 @@ const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.a tests = [ {col: 0, test: 'gteq', value: 0 }, {col: 1, test: 'gteq', value: 0 }, - //{col: 2, test: 'eq', value: 'Seattle'}, + {col: 2, test: 'eq', value: 'Seattle'}, ] for (const filename of filenames) { diff --git a/js/src/dataframe/predicate.ts b/js/src/dataframe/predicate.ts index 4438c0adbaa98..263b8646d71fc 100644 --- a/js/src/dataframe/predicate.ts +++ b/js/src/dataframe/predicate.ts @@ -1,4 +1,5 @@ import { Vector } from "../vector/vector"; +import { DictionaryVector } from "../vector/dictionary"; export type ValueFunc = (idx: number, cols: Vector[]) => T|null; export type PredicateFunc = (idx: number, cols: Vector[]) => boolean; @@ -118,7 +119,30 @@ class Equals extends ComparisonPredicate { protected _bindColLit(cols: Vector[], col: Col , lit: Literal ): PredicateFunc { const col_func = col.bind(cols); - return (idx: number, cols: Vector[]) => col_func(idx, cols) == lit.v; + if (col.vector instanceof DictionaryVector) { + // Assume that there is only one key with the value `lit.v` + let key = -1 + for (; ++key < col.vector.data.length;) { + if (col.vector.data.get(key) === lit.v) { + break; + } + } + + if (key == col.vector.data.length) { + // the value doesn't exist in the dictionary - always return + // false + // TODO: special-case of PredicateFunc that encapsulates this + // "always false" behavior. That way filtering operations don't + // have to bother checking + return () => false; + } else { + return (idx: number) => { + return (col.vector as DictionaryVector).getKey(idx) === key; + } + } + } else { + return (idx: number, cols: Vector[]) => col_func(idx, cols) == lit.v; + } } } From 2744c63c7e699a2acead253d8b903bbe70409738 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Wed, 10 Jan 2018 16:20:44 -0500 Subject: [PATCH 07/19] Remove Chunked/Simple DataFrame distinction --- js/src/dataframe/dataframe.ts | 149 ++++++++++++++-------------------- 1 file changed, 61 insertions(+), 88 deletions(-) diff --git a/js/src/dataframe/dataframe.ts b/js/src/dataframe/dataframe.ts index 0dbb19bbbdd4e..88923effd6f77 100644 --- a/js/src/dataframe/dataframe.ts +++ b/js/src/dataframe/dataframe.ts @@ -6,81 +6,25 @@ import { Predicate } from "./predicate" export type NextFunc = (idx: number, cols: Vector[]) => void; -export abstract class DataFrame { - constructor(readonly lengths: Uint32Array) {} - public abstract columns: Vector[]; - public abstract getBatch(batch: number): Vector[]; - public abstract scan(next: NextFunc): void; - public filter(predicate: Predicate): DataFrame { - return new FilteredDataFrame(this, predicate); - } - - static from(table: Vector): DataFrame { - // There are two types of Vectors we might want to make into - // a ChunkedDataFrame: - // 1) a StructVector of all VirtualVectors - // 2) a VirtualVector of all StructVectors - if (table instanceof StructVector) { - if (table.columns.every((col) => col instanceof VirtualVector)) { - // ChunkedDataFrame case (1) - return new ChunkedDataFrame(table.columns as VirtualVector[]); - } else { - return new SimpleDataFrame(table.columns) - } - } else if (table instanceof VirtualVector && - table.vectors.every((vec) => vec instanceof StructVector)) { - const structs = table.vectors as StructVector[]; - const rest: StructVector[] = structs.slice(1); - const virtuals: VirtualVector[] = structs[0].columns.map((vec, col_idx) => { - return vec.concat(...rest.map((vec) => vec.columns[col_idx])); - }) as VirtualVector[]; - // ChunkedDataFrame case (2) - return new ChunkedDataFrame(virtuals); - } else { - return new SimpleDataFrame([table]); - } - } - - count(): number { - return this.lengths.reduce((acc, val) => acc + val); - } -} - -class SimpleDataFrame extends DataFrame { +export class DataFrame { readonly lengths: Uint32Array; - constructor(public columns: Vector[]) { - super(new Uint32Array([0, columns[0].length])); - if (!this.columns.slice(1).every((v) => v.length === this.columns[0].length)) { - throw new Error("Attempted to create a DataFrame with un-aligned vectors"); - } - } - - public getBatch() { - return this.columns; - } - - public scan(next: NextFunc) { - for (let idx = -1; ++idx < this.lengths[1];) { - next(idx, this.columns) - } - } - - *[Symbol.iterator]() { - for (let idx = -1; ++idx < this.lengths[1];) { - yield idx; - } - } -} - -class ChunkedDataFrame extends DataFrame { public columns: Vector[]; - constructor(private virtuals: VirtualVector[]) { - super(ChunkedDataFrame.getLengths(virtuals)); - this.virtuals = virtuals; + constructor(readonly batches: Vector[][]) { + // for each batch + this.lengths = new Uint32Array(batches.map((batch)=>{ + // verify that every vector has the same length, and return that + // length + // throw an error if the lengths don't match + return batch.reduce((length, col) => { + if (col.length !== length) + throw new Error("Attempted to create a DataFrame with un-aligned vectors"); + return length; + }, batch[0].length); + })); } - getBatch(batch: number): Vector[] { - return this.virtuals.map((virt) => virt.vectors[batch]); + public filter(predicate: Predicate): DataFrame { + return new FilteredDataFrame(this, predicate); } scan(next: NextFunc) { @@ -88,7 +32,7 @@ class ChunkedDataFrame extends DataFrame { const length = this.lengths[batch]; // load batches - const columns = this.getBatch(batch); + const columns = this.batches[batch]; // yield all indices for (let idx = -1; ++idx < length;) { @@ -97,12 +41,16 @@ class ChunkedDataFrame extends DataFrame { } } + count(): number { + return this.lengths.reduce((acc, val) => acc + val); + } + *[Symbol.iterator]() { for (let batch = -1; ++batch < this.lengths.length;) { const length = this.lengths[batch]; // load batches - this.columns = this.getBatch(batch); + this.columns = this.batches[batch]; // yield all indices for (let idx = -1; ++idx < length;) { @@ -111,34 +59,48 @@ class ChunkedDataFrame extends DataFrame { } } - private static getLengths(virtuals: VirtualVector[]): Uint32Array { - if (!virtuals.slice(1).every((v) => v.aligned(virtuals[0]))) { - throw new Error("Attempted to create a DataFrame with un-aligned vectors"); + static from(table: Vector): DataFrame { + if (table instanceof StructVector) { + const columns = table.columns; + if (isAligned(columns)) { + // StructVector of aligned VirtualVectors + // break up VirtualVectors into batches + const batches = columns[0].vectors.map((_,i) => { + return columns.map((vec: VirtualVector) => { + return vec.vectors[i]; + }); + }); + return new DataFrame(batches); + } else { + return new DataFrame([columns]); + } + } else if (table instanceof VirtualVector && + table.vectors.every((vec) => vec instanceof StructVector)) { + return new DataFrame(table.vectors.map((vec) => { + return (vec as StructVector).columns; + })); + } else { + return new DataFrame([[table]]); } - return new Uint32Array(virtuals[0].vectors.map((v)=>v.length)); } } class FilteredDataFrame extends DataFrame { public columns: Vector[]; constructor (readonly parent: DataFrame, private predicate: Predicate) { - super(parent.lengths); + super(parent.batches); } - getBatch(batch: number): Vector[] { - return this.parent.getBatch(batch); - }; - scan(next: NextFunc) { // inlined version of this: // this.parent.scan((idx, columns) => { // if (this.predicate(idx, columns)) next(idx, columns); // }); - for (let batch = -1; ++batch < this.parent.lengths.length;) { - const length = this.parent.lengths[batch]; + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; // load batches - const columns = this.parent.getBatch(batch); + const columns = this.batches[batch]; const predicate = this.predicate.bind(columns); // yield all indices @@ -156,11 +118,11 @@ class FilteredDataFrame extends DataFrame { // }); // return sum; let sum = 0; - for (let batch = -1; ++batch < this.parent.lengths.length;) { - const length = this.parent.lengths[batch]; + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; // load batches - const columns = this.parent.getBatch(batch); + const columns = this.batches[batch]; const predicate = this.predicate.bind(columns); // yield all indices @@ -178,3 +140,14 @@ class FilteredDataFrame extends DataFrame { ); } } + +function isAligned(columns: Vector[]): columns is VirtualVector[] { + if (columns.every((col) => col instanceof VirtualVector)) { + const virtuals = columns as VirtualVector[] + + return virtuals.slice(1).every((col) => { + return col.aligned(virtuals[0]); + }); + } + return false; +} From 6a41d6872c3fe47b0413c1266bdcf9339aa8bf73 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Thu, 11 Jan 2018 15:11:09 -0500 Subject: [PATCH 08/19] clean up table benchmarks --- js/package.json | 1 + js/perf/index.js | 199 +++++--------------------- js/perf/table_config.js | 20 ++- js/src/Arrow.externs.ts | 16 +++ js/{ => test/data/tables}/generate.py | 0 5 files changed, 63 insertions(+), 173 deletions(-) rename js/{ => test/data/tables}/generate.py (100%) diff --git a/js/package.json b/js/package.json index d68e7a6279e61..1f59ac1ef98d2 100644 --- a/js/package.json +++ b/js/package.json @@ -12,6 +12,7 @@ "clean": "gulp clean", "debug": "gulp debug", "perf": "node ./perf/index.js", + "create:perfdata": "python ./test/data/tables/generate.py ./test/data/tables/tracks.arrow", "release": "./npm-release.sh", "clean:all": "run-p clean clean:testdata", "clean:testdata": "gulp clean:testdata", diff --git a/js/perf/index.js b/js/perf/index.js index b5789e8b34c07..5ab3e76b24d7a 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,54 +16,42 @@ // under the License. // Use the ES5 UMD target as perf baseline -// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es5/umd'); -// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es5/cjs'); -// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es2015/umd'); -const { lit, col, DataFrame, Table, readVectors } = require('../targets/es2015/cjs'); +const { col, DataFrame, Table, readVectors } = require('../targets/es5/umd'); +// const { col, DataFrame, Table, readVectors } = require('../targets/es5/cjs'); +// const { col, DataFrame, Table, readVectors } = require('../targets/es2015/umd'); +// const { col, DataFrame, Table, readVectors } = require('../targets/es2015/cjs'); const config = require('./config'); const Benchmark = require('benchmark'); const suites = []; -//for (let { name, buffers} of config) { -// const parseSuite = new Benchmark.Suite(`Parse "${name}"`, { async: true }); -// const sliceSuite = new Benchmark.Suite(`Slice "${name}" vectors`, { async: true }); -// const iterateSuite = new Benchmark.Suite(`Iterate "${name}" vectors`, { async: true }); -// const getByIndexSuite = new Benchmark.Suite(`Get "${name}" values by index`, { async: true }); -// parseSuite.add(createFromTableTest(name, buffers)); -// parseSuite.add(createReadVectorsTest(name, buffers)); -// for (const vector of Table.from(buffers).columns) { -// sliceSuite.add(createSliceTest(vector)); -// iterateSuite.add(createIterateTest(vector)); -// getByIndexSuite.add(createGetByIndexTest(vector)); -// } -// suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); -//} +for (let { name, buffers} of config) { + const parseSuite = new Benchmark.Suite(`Parse "${name}"`, { async: true }); + const sliceSuite = new Benchmark.Suite(`Slice "${name}" vectors`, { async: true }); + const iterateSuite = new Benchmark.Suite(`Iterate "${name}" vectors`, { async: true }); + const getByIndexSuite = new Benchmark.Suite(`Get "${name}" values by index`, { async: true }); + parseSuite.add(createFromTableTest(name, buffers)); + parseSuite.add(createReadVectorsTest(name, buffers)); + for (const vector of Table.from(buffers).columns) { + sliceSuite.add(createSliceTest(vector)); + iterateSuite.add(createIterateTest(vector)); + getByIndexSuite.add(createGetByIndexTest(vector)); + } + suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); +} for (let {name, buffers, tests} of require('./table_config')) { - const tableIteratorSuite = new Benchmark.Suite(`Table Iterator "${name}"`, { async: true }); - const tableCountSuite = new Benchmark.Suite(`Table Count "${name}"`, { async: true }); - const dfIteratorSuite = new Benchmark.Suite(`DataFrame Iterator "${name}"`, { async: true }); - const dfIteratorCountSuite = new Benchmark.Suite(`DataFrame Iterator Count "${name}"`, { async: true }); + const dfFilterCountSuite = new Benchmark.Suite(`DataFrame Filter-Scan Count "${name}"`, { async: true }); const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true }); - const dfScanCountSuite = new Benchmark.Suite(`DataFrame Scan Count "${name}"`, { async: true }); - const dfFilterCountSuite = new Benchmark.Suite(`DataFrame Filter Scan Count "${name}"`, { async: true }); - const vectorCountSuite = new Benchmark.Suite(`Vector Count "${name}"`, { async: true }); const table = Table.from(buffers); - tableIteratorSuite.add(createTableIteratorTest(table)); - dfIteratorSuite.add(createDataFrameIteratorTest(table)); for (test of tests) { - tableCountSuite.add(createTableCountTest(table, test.col, test.test, test.value)) - dfIteratorCountSuite.add(createDataFrameIteratorCountTest(table, test.col, test.test, test.value)) - dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value)) - dfScanCountSuite.add(createDataFrameScanCountTest(table, test.col, test.test, test.value)) dfFilterCountSuite.add(createDataFrameFilterCountTest(table, test.col, test.test, test.value)) - vectorCountSuite.add(createVectorCountTest(table.columns[test.col], test.test, test.value)) + dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value)) } - suites.push(tableIteratorSuite, tableCountSuite, dfIteratorSuite, dfIteratorCountSuite, dfDirectCountSuite, dfScanCountSuite, dfFilterCountSuite, vectorCountSuite) + suites.push(dfFilterCountSuite, dfDirectCountSuite) } console.log('Running apache-arrow performance tests...\n'); @@ -135,81 +123,9 @@ function createGetByIndexTest(vector) { }; } -function createVectorCountTest(vector, test, value) { - let op; - if (test == 'gteq') { - op = function () { - sum = 0; - for (cell of vector) { - sum += (cell >= value) - } - } - } else if (test == 'eq') { - op = function () { - sum = 0; - for (cell of vector) { - sum += (cell == value) - } - } - } else { - throw new Error(`Unrecognized test "$test"`); - } - - return { - async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}, test: ${test}, value: ${value}`, - fn: op - }; -} - -function createTableIteratorTest(table) { - let row; - return { - async: true, - name: `length: ${table.length}`, - fn() { for (row of table) {} } - }; -} - -function createTableCountTest(table, column, test, value) { - let op; - if (test == 'gteq') { - op = function () { - sum = 0; - for (row of table) { - sum += (row.get(column) >= value) - } - } - } else if (test == 'eq') { - op = function() { - sum = 0; - for (row of table) { - sum += (row.get(column) == value) - } - } - } else { - throw new Error(`Unrecognized test "${test}"`); - } - - return { - async: true, - name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, - fn: op - }; -} - -function createDataFrameIteratorTest(table) { - let df = DataFrame.from(table); - let idx; - return { - async: true, - name: `length: ${table.length}`, - fn() { for (idx of table) {} } - }; -} - function createDataFrameDirectCountTest(table, column, test, value) { let df = DataFrame.from(table); + let colidx = table.columns.findIndex((c)=>c.name === column); if (test == 'gteq') { op = function () { @@ -218,11 +134,11 @@ function createDataFrameDirectCountTest(table, column, test, value) { const length = df.lengths[batch]; // load batches - const columns = df.getBatch(batch); + const columns = df.batches[batch]; // yield all indices for (let idx = -1; ++idx < length;) { - sum += (columns[column].get(idx) >= value); + sum += (columns[colidx].get(idx) >= value); } } } @@ -233,11 +149,11 @@ function createDataFrameDirectCountTest(table, column, test, value) { const length = df.lengths[batch]; // load batches - const columns = df.getBatch(batch); + const columns = df.batches[batch] // yield all indices for (let idx = -1; ++idx < length;) { - sum += (columns[column].get(idx) == value); + sum += (columns[colidx].get(idx) == value); } } } @@ -247,79 +163,28 @@ function createDataFrameDirectCountTest(table, column, test, value) { return { async: true, - name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, - fn: op - }; -} - -function createDataFrameScanCountTest(table, column, test, value) { - let df = DataFrame.from(table); - - if (test == 'gteq') { - op = function () { - sum = 0; - df.scan((idx, cols)=>{sum += cols[column].get(idx) >= value}); - } - } else if (test == 'eq') { - op = function() { - sum = 0; - df.scan((idx, cols)=>{sum += cols[column].get(idx) == value}); - console.log(sum); - } - } else { - throw new Error(`Unrecognized test "${test}"`); - } - - return { - async: true, - name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, + name: `name: '${column}', length: ${table.length}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}`, fn: op }; } function createDataFrameFilterCountTest(table, column, test, value) { let df = DataFrame.from(table); + let colidx = table.columns.findIndex((c)=>c.name === column); + if (test == 'gteq') { - df = df.filter(col(table.columns[column].name).gteq(value)); + df = df.filter(col(column).gteq(value)); } else if (test == 'eq') { - df = df.filter(col(table.columns[column].name).eq(value)); + df = df.filter(col(column).eq(value)); } else { throw new Error(`Unrecognized test "${test}"`); } return { async: true, - name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, + name: `name: '${column}', length: ${table.length}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}`, fn() { df.count(); } }; } - -function createDataFrameIteratorCountTest(table, column, test, value) { - let df = DataFrame.from(table); - - if (test == 'gteq') { - op = function () { - sum = 0; - for (idx of df) { - sum += (df.columns[column].get(idx) >= value); - } - } - } else if (test == 'eq') { - op = function() { - sum = 0; - for (idx of df) { - sum += (df.columns[column].get(idx) == value); - } - } - } else { - throw new Error(`Unrecognized test "${test}"`); - } - - return { - async: true, - name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`, - fn: op - }; -} diff --git a/js/perf/table_config.js b/js/perf/table_config.js index 7bface6d2cdde..3c045e4571e44 100644 --- a/js/perf/table_config.js +++ b/js/perf/table_config.js @@ -22,15 +22,23 @@ const glob = require('glob'); const config = []; const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`)); -tests = [ - {col: 0, test: 'gteq', value: 0 }, - {col: 1, test: 'gteq', value: 0 }, - {col: 2, test: 'eq', value: 'Seattle'}, -] +tests = { + "tracks": [ + {col: 'lat', test: 'gteq', value: 0 }, + {col: 'lng', test: 'gteq', value: 0 }, + {col: 'origin', test: 'eq', value: 'Seattle'}, + ] +} for (const filename of filenames) { const { name } = path.parse(filename); - config.push({ name, buffers: [fs.readFileSync(filename)], tests }); + if (name in tests) { + config.push({ + name, + buffers: [fs.readFileSync(filename)], + tests: tests[name] + }); + } } module.exports = config; diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts index c23930271183d..d3bfdbbf8e123 100644 --- a/js/src/Arrow.externs.ts +++ b/js/src/Arrow.externs.ts @@ -82,3 +82,19 @@ let DictionaryVector = function() {}; DictionaryVector.prototype.getKey; /** @type {?} */ DictionaryVector.prototype.getValue; + +let DataFrame = function () {}; +/** @type {?} */ +DataFrame.prototype.lengths; +/** @type {?} */ +DataFrame.prototype.columns; +/** @type {?} */ +DataFrame.prototype.batches; + +let Col = function() {}; +/** @type {?} */ +Col.prototype.gteq; +/** @type {?} */ +Col.prototype.lteq; +/** @type {?} */ +Col.prototype.eq; diff --git a/js/generate.py b/js/test/data/tables/generate.py similarity index 100% rename from js/generate.py rename to js/test/data/tables/generate.py From e8979ba5e44bfc3e66befec8a74368b6d3a08416 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Thu, 11 Jan 2018 15:12:18 -0500 Subject: [PATCH 09/19] Refactor DataFrame to extend Vector --- js/src/dataframe/dataframe.ts | 50 ++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/js/src/dataframe/dataframe.ts b/js/src/dataframe/dataframe.ts index 88923effd6f77..c8db286f28eb4 100644 --- a/js/src/dataframe/dataframe.ts +++ b/js/src/dataframe/dataframe.ts @@ -1,15 +1,29 @@ import { Vector } from "../vector/vector"; -import { StructVector } from "../vector/struct"; +import { StructVector, StructRow } from "../vector/struct"; import { VirtualVector } from "../vector/virtual"; import { Predicate } from "./predicate" export type NextFunc = (idx: number, cols: Vector[]) => void; -export class DataFrame { +export class DataFrameRow extends StructRow { + constructor (batches: Vector[], idx: number) { + super(new StructVector({columns: batches}), idx); + } +} + +export interface DataFrameOps { + readonly batches: Vector[][]; + readonly lengths: Uint32Array; + filter(predicate: Predicate): DataFrameOps; + scan(next: NextFunc): void; + count(): number; +} + +export class DataFrame extends Vector implements DataFrameOps { readonly lengths: Uint32Array; - public columns: Vector[]; - constructor(readonly batches: Vector[][]) { + constructor(readonly batches: Vector[][]) { + super(); // for each batch this.lengths = new Uint32Array(batches.map((batch)=>{ // verify that every vector has the same length, and return that @@ -23,7 +37,17 @@ export class DataFrame { })); } - public filter(predicate: Predicate): DataFrame { + get(idx: number): DataFrameRow|null { + let batch = 0; + while (idx > this.lengths[batch] && batch < this.lengths.length) + idx -= this.lengths[batch++]; + + if (batch === this.lengths.length) return null; + + else return new DataFrameRow(this.batches[batch], idx); + } + + filter(predicate: Predicate): DataFrameOps { return new FilteredDataFrame(this, predicate); } @@ -50,11 +74,11 @@ export class DataFrame { const length = this.lengths[batch]; // load batches - this.columns = this.batches[batch]; + const columns = this.batches[batch]; // yield all indices for (let idx = -1; ++idx < length;) { - yield idx; + yield new DataFrameRow(columns, idx); } } } @@ -85,10 +109,12 @@ export class DataFrame { } } -class FilteredDataFrame extends DataFrame { - public columns: Vector[]; - constructor (readonly parent: DataFrame, private predicate: Predicate) { - super(parent.batches); +class FilteredDataFrame implements DataFrameOps { + readonly lengths: Uint32Array; + readonly batches: Vector[][]; + constructor (readonly parent: DataFrameOps, private predicate: Predicate) { + this.batches = parent.batches; + this.lengths = parent.lengths; } scan(next: NextFunc) { @@ -133,7 +159,7 @@ class FilteredDataFrame extends DataFrame { return sum; } - filter(predicate: Predicate): DataFrame { + filter(predicate: Predicate): DataFrameOps { return new FilteredDataFrame( this.parent, this.predicate.and(predicate) From 1d60aa1436e81d5d67b3b16a8f6f2c5df7e57189 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Thu, 11 Jan 2018 16:07:55 -0500 Subject: [PATCH 10/19] Moved DataFrame ops to Table. DataFrame is now an interface --- js/perf/index.js | 19 ++- js/src/Arrow.externs.ts | 12 +- js/src/Arrow.ts | 8 +- js/src/bin/arrow2csv.ts | 2 +- js/src/dataframe/dataframe.ts | 179 ---------------------- js/src/{dataframe => vector}/predicate.ts | 0 js/src/vector/table.ts | 164 +++++++++++++++++--- 7 files changed, 158 insertions(+), 226 deletions(-) delete mode 100644 js/src/dataframe/dataframe.ts rename js/src/{dataframe => vector}/predicate.ts (100%) diff --git a/js/perf/index.js b/js/perf/index.js index 5ab3e76b24d7a..9527a8e842c5a 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -124,17 +124,16 @@ function createGetByIndexTest(vector) { } function createDataFrameDirectCountTest(table, column, test, value) { - let df = DataFrame.from(table); let colidx = table.columns.findIndex((c)=>c.name === column); if (test == 'gteq') { op = function () { sum = 0; - for (let batch = -1; ++batch < df.lengths.length;) { - const length = df.lengths[batch]; + for (let batch = -1; ++batch < table.lengths.length;) { + const length = table.lengths[batch]; // load batches - const columns = df.batches[batch]; + const columns = table.batches[batch]; // yield all indices for (let idx = -1; ++idx < length;) { @@ -145,11 +144,11 @@ function createDataFrameDirectCountTest(table, column, test, value) { } else if (test == 'eq') { op = function() { sum = 0; - for (let batch = -1; ++batch < df.lengths.length;) { - const length = df.lengths[batch]; + for (let batch = -1; ++batch < table.lengths.length;) { + const length = table.lengths[batch]; // load batches - const columns = df.batches[batch] + const columns = table.batches[batch] // yield all indices for (let idx = -1; ++idx < length;) { @@ -169,13 +168,13 @@ function createDataFrameDirectCountTest(table, column, test, value) { } function createDataFrameFilterCountTest(table, column, test, value) { - let df = DataFrame.from(table); let colidx = table.columns.findIndex((c)=>c.name === column); + let df; if (test == 'gteq') { - df = df.filter(col(column).gteq(value)); + df = table.filter(col(column).gteq(value)); } else if (test == 'eq') { - df = df.filter(col(column).eq(value)); + df = table.filter(col(column).eq(value)); } else { throw new Error(`Unrecognized test "${test}"`); } diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts index d3bfdbbf8e123..0685d262cc186 100644 --- a/js/src/Arrow.externs.ts +++ b/js/src/Arrow.externs.ts @@ -50,6 +50,10 @@ Table.prototype.key; Table.prototype.select; /** @type {?} */ Table.prototype.toString; +/** @type {?} */ +Table.prototype.lengths; +/** @type {?} */ +Table.prototype.batches; let Vector = function() {}; /** @type {?} */ @@ -83,14 +87,6 @@ DictionaryVector.prototype.getKey; /** @type {?} */ DictionaryVector.prototype.getValue; -let DataFrame = function () {}; -/** @type {?} */ -DataFrame.prototype.lengths; -/** @type {?} */ -DataFrame.prototype.columns; -/** @type {?} */ -DataFrame.prototype.batches; - let Col = function() {}; /** @type {?} */ Col.prototype.gteq; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index ce7235b8b13d4..d80cfed4864f8 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -45,8 +45,7 @@ import { TimestampVector, } from './vector/numeric'; -import { DataFrame } from './dataframe/dataframe'; -import { lit, col } from './dataframe/predicate'; +import { lit, col } from './vector/predicate'; // closure compiler always erases static method names: // https://github.com/google/closure-compiler/issues/1776 @@ -54,7 +53,6 @@ import { lit, col } from './dataframe/predicate'; Table['from'] = Table.from; Table['fromAsync'] = Table.fromAsync; BoolVector['pack'] = BoolVector.pack; -DataFrame['from'] = DataFrame.from; export { read, readAsync }; export { Table, Vector, StructRow }; @@ -88,8 +86,7 @@ export { FixedSizeListVector, }; -export { DataFrame } from './dataframe/dataframe'; -export { lit, col } from './dataframe/predicate'; +export { lit, col } from './vector/predicate'; /* These exports are needed for the closure umd targets */ @@ -103,7 +100,6 @@ try { Arrow['readAsync'] = readAsync; Arrow['Table'] = Table; Arrow['Vector'] = Vector; - Arrow['DataFrame'] = DataFrame; Arrow['StructRow'] = StructRow; Arrow['BoolVector'] = BoolVector; Arrow['ListVector'] = ListVector; diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 01ef0b848ce75..117d417f1b4fa 100644 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -97,7 +97,7 @@ files.forEach((source) => { printTable(table); }); -function printTable(table: Arrow.Table) { +function printTable(table: Arrow.Table) { let header = [...table.columns.map((_, i) => table.key(i))].map(stringify); let maxColumnWidths = header.map(x => x.length); // Pass one to convert to strings and count max column widths diff --git a/js/src/dataframe/dataframe.ts b/js/src/dataframe/dataframe.ts deleted file mode 100644 index c8db286f28eb4..0000000000000 --- a/js/src/dataframe/dataframe.ts +++ /dev/null @@ -1,179 +0,0 @@ -import { Vector } from "../vector/vector"; -import { StructVector, StructRow } from "../vector/struct"; -import { VirtualVector } from "../vector/virtual"; - -import { Predicate } from "./predicate" - -export type NextFunc = (idx: number, cols: Vector[]) => void; - -export class DataFrameRow extends StructRow { - constructor (batches: Vector[], idx: number) { - super(new StructVector({columns: batches}), idx); - } -} - -export interface DataFrameOps { - readonly batches: Vector[][]; - readonly lengths: Uint32Array; - filter(predicate: Predicate): DataFrameOps; - scan(next: NextFunc): void; - count(): number; -} - -export class DataFrame extends Vector implements DataFrameOps { - readonly lengths: Uint32Array; - constructor(readonly batches: Vector[][]) { - super(); - // for each batch - this.lengths = new Uint32Array(batches.map((batch)=>{ - // verify that every vector has the same length, and return that - // length - // throw an error if the lengths don't match - return batch.reduce((length, col) => { - if (col.length !== length) - throw new Error("Attempted to create a DataFrame with un-aligned vectors"); - return length; - }, batch[0].length); - })); - } - - get(idx: number): DataFrameRow|null { - let batch = 0; - while (idx > this.lengths[batch] && batch < this.lengths.length) - idx -= this.lengths[batch++]; - - if (batch === this.lengths.length) return null; - - else return new DataFrameRow(this.batches[batch], idx); - } - - filter(predicate: Predicate): DataFrameOps { - return new FilteredDataFrame(this, predicate); - } - - scan(next: NextFunc) { - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; - - // load batches - const columns = this.batches[batch]; - - // yield all indices - for (let idx = -1; ++idx < length;) { - next(idx, columns) - } - } - } - - count(): number { - return this.lengths.reduce((acc, val) => acc + val); - } - - *[Symbol.iterator]() { - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; - - // load batches - const columns = this.batches[batch]; - - // yield all indices - for (let idx = -1; ++idx < length;) { - yield new DataFrameRow(columns, idx); - } - } - } - - static from(table: Vector): DataFrame { - if (table instanceof StructVector) { - const columns = table.columns; - if (isAligned(columns)) { - // StructVector of aligned VirtualVectors - // break up VirtualVectors into batches - const batches = columns[0].vectors.map((_,i) => { - return columns.map((vec: VirtualVector) => { - return vec.vectors[i]; - }); - }); - return new DataFrame(batches); - } else { - return new DataFrame([columns]); - } - } else if (table instanceof VirtualVector && - table.vectors.every((vec) => vec instanceof StructVector)) { - return new DataFrame(table.vectors.map((vec) => { - return (vec as StructVector).columns; - })); - } else { - return new DataFrame([[table]]); - } - } -} - -class FilteredDataFrame implements DataFrameOps { - readonly lengths: Uint32Array; - readonly batches: Vector[][]; - constructor (readonly parent: DataFrameOps, private predicate: Predicate) { - this.batches = parent.batches; - this.lengths = parent.lengths; - } - - scan(next: NextFunc) { - // inlined version of this: - // this.parent.scan((idx, columns) => { - // if (this.predicate(idx, columns)) next(idx, columns); - // }); - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; - - // load batches - const columns = this.batches[batch]; - const predicate = this.predicate.bind(columns); - - // yield all indices - for (let idx = -1; ++idx < length;) { - if (predicate(idx, columns)) next(idx, columns); - } - } - } - - count(): number { - // inlined version of this: - // let sum = 0; - // this.parent.scan((idx, columns) => { - // if (this.predicate(idx, columns)) ++sum; - // }); - // return sum; - let sum = 0; - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; - - // load batches - const columns = this.batches[batch]; - const predicate = this.predicate.bind(columns); - - // yield all indices - for (let idx = -1; ++idx < length;) { - if (predicate(idx, columns)) ++sum; - } - } - return sum; - } - - filter(predicate: Predicate): DataFrameOps { - return new FilteredDataFrame( - this.parent, - this.predicate.and(predicate) - ); - } -} - -function isAligned(columns: Vector[]): columns is VirtualVector[] { - if (columns.every((col) => col instanceof VirtualVector)) { - const virtuals = columns as VirtualVector[] - - return virtuals.slice(1).every((col) => { - return col.aligned(virtuals[0]); - }); - } - return false; -} diff --git a/js/src/dataframe/predicate.ts b/js/src/vector/predicate.ts similarity index 100% rename from js/src/dataframe/predicate.ts rename to js/src/vector/predicate.ts diff --git a/js/src/vector/table.ts b/js/src/vector/table.ts index ca2b66a22da80..e81fe16a94ae8 100644 --- a/js/src/vector/table.ts +++ b/js/src/vector/table.ts @@ -18,44 +18,164 @@ import { Vector } from './vector'; import { StructVector, StructRow } from './struct'; import { read, readAsync } from '../reader/arrow'; +import { Predicate } from './predicate'; -function concatVectors(tableVectors: Vector[], batchVectors: Vector[]) { - return tableVectors.length === 0 ? batchVectors : batchVectors.map((vec, i, _vs, col = tableVectors[i]) => - vec && col && col.concat(vec) || col || vec - ) as Vector[]; +export type NextFunc = (idx: number, cols: Vector[]) => void; + +export class DataFrameRow extends StructRow { + constructor (batch: Vector[], idx: number) { + super(new StructVector({columns: batch}), idx); + } + toString() { + return this.toArray().map((x) => JSON.stringify(x)).join(', '); + } } -export class Table extends StructVector { +export interface DataFrame { + readonly batches: Vector[][]; + readonly lengths: Uint32Array; + filter(predicate: Predicate): DataFrame; + scan(next: NextFunc): void; + count(): number; +} + +function columnsFromBatches(batches: Vector[][]) { + const remaining = batches.slice(1); + return batches[0].map((vec, colidx) => + vec.concat(...remaining.map((batch) => batch[colidx])) + ); +} + +export class Table extends StructVector implements DataFrame { static from(sources?: Iterable | object | string) { - let columns: Vector[] = []; + let batches: Vector[][] = [[]]; if (sources) { - for (let vectors of read(sources)) { - columns = concatVectors(columns, vectors); - } + batches = Array.from(read(sources)); } - return new Table({ columns }); + return new Table({ batches }); } static async fromAsync(sources?: AsyncIterable) { - let columns: Vector[] = []; + let batches: Vector[][] = [[]]; if (sources) { - for await (let vectors of readAsync(sources)) { - columns = columns = concatVectors(columns, vectors); + batches = []; + for await (let batch of readAsync(sources)) { + batches.push(batch); } } - return new Table({ columns }); + return new Table({ batches }); } + + // VirtualVector of each column, spanning batches + readonly columns: Vector[]; + + // List of batches, where each batch is a list of Vectors + readonly batches: Vector[][]; + readonly lengths: Uint32Array; readonly length: number; - constructor(argv: { columns: Vector[] }) { - super(argv); - this.length = Math.max(...this.columns.map((col) => col.length)) | 0; + constructor(argv: { batches: Vector[][] }) { + super({columns: columnsFromBatches(argv.batches)}); + this.batches = argv.batches; + this.lengths = new Uint32Array(this.batches.map((batch) => batch[0].length)); + + this.length = this.lengths.reduce((acc, length) => acc + length); + } + get(idx: number): DataFrameRow { + let batch = 0; + while (idx > this.lengths[batch] && batch < this.lengths.length) + idx -= this.lengths[batch++]; + + if (batch === this.lengths.length) throw new Error("Overflow") + + else return new DataFrameRow(this.batches[batch], idx); + } + filter(predicate: Predicate): DataFrame { + return new FilteredDataFrame(this, predicate); + } + scan(next: NextFunc) { + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; + + // load batches + const columns = this.batches[batch]; + + // yield all indices + for (let idx = -1; ++idx < length;) { + next(idx, columns) + } + } } - get(index: number): TableRow { - return new TableRow(this, index); + count(): number { + return this.lengths.reduce((acc, val) => acc + val); + } + *[Symbol.iterator]() { + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; + + // load batches + const columns = this.batches[batch]; + + // yield all indices + for (let idx = -1; ++idx < length;) { + yield new DataFrameRow(columns, idx); + } + } } } -export class TableRow extends StructRow { - toString() { - return this.toArray().map((x) => JSON.stringify(x)).join(', '); +class FilteredDataFrame implements DataFrame { + readonly lengths: Uint32Array; + readonly batches: Vector[][]; + constructor (readonly parent: DataFrame, private predicate: Predicate) { + this.batches = parent.batches; + this.lengths = parent.lengths; + } + + scan(next: NextFunc) { + // inlined version of this: + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) next(idx, columns); + // }); + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; + + // load batches + const columns = this.batches[batch]; + const predicate = this.predicate.bind(columns); + + // yield all indices + for (let idx = -1; ++idx < length;) { + if (predicate(idx, columns)) next(idx, columns); + } + } + } + + count(): number { + // inlined version of this: + // let sum = 0; + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) ++sum; + // }); + // return sum; + let sum = 0; + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; + + // load batches + const columns = this.batches[batch]; + const predicate = this.predicate.bind(columns); + + // yield all indices + for (let idx = -1; ++idx < length;) { + if (predicate(idx, columns)) ++sum; + } + } + return sum; + } + + filter(predicate: Predicate): DataFrame { + return new FilteredDataFrame( + this.parent, + this.predicate.and(predicate) + ); } } From a9fff89040b0eba16fb0f0590854aeec0a2eee35 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Thu, 11 Jan 2018 16:50:43 -0500 Subject: [PATCH 11/19] Move Table out of the Vector hierarchy --- js/src/Arrow.ts | 10 +++++----- js/src/bin/arrow2csv.ts | 4 ++-- js/src/{vector => }/predicate.ts | 4 ++-- js/src/{vector => }/table.ts | 27 ++++++++++++++++----------- 4 files changed, 25 insertions(+), 20 deletions(-) rename js/src/{vector => }/predicate.ts (98%) rename js/src/{vector => }/table.ts (89%) diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index d80cfed4864f8..ed9ff577bb1e5 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -import { Table } from './vector/table'; +import { Table, TableRow } from './table'; +import { lit, col } from './predicate'; import { Vector } from './vector/vector'; import { Utf8Vector } from './vector/utf8'; import { DictionaryVector } from './vector/dictionary'; @@ -45,8 +46,6 @@ import { TimestampVector, } from './vector/numeric'; -import { lit, col } from './vector/predicate'; - // closure compiler always erases static method names: // https://github.com/google/closure-compiler/issues/1776 // set them via string indexers to save them from the mangler @@ -55,7 +54,9 @@ Table['fromAsync'] = Table.fromAsync; BoolVector['pack'] = BoolVector.pack; export { read, readAsync }; -export { Table, Vector, StructRow }; +export { Table, TableRow }; +export { lit, col }; +export { Vector, StructRow }; export { Uint64, Int64, Int128 }; export { NumericVectorConstructor } from './vector/numeric'; export { List, TypedArray, TypedArrayConstructor } from './vector/types'; @@ -86,7 +87,6 @@ export { FixedSizeListVector, }; -export { lit, col } from './vector/predicate'; /* These exports are needed for the closure umd targets */ diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 117d417f1b4fa..2bc1600a8408d 100644 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -98,7 +98,7 @@ files.forEach((source) => { }); function printTable(table: Arrow.Table) { - let header = [...table.columns.map((_, i) => table.key(i))].map(stringify); + let header = [...table.columns.map((c) => c.name)].map(stringify); let maxColumnWidths = header.map(x => x.length); // Pass one to convert to strings and count max column widths for (let i = -1, n = table.length - 1; ++i < n;) { @@ -132,4 +132,4 @@ function stringify(x: any) { : `${x}`; } -})(); \ No newline at end of file +})(); diff --git a/js/src/vector/predicate.ts b/js/src/predicate.ts similarity index 98% rename from js/src/vector/predicate.ts rename to js/src/predicate.ts index 263b8646d71fc..c2be4db75750b 100644 --- a/js/src/vector/predicate.ts +++ b/js/src/predicate.ts @@ -1,5 +1,5 @@ -import { Vector } from "../vector/vector"; -import { DictionaryVector } from "../vector/dictionary"; +import { Vector } from "./vector/vector"; +import { DictionaryVector } from "./vector/dictionary"; export type ValueFunc = (idx: number, cols: Vector[]) => T|null; export type PredicateFunc = (idx: number, cols: Vector[]) => boolean; diff --git a/js/src/vector/table.ts b/js/src/table.ts similarity index 89% rename from js/src/vector/table.ts rename to js/src/table.ts index e81fe16a94ae8..4ab34192376f8 100644 --- a/js/src/vector/table.ts +++ b/js/src/table.ts @@ -15,20 +15,25 @@ // specific language governing permissions and limitations // under the License. -import { Vector } from './vector'; -import { StructVector, StructRow } from './struct'; -import { read, readAsync } from '../reader/arrow'; +import { Vector } from './vector/vector'; +import { read, readAsync } from './reader/arrow'; import { Predicate } from './predicate'; export type NextFunc = (idx: number, cols: Vector[]) => void; -export class DataFrameRow extends StructRow { - constructor (batch: Vector[], idx: number) { - super(new StructVector({columns: batch}), idx); +export class TableRow { + constructor (readonly batch: Vector[], readonly idx: number) {} + toArray() { + return this.batch.map((vec) => vec.get(this.idx)); } toString() { return this.toArray().map((x) => JSON.stringify(x)).join(', '); } + *[Symbol.iterator]() { + for (const vec of this.batch) { + yield vec.get(this.idx); + } + } } export interface DataFrame { @@ -46,7 +51,7 @@ function columnsFromBatches(batches: Vector[][]) { ); } -export class Table extends StructVector implements DataFrame { +export class Table implements DataFrame { static from(sources?: Iterable | object | string) { let batches: Vector[][] = [[]]; if (sources) { @@ -73,20 +78,20 @@ export class Table extends StructVector implements DataFrame { readonly lengths: Uint32Array; readonly length: number; constructor(argv: { batches: Vector[][] }) { - super({columns: columnsFromBatches(argv.batches)}); this.batches = argv.batches; + this.columns = columnsFromBatches(this.batches); this.lengths = new Uint32Array(this.batches.map((batch) => batch[0].length)); this.length = this.lengths.reduce((acc, length) => acc + length); } - get(idx: number): DataFrameRow { + get(idx: number): TableRow { let batch = 0; while (idx > this.lengths[batch] && batch < this.lengths.length) idx -= this.lengths[batch++]; if (batch === this.lengths.length) throw new Error("Overflow") - else return new DataFrameRow(this.batches[batch], idx); + else return new TableRow(this.batches[batch], idx); } filter(predicate: Predicate): DataFrame { return new FilteredDataFrame(this, predicate); @@ -116,7 +121,7 @@ export class Table extends StructVector implements DataFrame { // yield all indices for (let idx = -1; ++idx < length;) { - yield new DataFrameRow(columns, idx); + yield new TableRow(columns, idx); } } } From a788db315cf6410bd64fc079325696c0324b45bc Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Thu, 11 Jan 2018 16:55:50 -0500 Subject: [PATCH 12/19] Cleanup --- js/perf/index.js | 8 ++++---- js/src/predicate.ts | 9 --------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/js/perf/index.js b/js/perf/index.js index 9527a8e842c5a..0be4db3084dbf 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,10 +16,10 @@ // under the License. // Use the ES5 UMD target as perf baseline -const { col, DataFrame, Table, readVectors } = require('../targets/es5/umd'); -// const { col, DataFrame, Table, readVectors } = require('../targets/es5/cjs'); -// const { col, DataFrame, Table, readVectors } = require('../targets/es2015/umd'); -// const { col, DataFrame, Table, readVectors } = require('../targets/es2015/cjs'); +// const { col, Table, readVectors } = require('../targets/es5/umd'); +// const { col, Table, readVectors } = require('../targets/es5/cjs'); +// const { col, Table, readVectors } = require('../targets/es2015/umd'); +const { col, Table, readVectors } = require('../targets/es2015/cjs'); const config = require('./config'); const Benchmark = require('benchmark'); diff --git a/js/src/predicate.ts b/js/src/predicate.ts index c2be4db75750b..dbfc7479ffbb0 100644 --- a/js/src/predicate.ts +++ b/js/src/predicate.ts @@ -180,15 +180,6 @@ class GTeq extends ComparisonPredicate { const col_func = col.bind(cols); return (idx: number, cols: Vector[]) => col_func(idx, cols) >= lit.v; } - //eval(idx: number, cols: Vector[]) { - // return this.left.eval(idx, cols) >= this.right.eval(idx, cols); - //} - //emitString() { - // return `${this.left.emitString()} >= ${this.right.emitString()}` - //} - //createDictionaryEval(schema, lit: Literal, col: Col): (idx: number, cols: Vector[]) => boolean { - // return this.eval; - //} } export function lit(n: number): Value { return new Literal(n); } From 2e118aba789afb40d6b596625e89cd008773999f Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Thu, 11 Jan 2018 17:02:44 -0500 Subject: [PATCH 13/19] linter --- js/src/Arrow.ts | 2 -- js/src/predicate.ts | 26 +++++++++++++------------- js/src/table.ts | 13 +++++++------ 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index ed9ff577bb1e5..926ee88720bf0 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -87,8 +87,6 @@ export { FixedSizeListVector, }; - - /* These exports are needed for the closure umd targets */ try { const Arrow = eval('exports'); diff --git a/js/src/predicate.ts b/js/src/predicate.ts index dbfc7479ffbb0..2b0be44d472dc 100644 --- a/js/src/predicate.ts +++ b/js/src/predicate.ts @@ -1,29 +1,29 @@ -import { Vector } from "./vector/vector"; -import { DictionaryVector } from "./vector/dictionary"; +import { Vector } from './vector/vector'; +import { DictionaryVector } from './vector/dictionary'; export type ValueFunc = (idx: number, cols: Vector[]) => T|null; export type PredicateFunc = (idx: number, cols: Vector[]) => boolean; export abstract class Value { eq(other: Value|T): Predicate { - if (!(other instanceof Value)) other = new Literal(other); + if (!(other instanceof Value)) { other = new Literal(other); } return new Equals(this, other); } lteq(other: Value|T): Predicate { - if (!(other instanceof Value)) other = new Literal(other); + if (!(other instanceof Value)) { other = new Literal(other); } return new LTeq(this, other); } gteq(other: Value|T): Predicate { - if (!(other instanceof Value)) other = new Literal(other); + if (!(other instanceof Value)) { other = new Literal(other); } return new GTeq(this, other); } } -class Literal extends Value { +class Literal extends Value { constructor(public v: T) { super(); } } -class Col extends Value { +class Col extends Value { vector: Vector; colidx: number; @@ -39,9 +39,9 @@ class Col extends Value { break; } } - if (this.colidx < 0) throw new Error(`Failed to bind Col "${this.name}"`) + if (this.colidx < 0) { throw new Error(`Failed to bind Col "${this.name}"`); } } - this.vector = cols[this.colidx] + this.vector = cols[this.colidx]; return this.vector.get.bind(this.vector); } @@ -55,7 +55,7 @@ export abstract class Predicate { ands(): Predicate[] { return [this]; } } -abstract class ComparisonPredicate extends Predicate { +abstract class ComparisonPredicate extends Predicate { constructor(public readonly left: Value, public readonly right: Value) { super(); } @@ -94,7 +94,7 @@ class And extends CombinationPredicate { const right = this.right.bind(cols); return (idx: number, cols: Vector[]) => left(idx, cols) && right(idx, cols); } - ands() : Predicate[] { return this.left.ands().concat(this.right.ands()); } + ands(): Predicate[] { return this.left.ands().concat(this.right.ands()); } } class Or extends CombinationPredicate { @@ -121,7 +121,7 @@ class Equals extends ComparisonPredicate { const col_func = col.bind(cols); if (col.vector instanceof DictionaryVector) { // Assume that there is only one key with the value `lit.v` - let key = -1 + let key = -1; for (; ++key < col.vector.data.length;) { if (col.vector.data.get(key) === lit.v) { break; @@ -138,7 +138,7 @@ class Equals extends ComparisonPredicate { } else { return (idx: number) => { return (col.vector as DictionaryVector).getKey(idx) === key; - } + }; } } else { return (idx: number, cols: Vector[]) => col_func(idx, cols) == lit.v; diff --git a/js/src/table.ts b/js/src/table.ts index 4ab34192376f8..613699f0b66b1 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -86,12 +86,13 @@ export class Table implements DataFrame { } get(idx: number): TableRow { let batch = 0; - while (idx > this.lengths[batch] && batch < this.lengths.length) + while (idx > this.lengths[batch] && batch < this.lengths.length) { idx -= this.lengths[batch++]; + } - if (batch === this.lengths.length) throw new Error("Overflow") + if (batch === this.lengths.length) { throw new Error('Overflow'); } - else return new TableRow(this.batches[batch], idx); + return new TableRow(this.batches[batch], idx); } filter(predicate: Predicate): DataFrame { return new FilteredDataFrame(this, predicate); @@ -105,7 +106,7 @@ export class Table implements DataFrame { // yield all indices for (let idx = -1; ++idx < length;) { - next(idx, columns) + next(idx, columns); } } } @@ -149,7 +150,7 @@ class FilteredDataFrame implements DataFrame { // yield all indices for (let idx = -1; ++idx < length;) { - if (predicate(idx, columns)) next(idx, columns); + if (predicate(idx, columns)) { next(idx, columns); } } } } @@ -171,7 +172,7 @@ class FilteredDataFrame implements DataFrame { // yield all indices for (let idx = -1; ++idx < length;) { - if (predicate(idx, columns)) ++sum; + if (predicate(idx, columns)) { ++sum; } } } return sum; From 2f4a3491e0a8593041e828964095cac77a31d0e8 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Fri, 12 Jan 2018 13:11:20 -0500 Subject: [PATCH 14/19] Minor tweaks --- js/src/predicate.ts | 17 ++++++++++------- js/src/table.ts | 21 +++++++-------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/js/src/predicate.ts b/js/src/predicate.ts index 2b0be44d472dc..1fedc98d1c41e 100644 --- a/js/src/predicate.ts +++ b/js/src/predicate.ts @@ -19,11 +19,11 @@ export abstract class Value { } } -class Literal extends Value { +export class Literal extends Value { constructor(public v: T) { super(); } } -class Col extends Value { +export class Col extends Value { vector: Vector; colidx: number; @@ -55,7 +55,7 @@ export abstract class Predicate { ands(): Predicate[] { return [this]; } } -abstract class ComparisonPredicate extends Predicate { +export abstract class ComparisonPredicate extends Predicate { constructor(public readonly left: Value, public readonly right: Value) { super(); } @@ -105,7 +105,7 @@ class Or extends CombinationPredicate { } } -class Equals extends ComparisonPredicate { +export class Equals extends ComparisonPredicate { protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v == right.v; return () => rtrn; @@ -121,6 +121,9 @@ class Equals extends ComparisonPredicate { const col_func = col.bind(cols); if (col.vector instanceof DictionaryVector) { // Assume that there is only one key with the value `lit.v` + // TODO: add lazily-computed reverse dictionary lookups, associated + // with col.vector.data so that we only have to do this once per + // dictionary let key = -1; for (; ++key < col.vector.data.length;) { if (col.vector.data.get(key) === lit.v) { @@ -146,7 +149,7 @@ class Equals extends ComparisonPredicate { } } -class LTeq extends ComparisonPredicate { +export class LTeq extends ComparisonPredicate { protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v <= right.v; return () => rtrn; @@ -164,7 +167,7 @@ class LTeq extends ComparisonPredicate { } } -class GTeq extends ComparisonPredicate { +export class GTeq extends ComparisonPredicate { protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v >= right.v; return () => rtrn; @@ -183,4 +186,4 @@ class GTeq extends ComparisonPredicate { } export function lit(n: number): Value { return new Literal(n); } -export function col(n: string): Value { return new Col(n); } +export function col(n: string): Col { return new Col(n); } diff --git a/js/src/table.ts b/js/src/table.ts index 613699f0b66b1..620a4a701c80f 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -37,8 +37,6 @@ export class TableRow { } export interface DataFrame { - readonly batches: Vector[][]; - readonly lengths: Uint32Array; filter(predicate: Predicate): DataFrame; scan(next: NextFunc): void; count(): number; @@ -129,23 +127,18 @@ export class Table implements DataFrame { } class FilteredDataFrame implements DataFrame { - readonly lengths: Uint32Array; - readonly batches: Vector[][]; - constructor (readonly parent: DataFrame, private predicate: Predicate) { - this.batches = parent.batches; - this.lengths = parent.lengths; - } + constructor (readonly parent: Table, private predicate: Predicate) {} scan(next: NextFunc) { // inlined version of this: // this.parent.scan((idx, columns) => { // if (this.predicate(idx, columns)) next(idx, columns); // }); - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; + for (let batch = -1; ++batch < this.parent.lengths.length;) { + const length = this.parent.lengths[batch]; // load batches - const columns = this.batches[batch]; + const columns = this.parent.batches[batch]; const predicate = this.predicate.bind(columns); // yield all indices @@ -163,11 +156,11 @@ class FilteredDataFrame implements DataFrame { // }); // return sum; let sum = 0; - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; + for (let batch = -1; ++batch < this.parent.lengths.length;) { + const length = this.parent.lengths[batch]; // load batches - const columns = this.batches[batch]; + const columns = this.parent.batches[batch]; const predicate = this.predicate.bind(columns); // yield all indices From 671914776f3454748c39cb4ee97714ead788e0bd Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Mon, 15 Jan 2018 12:39:33 -0500 Subject: [PATCH 15/19] Add DataFrame.countBy operation --- js/perf/index.js | 26 ++++++++++++--- js/perf/table_config.js | 10 ++++-- js/src/table.ts | 73 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 8 deletions(-) diff --git a/js/perf/index.js b/js/perf/index.js index 0be4db3084dbf..d31b6430ec871 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -41,17 +41,23 @@ for (let { name, buffers} of config) { suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); } -for (let {name, buffers, tests} of require('./table_config')) { +for (let {name, buffers, countBys, counts} of require('./table_config')) { + const table = Table.from(buffers); + + const dfCountBySuite = new Benchmark.Suite(`DataFrame Count By "${name}"`, { async: true }); + for (countBy of countBys) { + dfCountBySuite.add(createDataFrameCountByTest(table, countBy)); + } + const dfFilterCountSuite = new Benchmark.Suite(`DataFrame Filter-Scan Count "${name}"`, { async: true }); const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true }); - const table = Table.from(buffers); - for (test of tests) { + for (test of counts) { dfFilterCountSuite.add(createDataFrameFilterCountTest(table, test.col, test.test, test.value)) dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value)) } - suites.push(dfFilterCountSuite, dfDirectCountSuite) + suites.push(dfCountBySuite, dfFilterCountSuite, dfDirectCountSuite) } console.log('Running apache-arrow performance tests...\n'); @@ -167,6 +173,18 @@ function createDataFrameDirectCountTest(table, column, test, value) { }; } +function createDataFrameCountByTest(table, column) { + let colidx = table.columns.findIndex((c)=>c.name === column); + + return { + async: true, + name: `name: '${column}', length: ${table.length}, type: ${table.columns[colidx].type}`, + fn() { + table.countBy(col(column)); + } + }; +} + function createDataFrameFilterCountTest(table, column, test, value) { let colidx = table.columns.findIndex((c)=>c.name === column); let df; diff --git a/js/perf/table_config.js b/js/perf/table_config.js index 3c045e4571e44..e3c332c870f38 100644 --- a/js/perf/table_config.js +++ b/js/perf/table_config.js @@ -22,7 +22,10 @@ const glob = require('glob'); const config = []; const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`)); -tests = { +countBys = { + "tracks": ['origin', 'destination'] +} +counts = { "tracks": [ {col: 'lat', test: 'gteq', value: 0 }, {col: 'lng', test: 'gteq', value: 0 }, @@ -32,11 +35,12 @@ tests = { for (const filename of filenames) { const { name } = path.parse(filename); - if (name in tests) { + if (name in counts) { config.push({ name, buffers: [fs.readFileSync(filename)], - tests: tests[name] + countBys: countBys[name], + counts: counts[name], }); } } diff --git a/js/src/table.ts b/js/src/table.ts index 620a4a701c80f..6f312746f2c71 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -16,8 +16,10 @@ // under the License. import { Vector } from './vector/vector'; +import { DictionaryVector } from './vector/dictionary'; +import { Uint32Vector } from './vector/numeric'; import { read, readAsync } from './reader/arrow'; -import { Predicate } from './predicate'; +import { Col, Predicate } from './predicate'; export type NextFunc = (idx: number, cols: Vector[]) => void; @@ -40,6 +42,7 @@ export interface DataFrame { filter(predicate: Predicate): DataFrame; scan(next: NextFunc): void; count(): number; + countBy(col: (Col|string)): Table; } function columnsFromBatches(batches: Vector[][]) { @@ -111,6 +114,40 @@ export class Table implements DataFrame { count(): number { return this.lengths.reduce((acc, val) => acc + val); } + countBy(count_by: (Col|string)): Table { + if (count_by instanceof String) { + count_by = new Col(count_by); + } + + // the last batch will have the most complete dictionary, use it's data + // vector as our count by keys + count_by.bind(this.batches[this.batches.length - 1]); + if (!(count_by.vector instanceof DictionaryVector)) { + throw new Error("countBy currently only supports dictionary-encoded columns"); + } + + let keys: Vector = (count_by.vector as DictionaryVector).data; + // TODO: Adjust array byte width based on overall length + // (e.g. if this.length <= 255 use Uint8Array, etc...) + let counts: Uint32Array = new Uint32Array(keys.length); + + + for (let batch = -1; ++batch < this.lengths.length;) { + const length = this.lengths[batch]; + + // load batches + const columns = this.batches[batch]; + count_by.bind(columns); + + // yield all indices + for (let idx = -1; ++idx < length;) { + let key = (count_by.vector as DictionaryVector).getKey(idx) + if (key !== null) { counts[key]++; } + } + } + + return new Table({batches: [[keys, new Uint32Vector({data: counts})]]}) + } *[Symbol.iterator]() { for (let batch = -1; ++batch < this.lengths.length;) { const length = this.lengths[batch]; @@ -177,4 +214,38 @@ class FilteredDataFrame implements DataFrame { this.predicate.and(predicate) ); } + + countBy(count_by: (Col|string)): Table { + if (count_by instanceof String) { + count_by = new Col(count_by); + } + + // the last batch will have the most complete dictionary, use it's data + // vector as our count by keys + count_by.bind(this.parent.batches[this.parent.batches.length - 1]); + if (!(count_by.vector instanceof DictionaryVector)) { + throw new Error("countBy currently only supports dictionary-encoded columns"); + } + + let keys: Vector = (count_by.vector as DictionaryVector).data; + let counts: Uint32Array = new Uint32Array(keys.length); + + + for (let batch = -1; ++batch < this.parent.lengths.length;) { + const length = this.parent.lengths[batch]; + + // load batches + const columns = this.parent.batches[batch]; + const predicate = this.predicate.bind(columns); + count_by.bind(columns); + + // yield all indices + for (let idx = -1; ++idx < length;) { + let key = (count_by.vector as DictionaryVector).getKey(idx) + if (key !== null && predicate(idx, columns)) { counts[key]++; } + } + } + + return new Table({batches: [[keys, new Uint32Vector({data: counts})]]}) + } } From 724488702a6aa8abb16d3a9f6ffb8b82ccebbe11 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Mon, 15 Jan 2018 17:08:21 -0500 Subject: [PATCH 16/19] Add table unit tests... .. also found and resolved some minor bugs (get(idx) batch length check should be <=, various extern issues with UMD builds) --- js/src/Arrow.externs.ts | 10 + js/src/Arrow.ts | 13 +- js/src/table.ts | 28 ++- js/test/unit/table-tests.ts | 371 ++++++++++++++++++++++++++++++++++++ 4 files changed, 411 insertions(+), 11 deletions(-) create mode 100644 js/test/unit/table-tests.ts diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts index 0685d262cc186..abc11eff509d9 100644 --- a/js/src/Arrow.externs.ts +++ b/js/src/Arrow.externs.ts @@ -54,6 +54,16 @@ Table.prototype.toString; Table.prototype.lengths; /** @type {?} */ Table.prototype.batches; +/** @type {?} */ +Table.prototype.countBy; +/** @type {?} */ +Table.prototype.scan; +/** @type {?} */ +Table.prototype.get; + +let CountByResult = function() {}; +/** @type {?} */ +CountByResult.prototype.asJSON; let Vector = function() {}; /** @type {?} */ diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 926ee88720bf0..21eb2976d44a4 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -import { Table, TableRow } from './table'; -import { lit, col } from './predicate'; +import { Table, TableRow, CountByResult } from './table'; +import { lit, col, Col, Value } from './predicate'; import { Vector } from './vector/vector'; import { Utf8Vector } from './vector/utf8'; import { DictionaryVector } from './vector/dictionary'; @@ -54,8 +54,8 @@ Table['fromAsync'] = Table.fromAsync; BoolVector['pack'] = BoolVector.pack; export { read, readAsync }; -export { Table, TableRow }; -export { lit, col }; +export { Table, TableRow, CountByResult }; +export { lit, col, Col, Value }; export { Vector, StructRow }; export { Uint64, Int64, Int128 }; export { NumericVectorConstructor } from './vector/numeric'; @@ -94,9 +94,11 @@ try { // string indexers tell closure compiler not to rename these properties Arrow['lit'] = lit; Arrow['col'] = col; + Arrow['Col'] = Col; Arrow['read'] = read; - Arrow['readAsync'] = readAsync; + Arrow['Value'] = Value; Arrow['Table'] = Table; + Arrow['readAsync'] = readAsync; Arrow['Vector'] = Vector; Arrow['StructRow'] = StructRow; Arrow['BoolVector'] = BoolVector; @@ -120,6 +122,7 @@ try { Arrow['Float32Vector'] = Float32Vector; Arrow['Float64Vector'] = Float64Vector; Arrow['DecimalVector'] = DecimalVector; + Arrow['CountByResult'] = CountByResult; Arrow['TimestampVector'] = TimestampVector; Arrow['DictionaryVector'] = DictionaryVector; Arrow['FixedSizeListVector'] = FixedSizeListVector; diff --git a/js/src/table.ts b/js/src/table.ts index 6f312746f2c71..f00b5ef9da1df 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -42,7 +42,7 @@ export interface DataFrame { filter(predicate: Predicate): DataFrame; scan(next: NextFunc): void; count(): number; - countBy(col: (Col|string)): Table; + countBy(col: (Col|string)): CountByResult; } function columnsFromBatches(batches: Vector[][]) { @@ -87,7 +87,7 @@ export class Table implements DataFrame { } get(idx: number): TableRow { let batch = 0; - while (idx > this.lengths[batch] && batch < this.lengths.length) { + while (idx >= this.lengths[batch] && batch < this.lengths.length) { idx -= this.lengths[batch++]; } @@ -114,7 +114,7 @@ export class Table implements DataFrame { count(): number { return this.lengths.reduce((acc, val) => acc + val); } - countBy(count_by: (Col|string)): Table { + countBy(count_by: (Col|string)): CountByResult { if (count_by instanceof String) { count_by = new Col(count_by); } @@ -146,7 +146,7 @@ export class Table implements DataFrame { } } - return new Table({batches: [[keys, new Uint32Vector({data: counts})]]}) + return new CountByResult(keys, new Uint32Vector({data: counts})) } *[Symbol.iterator]() { for (let batch = -1; ++batch < this.lengths.length;) { @@ -215,7 +215,7 @@ class FilteredDataFrame implements DataFrame { ); } - countBy(count_by: (Col|string)): Table { + countBy(count_by: (Col|string)): CountByResult { if (count_by instanceof String) { count_by = new Col(count_by); } @@ -246,6 +246,22 @@ class FilteredDataFrame implements DataFrame { } } - return new Table({batches: [[keys, new Uint32Vector({data: counts})]]}) + return new CountByResult(keys, new Uint32Vector({data: counts})) + } +} + +export class CountByResult extends Table implements DataFrame { + constructor(readonly keys: Vector, readonly counts: Vector) { + super({batches: [[keys, counts]]}); + } + + asJSON(): Object { + let result: {[key: string]: number|null} = {}; + + for (let i = -1; ++i < this.length;) { + result[this.keys.get(i)] = this.counts.get(i); + } + + return result; } } diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts new file mode 100644 index 0000000000000..33fb2d178b0d2 --- /dev/null +++ b/js/test/unit/table-tests.ts @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import Arrow, { +} from '../Arrow'; + +const { + col, + Table, +} = Arrow; + +describe(`Table`, () => { + describe(`single record batch`, () => { + const table = Table.from({ + "schema": { + "fields": [ + { + "name": "f32", + "type": { + "name": "floatingpoint", + "precision": "SINGLE" + }, + "nullable": false, + "children": [], + }, + { + "name": "i32", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": false, + "children": [], + }, + { + "name": "dictionary", + "type": { + "name": "utf8" + }, + "nullable": false, + "children": [], + "dictionary": { + "id": 0, + "indexType": { + "name": "int", + "isSigned": true, + "bitWidth": 8 + }, + "isOrdered": false + } + } + ] + }, + "dictionaries": [{ + "id": 0, + "data": { + "count": 3, + "columns": [ + { + "name": "DICT0", + "count": 3, + "VALIDITY": [], + "OFFSET": [ + 0, + 1, + 2, + 3 + ], + "DATA": [ + "a", + "b", + "c", + ] + } + ] + } + }], + "batches": [{ + "count": 7, + "columns": [ + { + "name": "f32", + "count": 7, + "VALIDITY": [], + "DATA": [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3] + }, + { + "name": "i32", + "count": 7, + "VALIDITY": [], + "DATA": [-1, 1, -1, 1, -1, 1, -1] + }, + { + "name": "dictionary", + "count": 7, + "VALIDITY": [], + "DATA": [0, 1, 2, 0, 1, 2, 0] + } + ] + }] + }); + + // Wrap floating point values in a Float32Array and take them back out to + // make sure that equality checks will pass + const values = [ + [new Float32Array([-0.3])[0], -1, 'a'], + [new Float32Array([-0.2])[0], 1, 'b'], + [new Float32Array([-0.1])[0], -1, 'c'], + [new Float32Array([ 0 ])[0], 1, 'a'], + [new Float32Array([ 0.1])[0], -1, 'b'], + [new Float32Array([ 0.2])[0], 1, 'c'], + [new Float32Array([ 0.3])[0], -1, 'a'] + ] + test(`has the correct length`, () => { + expect(table.length).toEqual(values.length); + }); + test(`gets expected values`, () => { + for (let i = -1; ++i < values.length;) { + expect(table.get(i).toArray()).toEqual(values[i]); + } + }); + test(`iterates expected values`, () => { + let i = 0; + for (let row of table) { + expect(row.toArray()).toEqual(values[i++]); + } + }); + test(`scans expected values`, () => { + let expected_idx = 0; + table.scan((idx, cols) => { + expect(cols.map((c)=>c.get(idx))).toEqual(values[expected_idx++]); + }); + }); + test(`count() returns the correct length`, () => { + expect(table.count()).toEqual(values.length); + }); + test(`filter on f32 >= 0 returns the correct length`, () => { + expect(table.filter(col('f32').gteq(0)).count()).toEqual(4); + }); + test(`filter on i32 <= 0 returns the correct length`, () => { + expect(table.filter(col('i32').lteq(0)).count()).toEqual(4); + }); + test(`filter on dictionary == 'a' returns the correct length`, () => { + expect(table.filter(col('dictionary').eq('a')).count()).toEqual(3); + }); + test(`countBy on dictionary returns the correct counts`, () => { + expect(table.countBy(col('dictionary')).asJSON()).toEqual({ + 'a': 3, + 'b': 2, + 'c': 2, + }); + }); + test(`countBy on dictionary with filter returns the correct counts`, () => { + expect(table.filter(col('i32').eq(1)).countBy(col('dictionary')).asJSON()).toEqual({ + 'a': 1, + 'b': 1, + 'c': 1, + }); + }); + }); + describe(`multiple record batches`, () => { + const table = Table.from({ + "schema": { + "fields": [ + { + "name": "f32", + "type": { + "name": "floatingpoint", + "precision": "SINGLE" + }, + "nullable": false, + "children": [], + }, + { + "name": "i32", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": false, + "children": [], + }, + { + "name": "dictionary", + "type": { + "name": "utf8" + }, + "nullable": false, + "children": [], + "dictionary": { + "id": 0, + "indexType": { + "name": "int", + "isSigned": true, + "bitWidth": 8 + }, + "isOrdered": false + } + } + ] + }, + "dictionaries": [{ + "id": 0, + "data": { + "count": 3, + "columns": [ + { + "name": "DICT0", + "count": 3, + "VALIDITY": [], + "OFFSET": [ + 0, + 1, + 2, + 3 + ], + "DATA": [ + "a", + "b", + "c", + ] + } + ] + } + }], + "batches": [{ + "count": 3, + "columns": [ + { + "name": "f32", + "count": 3, + "VALIDITY": [], + "DATA": [-0.3, -0.2, -0.1] + }, + { + "name": "i32", + "count": 3, + "VALIDITY": [], + "DATA": [-1, 1, -1] + }, + { + "name": "dictionary", + "count": 3, + "VALIDITY": [], + "DATA": [0, 1, 2] + } + ] + }, { + "count": 3, + "columns": [ + { + "name": "f32", + "count": 3, + "VALIDITY": [], + "DATA": [0, 0.1, 0.2] + }, + { + "name": "i32", + "count": 3, + "VALIDITY": [], + "DATA": [1, -1, 1] + }, + { + "name": "dictionary", + "count": 3, + "VALIDITY": [], + "DATA": [0, 1, 2] + } + ] + }, { + "count": 3, + "columns": [ + { + "name": "f32", + "count": 3, + "VALIDITY": [], + "DATA": [0.3, 0.2, 0.1] + }, + { + "name": "i32", + "count": 3, + "VALIDITY": [], + "DATA": [-1, 1, -1] + }, + { + "name": "dictionary", + "count": 3, + "VALIDITY": [], + "DATA": [0, 1, 2] + } + ] + }] + }); + + // Wrap floating point values in a Float32Array and take them back out to + // make sure that equality checks will pass + const values = [ + [new Float32Array([-0.3])[0], -1, 'a'], + [new Float32Array([-0.2])[0], 1, 'b'], + [new Float32Array([-0.1])[0], -1, 'c'], + [new Float32Array([ 0 ])[0], 1, 'a'], + [new Float32Array([ 0.1])[0], -1, 'b'], + [new Float32Array([ 0.2])[0], 1, 'c'], + [new Float32Array([ 0.3])[0], -1, 'a'], + [new Float32Array([ 0.2])[0], 1, 'b'], + [new Float32Array([ 0.1])[0], -1, 'c'], + ] + test(`has the correct length`, () => { + expect(table.length).toEqual(values.length); + }); + test(`gets expected values`, () => { + for (let i = -1; ++i < values.length;) { + expect(table.get(i).toArray()).toEqual(values[i]); + } + }); + test(`iterates expected values`, () => { + let i = 0; + for (let row of table) { + expect(row.toArray()).toEqual(values[i++]); + } + }); + test(`scans expected values`, () => { + let expected_idx = 0; + table.scan((idx, cols) => { + expect(cols.map((c)=>c.get(idx))).toEqual(values[expected_idx++]); + }); + }); + test(`count() returns the correct length`, () => { + expect(table.count()).toEqual(values.length); + }); + test(`filter on f32 >= 0 returns the correct length`, () => { + expect(table.filter(col('f32').gteq(0)).count()).toEqual(6); + }); + test(`filter on i32 <= 0 returns the correct length`, () => { + expect(table.filter(col('i32').lteq(0)).count()).toEqual(5); + }); + test(`filter on dictionary == 'a' returns the correct length`, () => { + expect(table.filter(col('dictionary').eq('a')).count()).toEqual(3); + }); + test(`countBy on dictionary returns the correct counts`, () => { + expect(table.countBy(col('dictionary')).asJSON()).toEqual({ + 'a': 3, + 'b': 3, + 'c': 3, + }); + }); + test(`countBy on dictionary with filter returns the correct counts`, () => { + expect(table.filter(col('i32').eq(1)).countBy(col('dictionary')).asJSON()).toEqual({ + 'a': 1, + 'b': 2, + 'c': 1, + }); + }); + }); +}); From 20717d59379b6c5f590e0a6771a3e0818ddc3de8 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Mon, 15 Jan 2018 17:53:22 -0500 Subject: [PATCH 17/19] Fixed countBy(string) --- js/perf/index.js | 2 +- js/src/table.ts | 4 ++-- js/test/unit/table-tests.ts | 16 +++++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/js/perf/index.js b/js/perf/index.js index d31b6430ec871..29d5edf56de8e 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -180,7 +180,7 @@ function createDataFrameCountByTest(table, column) { async: true, name: `name: '${column}', length: ${table.length}, type: ${table.columns[colidx].type}`, fn() { - table.countBy(col(column)); + table.countBy(column); } }; } diff --git a/js/src/table.ts b/js/src/table.ts index f00b5ef9da1df..554844be2c8c4 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -115,7 +115,7 @@ export class Table implements DataFrame { return this.lengths.reduce((acc, val) => acc + val); } countBy(count_by: (Col|string)): CountByResult { - if (count_by instanceof String) { + if (!(count_by instanceof Col)) { count_by = new Col(count_by); } @@ -216,7 +216,7 @@ class FilteredDataFrame implements DataFrame { } countBy(count_by: (Col|string)): CountByResult { - if (count_by instanceof String) { + if (!(count_by instanceof Col)) { count_by = new Col(count_by); } diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 33fb2d178b0d2..9b19c9e56a243 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -159,14 +159,21 @@ describe(`Table`, () => { expect(table.filter(col('dictionary').eq('a')).count()).toEqual(3); }); test(`countBy on dictionary returns the correct counts`, () => { + // Make sure countBy works both with and without the Col wrapper + // class expect(table.countBy(col('dictionary')).asJSON()).toEqual({ 'a': 3, 'b': 2, 'c': 2, }); + expect(table.countBy('dictionary').asJSON()).toEqual({ + 'a': 3, + 'b': 2, + 'c': 2, + }); }); test(`countBy on dictionary with filter returns the correct counts`, () => { - expect(table.filter(col('i32').eq(1)).countBy(col('dictionary')).asJSON()).toEqual({ + expect(table.filter(col('i32').eq(1)).countBy('dictionary').asJSON()).toEqual({ 'a': 1, 'b': 1, 'c': 1, @@ -354,11 +361,18 @@ describe(`Table`, () => { expect(table.filter(col('dictionary').eq('a')).count()).toEqual(3); }); test(`countBy on dictionary returns the correct counts`, () => { + // Make sure countBy works both with and without the Col wrapper + // class expect(table.countBy(col('dictionary')).asJSON()).toEqual({ 'a': 3, 'b': 3, 'c': 3, }); + expect(table.countBy('dictionary').asJSON()).toEqual({ + 'a': 3, + 'b': 3, + 'c': 3, + }); }); test(`countBy on dictionary with filter returns the correct counts`, () => { expect(table.filter(col('i32').eq(1)).countBy(col('dictionary')).asJSON()).toEqual({ From edcbdbed1f45524561e362b9eb4a7ce5c88f0ecc Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Tue, 16 Jan 2018 10:37:33 -0500 Subject: [PATCH 18/19] cleanup --- js/src/table.ts | 49 +++--- js/test/unit/table-tests.ts | 290 ++++++++++++++++++------------------ 2 files changed, 175 insertions(+), 164 deletions(-) diff --git a/js/src/table.ts b/js/src/table.ts index 554844be2c8c4..d4fe5a93223d8 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -54,14 +54,17 @@ function columnsFromBatches(batches: Vector[][]) { export class Table implements DataFrame { static from(sources?: Iterable | object | string) { - let batches: Vector[][] = [[]]; + let batches: Vector[][] = []; if (sources) { - batches = Array.from(read(sources)); + batches = []; + for (let batch of read(sources)) { + batches.push(batch); + } } return new Table({ batches }); } static async fromAsync(sources?: AsyncIterable) { - let batches: Vector[][] = [[]]; + let batches: Vector[][] = []; if (sources) { batches = []; for await (let batch of readAsync(sources)) { @@ -119,18 +122,17 @@ export class Table implements DataFrame { count_by = new Col(count_by); } - // the last batch will have the most complete dictionary, use it's data - // vector as our count by keys + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary count_by.bind(this.batches[this.batches.length - 1]); if (!(count_by.vector instanceof DictionaryVector)) { - throw new Error("countBy currently only supports dictionary-encoded columns"); + throw new Error('countBy currently only supports dictionary-encoded columns'); } - let keys: Vector = (count_by.vector as DictionaryVector).data; + let data: Vector = (count_by.vector as DictionaryVector).data; // TODO: Adjust array byte width based on overall length // (e.g. if this.length <= 255 use Uint8Array, etc...) - let counts: Uint32Array = new Uint32Array(keys.length); - + let counts: Uint32Array = new Uint32Array(data.length); for (let batch = -1; ++batch < this.lengths.length;) { const length = this.lengths[batch]; @@ -138,15 +140,16 @@ export class Table implements DataFrame { // load batches const columns = this.batches[batch]; count_by.bind(columns); + const keys: Vector = (count_by.vector as DictionaryVector).keys; // yield all indices for (let idx = -1; ++idx < length;) { - let key = (count_by.vector as DictionaryVector).getKey(idx) + let key = keys.get(idx); if (key !== null) { counts[key]++; } } } - return new CountByResult(keys, new Uint32Vector({data: counts})) + return new CountByResult(data, new Uint32Vector({data: counts})); } *[Symbol.iterator]() { for (let batch = -1; ++batch < this.lengths.length;) { @@ -220,16 +223,17 @@ class FilteredDataFrame implements DataFrame { count_by = new Col(count_by); } - // the last batch will have the most complete dictionary, use it's data - // vector as our count by keys + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary count_by.bind(this.parent.batches[this.parent.batches.length - 1]); if (!(count_by.vector instanceof DictionaryVector)) { - throw new Error("countBy currently only supports dictionary-encoded columns"); + throw new Error('countBy currently only supports dictionary-encoded columns'); } - let keys: Vector = (count_by.vector as DictionaryVector).data; - let counts: Uint32Array = new Uint32Array(keys.length); - + const data: Vector = (count_by.vector as DictionaryVector).data; + // TODO: Adjust array byte width based on overall length + // (e.g. if this.length <= 255 use Uint8Array, etc...) + const counts: Uint32Array = new Uint32Array(data.length); for (let batch = -1; ++batch < this.parent.lengths.length;) { const length = this.parent.lengths[batch]; @@ -238,28 +242,29 @@ class FilteredDataFrame implements DataFrame { const columns = this.parent.batches[batch]; const predicate = this.predicate.bind(columns); count_by.bind(columns); + const keys: Vector = (count_by.vector as DictionaryVector).keys; // yield all indices for (let idx = -1; ++idx < length;) { - let key = (count_by.vector as DictionaryVector).getKey(idx) + let key = keys.get(idx); if (key !== null && predicate(idx, columns)) { counts[key]++; } } } - return new CountByResult(keys, new Uint32Vector({data: counts})) + return new CountByResult(data, new Uint32Vector({data: counts})); } } export class CountByResult extends Table implements DataFrame { - constructor(readonly keys: Vector, readonly counts: Vector) { - super({batches: [[keys, counts]]}); + constructor(readonly values: Vector, readonly counts: Vector) { + super({batches: [[values, counts]]}); } asJSON(): Object { let result: {[key: string]: number|null} = {}; for (let i = -1; ++i < this.length;) { - result[this.keys.get(i)] = this.counts.get(i); + result[this.values.get(i)] = this.counts.get(i); } return result; diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 9b19c9e56a243..2b818d7ff70ea 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -26,90 +26,90 @@ const { describe(`Table`, () => { describe(`single record batch`, () => { const table = Table.from({ - "schema": { - "fields": [ + 'schema': { + 'fields': [ { - "name": "f32", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" + 'name': 'f32', + 'type': { + 'name': 'floatingpoint', + 'precision': 'SINGLE' }, - "nullable": false, - "children": [], + 'nullable': false, + 'children': [], }, { - "name": "i32", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 + 'name': 'i32', + 'type': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 32 }, - "nullable": false, - "children": [], + 'nullable': false, + 'children': [], }, { - "name": "dictionary", - "type": { - "name": "utf8" + 'name': 'dictionary', + 'type': { + 'name': 'utf8' }, - "nullable": false, - "children": [], - "dictionary": { - "id": 0, - "indexType": { - "name": "int", - "isSigned": true, - "bitWidth": 8 + 'nullable': false, + 'children': [], + 'dictionary': { + 'id': 0, + 'indexType': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 8 }, - "isOrdered": false + 'isOrdered': false } } ] }, - "dictionaries": [{ - "id": 0, - "data": { - "count": 3, - "columns": [ + 'dictionaries': [{ + 'id': 0, + 'data': { + 'count': 3, + 'columns': [ { - "name": "DICT0", - "count": 3, - "VALIDITY": [], - "OFFSET": [ + 'name': 'DICT0', + 'count': 3, + 'VALIDITY': [], + 'OFFSET': [ 0, 1, 2, 3 ], - "DATA": [ - "a", - "b", - "c", + 'DATA': [ + 'a', + 'b', + 'c', ] } ] } }], - "batches": [{ - "count": 7, - "columns": [ + 'batches': [{ + 'count': 7, + 'columns': [ { - "name": "f32", - "count": 7, - "VALIDITY": [], - "DATA": [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3] + 'name': 'f32', + 'count': 7, + 'VALIDITY': [], + 'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3] }, { - "name": "i32", - "count": 7, - "VALIDITY": [], - "DATA": [-1, 1, -1, 1, -1, 1, -1] + 'name': 'i32', + 'count': 7, + 'VALIDITY': [], + 'DATA': [-1, 1, -1, 1, -1, 1, -1] }, { - "name": "dictionary", - "count": 7, - "VALIDITY": [], - "DATA": [0, 1, 2, 0, 1, 2, 0] + 'name': 'dictionary', + 'count': 7, + 'VALIDITY': [], + 'DATA': [0, 1, 2, 0, 1, 2, 0] } ] }] @@ -125,7 +125,7 @@ describe(`Table`, () => { [new Float32Array([ 0.1])[0], -1, 'b'], [new Float32Array([ 0.2])[0], 1, 'c'], [new Float32Array([ 0.3])[0], -1, 'a'] - ] + ]; test(`has the correct length`, () => { expect(table.length).toEqual(values.length); }); @@ -143,7 +143,7 @@ describe(`Table`, () => { test(`scans expected values`, () => { let expected_idx = 0; table.scan((idx, cols) => { - expect(cols.map((c)=>c.get(idx))).toEqual(values[expected_idx++]); + expect(cols.map((c) => c.get(idx))).toEqual(values[expected_idx++]); }); }); test(`count() returns the correct length`, () => { @@ -179,137 +179,140 @@ describe(`Table`, () => { 'c': 1, }); }); + test(`countBy on non dictionary column throws error`, () => { + expect(() => { table.countBy('i32'); }).toThrow(); + }); }); describe(`multiple record batches`, () => { const table = Table.from({ - "schema": { - "fields": [ + 'schema': { + 'fields': [ { - "name": "f32", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" + 'name': 'f32', + 'type': { + 'name': 'floatingpoint', + 'precision': 'SINGLE' }, - "nullable": false, - "children": [], + 'nullable': false, + 'children': [], }, { - "name": "i32", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 + 'name': 'i32', + 'type': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 32 }, - "nullable": false, - "children": [], + 'nullable': false, + 'children': [], }, { - "name": "dictionary", - "type": { - "name": "utf8" + 'name': 'dictionary', + 'type': { + 'name': 'utf8' }, - "nullable": false, - "children": [], - "dictionary": { - "id": 0, - "indexType": { - "name": "int", - "isSigned": true, - "bitWidth": 8 + 'nullable': false, + 'children': [], + 'dictionary': { + 'id': 0, + 'indexType': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 8 }, - "isOrdered": false + 'isOrdered': false } } ] }, - "dictionaries": [{ - "id": 0, - "data": { - "count": 3, - "columns": [ + 'dictionaries': [{ + 'id': 0, + 'data': { + 'count': 3, + 'columns': [ { - "name": "DICT0", - "count": 3, - "VALIDITY": [], - "OFFSET": [ + 'name': 'DICT0', + 'count': 3, + 'VALIDITY': [], + 'OFFSET': [ 0, 1, 2, 3 ], - "DATA": [ - "a", - "b", - "c", + 'DATA': [ + 'a', + 'b', + 'c', ] } ] } }], - "batches": [{ - "count": 3, - "columns": [ + 'batches': [{ + 'count': 3, + 'columns': [ { - "name": "f32", - "count": 3, - "VALIDITY": [], - "DATA": [-0.3, -0.2, -0.1] + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-0.3, -0.2, -0.1] }, { - "name": "i32", - "count": 3, - "VALIDITY": [], - "DATA": [-1, 1, -1] + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-1, 1, -1] }, { - "name": "dictionary", - "count": 3, - "VALIDITY": [], - "DATA": [0, 1, 2] + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] } ] }, { - "count": 3, - "columns": [ + 'count': 3, + 'columns': [ { - "name": "f32", - "count": 3, - "VALIDITY": [], - "DATA": [0, 0.1, 0.2] + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 0.1, 0.2] }, { - "name": "i32", - "count": 3, - "VALIDITY": [], - "DATA": [1, -1, 1] + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [1, -1, 1] }, { - "name": "dictionary", - "count": 3, - "VALIDITY": [], - "DATA": [0, 1, 2] + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] } ] }, { - "count": 3, - "columns": [ + 'count': 3, + 'columns': [ { - "name": "f32", - "count": 3, - "VALIDITY": [], - "DATA": [0.3, 0.2, 0.1] + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0.3, 0.2, 0.1] }, { - "name": "i32", - "count": 3, - "VALIDITY": [], - "DATA": [-1, 1, -1] + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-1, 1, -1] }, { - "name": "dictionary", - "count": 3, - "VALIDITY": [], - "DATA": [0, 1, 2] + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] } ] }] @@ -327,7 +330,7 @@ describe(`Table`, () => { [new Float32Array([ 0.3])[0], -1, 'a'], [new Float32Array([ 0.2])[0], 1, 'b'], [new Float32Array([ 0.1])[0], -1, 'c'], - ] + ]; test(`has the correct length`, () => { expect(table.length).toEqual(values.length); }); @@ -345,7 +348,7 @@ describe(`Table`, () => { test(`scans expected values`, () => { let expected_idx = 0; table.scan((idx, cols) => { - expect(cols.map((c)=>c.get(idx))).toEqual(values[expected_idx++]); + expect(cols.map((c) => c.get(idx))).toEqual(values[expected_idx++]); }); }); test(`count() returns the correct length`, () => { @@ -381,5 +384,8 @@ describe(`Table`, () => { 'c': 1, }); }); + test(`countBy on non dictionary column throws error`, () => { + expect(() => { table.countBy('i32'); }).toThrow(); + }); }); }); From e20decd57f92cc6c1fc1e807f11f7bb9d825ebde Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Tue, 16 Jan 2018 11:11:23 -0500 Subject: [PATCH 19/19] Add license headers --- js/src/predicate.ts | 17 +++++++++++++++++ js/test/data/tables/generate.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/js/src/predicate.ts b/js/src/predicate.ts index 1fedc98d1c41e..a80e56ee599e5 100644 --- a/js/src/predicate.ts +++ b/js/src/predicate.ts @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + import { Vector } from './vector/vector'; import { DictionaryVector } from './vector/dictionary'; diff --git a/js/test/data/tables/generate.py b/js/test/data/tables/generate.py index bf663fb0b1f9f..da19c6a0728c0 100644 --- a/js/test/data/tables/generate.py +++ b/js/test/data/tables/generate.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import pyarrow as pa import random import numpy as np