diff --git a/js/generate.py b/js/generate.py
new file mode 100644
index 0000000000000..bf663fb0b1f9f
--- /dev/null
+++ b/js/generate.py
@@ -0,0 +1,57 @@
+"""Write randomly generated Arrow record batches to a stream file."""
+import pyarrow as pa
+import random
+import numpy as np
+import pandas as pd
+
+
+cities = [u'Charlottesville', u'New York', u'San Francisco', u'Seattle', u'Terre Haute', u'Washington, DC']
+
+
+def generate_batch(batch_len):
+    """Return a RecordBatch of `batch_len` randomly generated rows.
+
+    'lat' and 'lng' are float32 columns drawn uniformly from -90..90 and
+    -180..180; 'origin' and 'destination' are categorical columns whose
+    values are picked at random from `cities`.
+    """
+    return pa.RecordBatch.from_arrays([
+        pa.Array.from_pandas(pd.Series(np.random.uniform(-90, 90, batch_len), dtype="float32")),
+        pa.Array.from_pandas(pd.Series(np.random.uniform(-180, 180, batch_len), dtype="float32")),
+        # Build lists rather than generators: pd.Categorical documents its values as list-like.
+        pa.Array.from_pandas(pd.Categorical([random.choice(cities) for _ in range(batch_len)], cities)),
+        pa.Array.from_pandas(pd.Categorical([random.choice(cities) for _ in range(batch_len)], cities))
+    ], ['lat', 'lng', 'origin', 'destination'])
+
+
+def write_record_batches(fd, batch_len, num_batches):
+    """Write `num_batches` random batches of `batch_len` rows each to `fd`.
+
+    `fd` is handed to RecordBatchStreamWriter as the sink; when it is a
+    file object it must have been opened in binary mode.
+    """
+    # A throwaway one-row batch supplies the schema the writer needs up front.
+    writer = pa.ipc.RecordBatchStreamWriter(fd, generate_batch(1).schema)
+    try:
+        for _ in range(num_batches):
+            writer.write_batch(generate_batch(batch_len))
+    finally:
+        # Close the writer even if generating or writing a batch fails.
+        writer.close()
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='path of the output file to write')
+    parser.add_argument('-n', '--num-batches', help='number of batches', type=int, default=10)
+    parser.add_argument('-b', '--batch-size', help='size of each batch', type=int, default=100000)
+
+    args = parser.parse_args()
+
+    # Calling print() with a single argument works on both Python 2 and Python 3.
+    print("Writing {} {}-element batches to '{}'".format(args.num_batches, args.batch_size, args.filename))
+    # The Arrow stream is binary data, so the file is opened in 'wb' mode.
+    with open(args.filename, 'wb') as fd:
+        write_record_batches(fd, args.batch_size, args.num_batches)
diff --git a/js/perf/index.js b/js/perf/index.js
index 9eac40e64ac71..03501913d8155 100644
--- a/js/perf/index.js
+++ b/js/perf/index.js
@@ -41,6 +41,23 @@ for (let { name, buffers} of config) {
     suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
 }
 
+// Table-level benchmarks: one group of suites per table listed in ./table_config.
+// NOTE(review): assumes `Table` is already imported at the top of this file -- confirm.
+for (let {name, buffers, tests} of require('./table_config')) {
+    const tableIterateSuite = new Benchmark.Suite(`Table Iterate ${name}`, { async: true });
+    const tableCountBySuite = new Benchmark.Suite(`Table Count By ${name}`, { async: true });
+    const vectorCountBySuite = new Benchmark.Suite(`Vector Count By ${name}`, { async: true });
+    const table = Table.from(buffers);
+
+    tableIterateSuite.add(createTableIterateTest(table));
+    for (const test of tests) {
+        tableCountBySuite.add(createTableCountByTest(table, test.col, test.test, test.value));
+        vectorCountBySuite.add(createVectorCountByTest(table.columns[test.col], test.test, test.value));
+    }
+
+    suites.push(tableIterateSuite, tableCountBySuite, vectorCountBySuite);
+}
+
 console.log('Running apache-arrow performance tests...\n');
 
 run();
@@ -109,3 +126,79 @@ function createGetByIndexTest(vector) {
         }
     };
 }
+
+/**
+ * Build a benchmark that counts the cells of `vector` matching a predicate.
+ * `test` is 'gteq' (cell >= value) or 'eq' (cell == value); any other name throws.
+ */
+function createVectorCountByTest(vector, test, value) {
+    // `sum` and `cell` are declared here so the benchmarked function assigns
+    // to closure variables instead of creating implicit globals.
+    let op, sum, cell;
+    if (test === 'gteq') {
+        op = function () {
+            sum = 0;
+            for (cell of vector) {
+                sum += (cell >= value);
+            }
+        };
+    } else if (test === 'eq') {
+        op = function () {
+            sum = 0;
+            for (cell of vector) {
+                sum += (cell == value);
+            }
+        };
+    } else {
+        throw new Error(`Unrecognized test "${test}"`);
+    }
+
+    return {
+        async: true,
+        name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}, test: ${test}, value: ${value}`,
+        fn: op
+    };
+}
+
+/** Build a benchmark that iterates over every row of `table`. */
+function createTableIterateTest(table) {
+    let row;
+    return {
+        async: true,
+        name: `length: ${table.length}`,
+        fn() { for (row of table) {} }
+    };
+}
+
+/**
+ * Build a benchmark that counts the rows of `table` whose cell in column index
+ * `column` matches a predicate. `test` is 'gteq' or 'eq'; any other name throws.
+ */
+function createTableCountByTest(table, column, test, value) {
+    // `sum` and `row` are declared here so the benchmarked function assigns
+    // to closure variables instead of creating implicit globals.
+    let op, sum, row;
+    if (test === 'gteq') {
+        op = function () {
+            sum = 0;
+            for (row of table) {
+                sum += (row.get(column) >= value);
+            }
+        };
+    } else if (test === 'eq') {
+        op = function () {
+            sum = 0;
+            for (row of table) {
+                sum += (row.get(column) == value);
+            }
+        };
+    } else {
+        throw new Error(`Unrecognized test "${test}"`);
+    }
+
+    return {
+        async: true,
+        name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`,
+        fn: op
+    };
+}
diff --git a/js/perf/table_config.js b/js/perf/table_config.js
new file mode 100644
index 0000000000000..7bface6d2cdde
--- /dev/null
+++ b/js/perf/table_config.js
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+const fs = require('fs');
+const path = require('path');
+const glob = require('glob');
+
+const config = [];
+const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`));
+
+// Predicates attached to every table entry: `col` is a column index,
+// `test` is the comparison name ('gteq' or 'eq') and `value` is its operand.
+const tests = [
+    {col: 0, test: 'gteq', value: 0 },
+    {col: 1, test: 'gteq', value: 0 },
+    {col: 2, test: 'eq', value: 'Seattle'},
+];
+
+for (const filename of filenames) {
+    const { name } = path.parse(filename);
+    config.push({ name, buffers: [fs.readFileSync(filename)], tests });
+}
+
+module.exports = config;