From 46accd570414dfa3d16d4e49dd5e7f4a8244a314 Mon Sep 17 00:00:00 2001
From: Michael Darakananda
Date: Fri, 27 Oct 2017 16:45:11 +1100
Subject: [PATCH] bigquery: add simple benchmark

Runs queries and measures time to first response and time to iterate all rows
---
 bigquery/benchmark/README.md    |  8 ++++++++
 bigquery/benchmark/benchmark.py | 32 ++++++++++++++++++++++++++++++++
 bigquery/benchmark/queries.json | 10 ++++++++++
 3 files changed, 50 insertions(+)
 create mode 100644 bigquery/benchmark/README.md
 create mode 100644 bigquery/benchmark/benchmark.py
 create mode 100644 bigquery/benchmark/queries.json

diff --git a/bigquery/benchmark/README.md b/bigquery/benchmark/README.md
new file mode 100644
index 000000000000..435926acb045
--- /dev/null
+++ b/bigquery/benchmark/README.md
@@ -0,0 +1,8 @@
+# BigQuery Benchmark
+This directory contains benchmarks for the BigQuery client.
+
+## Usage
+`python benchmark.py queries.json`
+
+The BigQuery service caches query results, so run the benchmark
+at least twice and disregard the first result.
diff --git a/bigquery/benchmark/benchmark.py b/bigquery/benchmark/benchmark.py
new file mode 100644
index 000000000000..0281edbd1b6b
--- /dev/null
+++ b/bigquery/benchmark/benchmark.py
@@ -0,0 +1,62 @@
+"""Benchmark BigQuery queries read from a JSON file.
+
+For each query, report the row and column counts, the time until the first
+row arrives, and the total time to iterate over every row.
+"""
+from __future__ import print_function
+
+import json
+import sys
+from timeit import default_timer
+
+from google.cloud import bigquery
+
+
+def run_query(client, query):
+    """Run one query and time how long its rows take to arrive.
+
+    Returns (num_rows, num_cols, first_row_secs, total_secs); num_cols and
+    first_row_secs are None when the query yields no rows.  Raises
+    RuntimeError if the rows do not all have the same number of columns.
+    """
+    # timeit's default timer is the platform's best clock for intervals
+    # (time.perf_counter on Python 3), unlike wall-clock datetime.now().
+    start = default_timer()
+    rows = client.query(query).result()
+
+    num_rows = 0
+    num_cols = None
+    first_row_secs = None
+    for row in rows:
+        if num_rows == 0:
+            num_cols = len(row)
+            first_row_secs = default_timer() - start
+        elif len(row) != num_cols:
+            raise RuntimeError(
+                'found {0} columns, expected {1}'.format(len(row), num_cols))
+        num_rows += 1
+    return num_rows, num_cols, first_row_secs, default_timer() - start
+
+
+def main(argv):
+    """Benchmark every query in the JSON file named by argv[1]."""
+    if len(argv) < 2:
+        sys.exit('need query file, usage: python {0} queries.json'.format(argv[0]))
+
+    with open(argv[1], 'r') as f:
+        queries = json.load(f)
+
+    client = bigquery.Client()
+    for query in queries:
+        num_rows, num_cols, first_row_secs, total_secs = run_query(client, query)
+        # An empty result never delivers a first row, so there is nothing to time.
+        if first_row_secs is None:
+            first_byte = 'n/a'
+        else:
+            first_byte = '{0} sec'.format(first_row_secs)
+        print('query {0}: {1} rows, {2} cols, first byte {3}, total {4} sec'.format(
+            query, num_rows, num_cols, first_byte, total_secs))
+
+
+if __name__ == '__main__':
+    main(sys.argv)
diff --git a/bigquery/benchmark/queries.json b/bigquery/benchmark/queries.json
new file mode 100644
index 000000000000..13fed38b52b3
--- /dev/null
+++ b/bigquery/benchmark/queries.json
@@ -0,0 +1,10 @@
+[
+  "SELECT * FROM `nyc-tlc.yellow.trips` LIMIT 10000",
+  "SELECT * FROM `nyc-tlc.yellow.trips` LIMIT 100000",
+  "SELECT * FROM `nyc-tlc.yellow.trips` LIMIT 1000000",
+  "SELECT title FROM `bigquery-public-data.samples.wikipedia` ORDER BY title LIMIT 1000",
+  "SELECT title, id, timestamp, contributor_ip FROM `bigquery-public-data.samples.wikipedia` WHERE title like 'Blo%' ORDER BY id",
+  "SELECT * FROM `bigquery-public-data.baseball.games_post_wide` ORDER BY gameId",
+  "SELECT * FROM `bigquery-public-data.samples.github_nested` WHERE repository.has_downloads ORDER BY repository.created_at LIMIT 10000",
+  "SELECT repo_name, path FROM `bigquery-public-data.github_repos.files` 
WHERE path LIKE '%.java' ORDER BY id LIMIT 1000000" +]