From 46accd570414dfa3d16d4e49dd5e7f4a8244a314 Mon Sep 17 00:00:00 2001
From: Michael Darakananda <pongad@google.com>
Date: Fri, 27 Oct 2017 16:45:11 +1100
Subject: [PATCH] bigquery: add simple benchmark

Runs queries and measures the time to the first response
and the time to iterate over all rows
---
 bigquery/benchmark/README.md    |  8 ++++++++
 bigquery/benchmark/benchmark.py | 32 ++++++++++++++++++++++++++++++++
 bigquery/benchmark/queries.json | 10 ++++++++++
 3 files changed, 50 insertions(+)
 create mode 100644 bigquery/benchmark/README.md
 create mode 100644 bigquery/benchmark/benchmark.py
 create mode 100644 bigquery/benchmark/queries.json

diff --git a/bigquery/benchmark/README.md b/bigquery/benchmark/README.md
new file mode 100644
index 000000000000..435926acb045
--- /dev/null
+++ b/bigquery/benchmark/README.md
@@ -0,0 +1,8 @@
+# BigQuery Benchmark
+This directory contains benchmarks for the BigQuery client.
+
+## Usage
+`python benchmark.py queries.json`
+
+The BigQuery service caches query results, so the benchmark should be run
+at least twice, disregarding the first result.
diff --git a/bigquery/benchmark/benchmark.py b/bigquery/benchmark/benchmark.py
new file mode 100644
index 000000000000..0281edbd1b6b
--- /dev/null
+++ b/bigquery/benchmark/benchmark.py
@@ -0,0 +1,32 @@
+from google.cloud import bigquery
+from datetime import datetime
+import json
+import sys
+
+# Run every query listed in the JSON file given on the command line and print its timings.
+if len(sys.argv) < 2:
+    raise Exception('need query file, usage: python {0} <queries.json>'.format(sys.argv[0]))
+
+with open(sys.argv[1], 'r') as f:
+    queries = json.load(f)
+
+client = bigquery.Client()
+
+for query in queries:
+    start_time = datetime.now()
+    rows = client.query(query).result()
+    num_rows = 0
+    num_cols = None
+    first_byte_time = None
+    for row in rows:
+        if num_rows == 0:
+            num_cols = len(row)
+            first_byte_time = datetime.now() - start_time  # time until the first row arrived
+        elif num_cols != len(row):  # every row is expected to have the same width
+            raise Exception('found {0} columns, expected {1}'.format(len(row), num_cols))
+        num_rows += 1
+    total_time = datetime.now() - start_time
+    if first_byte_time is None:  # empty result set: no row arrived, so fall back to the total
+        first_byte_time = total_time
+    print("query {0}: {1} rows, {2} cols, first byte {3} sec, total {4} sec".format(
+        query, num_rows, num_cols, first_byte_time.total_seconds(), total_time.total_seconds()))
diff --git a/bigquery/benchmark/queries.json b/bigquery/benchmark/queries.json
new file mode 100644
index 000000000000..13fed38b52b3
--- /dev/null
+++ b/bigquery/benchmark/queries.json
@@ -0,0 +1,10 @@
+[
+  "SELECT * FROM `nyc-tlc.yellow.trips` LIMIT 10000",
+  "SELECT * FROM `nyc-tlc.yellow.trips` LIMIT 100000",
+  "SELECT * FROM `nyc-tlc.yellow.trips` LIMIT 1000000",
+  "SELECT title FROM `bigquery-public-data.samples.wikipedia` ORDER BY title LIMIT 1000",
+  "SELECT title, id, timestamp, contributor_ip FROM `bigquery-public-data.samples.wikipedia` WHERE title like 'Blo%' ORDER BY id",
+  "SELECT * FROM `bigquery-public-data.baseball.games_post_wide` ORDER BY gameId",
+  "SELECT * FROM `bigquery-public-data.samples.github_nested` WHERE repository.has_downloads ORDER BY repository.created_at LIMIT 10000",
+  "SELECT repo_name, path FROM `bigquery-public-data.github_repos.files` WHERE path LIKE '%.java' ORDER BY id LIMIT 1000000"
+]