Add a pure python sql engine benchmark.

tobymao · tobymao · commit 8d43f22a1cd5 · 2023-06-15T22:55:07.000-07:00
diff --git a/pyperformance/data-files/benchmarks/bm_sqlglot/requirements.txt b/pyperformance/data-files/benchmarks/bm_sqlglot/requirements.txt
@@ -1 +1 @@
-sqlglot==4.6.0
+sqlglot==16.2.1
diff --git a/pyperformance/data-files/benchmarks/bm_sqlglot/run_benchmark.py b/pyperformance/data-files/benchmarks/bm_sqlglot/run_benchmark.py
@@ -1,6 +1,7 @@
 import pyperf
 
 from sqlglot import parse_one, transpile
+from sqlglot.executor import execute
 from sqlglot.optimizer import optimize, normalize
 
 
@@ -48,79 +49,79 @@
 
 TPCH_SCHEMA = {
     "lineitem": {
-        "l_orderkey": "uint64",
-        "l_partkey": "uint64",
-        "l_suppkey": "uint64",
-        "l_linenumber": "uint64",
-        "l_quantity": "float64",
-        "l_extendedprice": "float64",
-        "l_discount": "float64",
-        "l_tax": "float64",
+        "l_orderkey": "int",
+        "l_partkey": "int",
+        "l_suppkey": "int",
+        "l_linenumber": "int",
+        "l_quantity": "double",
+        "l_extendedprice": "double",
+        "l_discount": "double",
+        "l_tax": "double",
         "l_returnflag": "string",
         "l_linestatus": "string",
-        "l_shipdate": "date32",
-        "l_commitdate": "date32",
-        "l_receiptdate": "date32",
+        "l_shipdate": "date",
+        "l_commitdate": "date",
+        "l_receiptdate": "date",
         "l_shipinstruct": "string",
         "l_shipmode": "string",
         "l_comment": "string",
     },
     "orders": {
-        "o_orderkey": "uint64",
-        "o_custkey": "uint64",
+        "o_orderkey": "int",
+        "o_custkey": "int",
         "o_orderstatus": "string",
-        "o_totalprice": "float64",
-        "o_orderdate": "date32",
+        "o_totalprice": "double",
+        "o_orderdate": "date",
         "o_orderpriority": "string",
         "o_clerk": "string",
-        "o_shippriority": "int32",
+        "o_shippriority": "int",
         "o_comment": "string",
     },
     "customer": {
-        "c_custkey": "uint64",
+        "c_custkey": "int",
         "c_name": "string",
         "c_address": "string",
-        "c_nationkey": "uint64",
+        "c_nationkey": "int",
         "c_phone": "string",
-        "c_acctbal": "float64",
+        "c_acctbal": "double",
         "c_mktsegment": "string",
         "c_comment": "string",
     },
     "part": {
-        "p_partkey": "uint64",
+        "p_partkey": "int",
         "p_name": "string",
         "p_mfgr": "string",
         "p_brand": "string",
         "p_type": "string",
-        "p_size": "int32",
+        "p_size": "int",
         "p_container": "string",
-        "p_retailprice": "float64",
+        "p_retailprice": "double",
         "p_comment": "string",
     },
     "supplier": {
-        "s_suppkey": "uint64",
+        "s_suppkey": "int",
         "s_name": "string",
         "s_address": "string",
-        "s_nationkey": "uint64",
+        "s_nationkey": "int",
         "s_phone": "string",
-        "s_acctbal": "float64",
+        "s_acctbal": "double",
         "s_comment": "string",
     },
     "partsupp": {
-        "ps_partkey": "uint64",
-        "ps_suppkey": "uint64",
-        "ps_availqty": "int32",
-        "ps_supplycost": "float64",
+        "ps_partkey": "int",
+        "ps_suppkey": "int",
+        "ps_availqty": "int",
+        "ps_supplycost": "double",
         "ps_comment": "string",
     },
     "nation": {
-        "n_nationkey": "uint64",
+        "n_nationkey": "int",
         "n_name": "string",
-        "n_regionkey": "uint64",
+        "n_regionkey": "int",
         "n_comment": "string",
     },
     "region": {
-        "r_regionkey": "uint64",
+        "r_regionkey": "int",
         "r_name": "string",
         "r_comment": "string",
     },
@@ -164,10 +165,58 @@ def bench_normalize(loops):
     return elapsed
 
 
+def bench_execute(loops):
+    tables = {
+        "sushi": [
+            {"id": 1, "price": 1.0},
+            {"id": 2, "price": 2.0},
+            {"id": 3, "price": 3.0},
+        ],
+        "order_items": [
+            {"sushi_id": 1, "order_id": 1},
+            {"sushi_id": 1, "order_id": 1},
+            {"sushi_id": 2, "order_id": 1},
+            {"sushi_id": 3, "order_id": 2},
+        ],
+        "orders": [
+            {"id": 1, "user_id": 1},
+            {"id": 2, "user_id": 2},
+        ],
+    }
+
+    for i in range(10000):
+        tables["sushi"].append({"id": i, "price": i})
+        tables["order_items"].append({"sushi_id": i, "order_id": i})
+        tables["orders"].append({"id": i, "user_id": i})
+
+    elapsed = 0
+    for _ in range(loops):
+        t0 = pyperf.perf_counter()
+        execute(
+            """
+            SELECT
+              o.user_id,
+              s.price / 2 AS half_price,
+              AVG(s.price) AS avg_price,
+              SUM(s.price) AS price
+            FROM orders o
+            JOIN order_items i
+              ON o.id = i.order_id
+            JOIN sushi s
+              ON i.sushi_id = s.id
+            GROUP BY o.user_id
+            """,
+            tables=tables,
+        )
+        elapsed += pyperf.perf_counter() - t0
+    return elapsed
+
+
 if __name__ == "__main__":
     runner = pyperf.Runner()
     runner.metadata['description'] = "SQLGlot benchmark"
     runner.bench_time_func("sqlglot_parse", bench_parse)
     runner.bench_time_func("sqlglot_transpile", bench_transpile)
     runner.bench_time_func("sqlglot_optimize", bench_optimize)
     runner.bench_time_func("sqlglot_normalize", bench_normalize)
+    runner.bench_time_func("sqlglot_execute", bench_execute)