From 76c5609684d3eea2a29ed52b2aec390bc5c5a938 Mon Sep 17 00:00:00 2001
From: andheroe
Date: Sun, 28 Sep 2025 09:47:13 +0100
Subject: [PATCH] Add ClickBench benchmark + use Embucket experimental build

---
 benchmark/README.md                         | 117 +++++--
 benchmark/benchmark.py                      |   5 +-
 benchmark/clickbench/__init__.py            |  45 +++
 benchmark/clickbench/clickbench_ddl.py      | 134 +++++++
 benchmark/clickbench/clickbench_queries.py  | 328 ++++++++++++++++++
 .../clickbench/clickbench_table_names.py    |  62 ++++
 benchmark/data_preparation.py               |  87 +++--
 benchmark/infrastructure/README.md          |  74 +++-
 benchmark/infrastructure/bootstrap.sh       |  44 +++
 benchmark/infrastructure/docker-compose.yml |   2 +-
 benchmark/infrastructure/env.tpl            |   4 +
 benchmark/infrastructure/main.tf            |   2 +
 .../infrastructure/terraform.tfvars.example |   5 +
 benchmark/infrastructure/variables.tf       |  13 +
 14 files changed, 855 insertions(+), 67 deletions(-)
 create mode 100644 benchmark/clickbench/__init__.py
 create mode 100644 benchmark/clickbench/clickbench_ddl.py
 create mode 100644 benchmark/clickbench/clickbench_queries.py
 create mode 100644 benchmark/clickbench/clickbench_table_names.py

diff --git a/benchmark/README.md b/benchmark/README.md
index 142485c42..29873a66b 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -1,6 +1,6 @@
 ## Overview
 
-This benchmark tool executes queries derived from TPC-H against both Snowflake and Embucket with cache-clearing operations to ensure clean, cache-free performance measurements. For Snowflake, it uses warehouse suspend/resume operations. For Embucket, it restarts the Docker container before each query to eliminate internal caching. It provides detailed timing metrics including compilation time, execution time, and total elapsed time.
+This benchmark tool executes queries from multiple benchmark suites (TPC-H and ClickBench, with TPC-DS scaffolded but not yet implemented) against both Snowflake and Embucket, using cache-clearing operations to ensure cold, cache-free performance measurements. For Snowflake, it uses warehouse suspend/resume operations. For Embucket, it restarts the Docker container before each query to eliminate internal caching. It provides detailed timing metrics including compilation time, execution time, and total elapsed time.
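+
+For example, a single cold (cache-free) query measurement can be sketched as follows. This is an illustrative sketch only, not the tool's actual code: `clear_cache` stands in for the warehouse suspend/resume (Snowflake) or container restart (Embucket) step, and `cursor` is a DB-API cursor from either system.
+
+```python
+import time
+
+def run_query_cold(cursor, name, sql, clear_cache=None):
+    if clear_cache:
+        clear_cache()  # suspend/resume the warehouse, or restart the container
+    start = time.perf_counter()
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    return {"query": name, "rows": len(rows),
+            "total_elapsed_s": time.perf_counter() - start}
+```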
## TPC Legal Considerations
@@ -14,9 +14,12 @@ Throughout this document and when talking about these benchmarks, you will see t
 ## Features
 
+- **Multiple Benchmark Types**: Supports the TPC-H and ClickBench benchmark suites, with TPC-DS scaffolded for future support
 - **Cache Isolation**:
   - **Snowflake**: Suspends and resumes warehouse before each query
   - **Embucket**: Restarts Docker container before each query to clear internal cache
+- **Flexible Caching Options**: Can run with or without cache clearing (`--no-cache` flag)
+- **Command Line Interface**: Full CLI support for system selection, benchmark type, and run configuration
 - **Result Cache Disabled**: Ensures no result caching affects benchmark results
 - **Comprehensive Metrics**: Tracks compilation time, execution time, and row counts
 - **CSV Export**: Saves results to CSV files for further analysis
@@ -51,37 +54,79 @@ SNOWFLAKE_WAREHOUSE=your_warehouse
 
 **For Embucket (when using infrastructure):**
 ```bash
-EMBUCKET_SQL_HOST=your_ec2_instance_ip
-EMBUCKET_SQL_PORT=3000
-EMBUCKET_SQL_PROTOCOL=http
+EMBUCKET_HOST=your_ec2_instance_ip
+EMBUCKET_PORT=3000
+EMBUCKET_PROTOCOL=http
 EMBUCKET_USER=embucket
 EMBUCKET_PASSWORD=embucket
 EMBUCKET_ACCOUNT=embucket
-EMBUCKET_DATABASE=embucket
-EMBUCKET_SCHEMA=public
+EMBUCKET_DATABASE=benchmark_database
+EMBUCKET_SCHEMA=benchmark_schema
 EMBUCKET_INSTANCE=your_instance_name
-EMBUCKET_DATASET=your_dataset_name
 SSH_KEY_PATH=~/.ssh/id_rsa
 ```
 
+**Benchmark Configuration:**
+```bash
+BENCHMARK_TYPE=tpch # Options: tpch, clickbench, tpcds
+DATASET_S3_BUCKET=embucket-testdata
+DATASET_PATH=tpch/01 # Path within S3 bucket
+SNOWFLAKE_WAREHOUSE_SIZE=XSMALL
+AWS_ACCESS_KEY_ID=your_aws_access_key_id
+AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key
+```
+
 ## Usage
 
-Run the benchmark:
+### Command Line Interface
+
+The benchmark supports comprehensive command-line options:
+
 ```bash
+# Run both Snowflake and Embucket with TPC-H (default)
 python benchmark.py
+
+# Run only Embucket with TPC-H
+python benchmark.py --system embucket
+
+# Run only Snowflake with TPC-H
+python benchmark.py --system snowflake
+
+# Run ClickBench on both systems
+python benchmark.py --benchmark-type clickbench
+
+# Run TPC-DS on Embucket only (currently raises NotImplementedError)
+python benchmark.py --system embucket --benchmark-type tpcds
+
+# Caching is enabled by default (no container restarts or warehouse suspends);
+# pass --no-cache to force cache clearing
+python benchmark.py --system embucket --no-cache
+
+# Custom number of runs and dataset path
+python benchmark.py --runs 5 --dataset-path tpch/100
 ```
 
-**Current Behavior**: By default, the benchmark runs **only Embucket** benchmarks for 3 iterations. To run both Snowflake and Embucket with comparisons, you need to modify the `__main__` section in `benchmark.py` to call `run_benchmark(i + 1)` instead of `run_embucket_benchmark(i + 1)`.
+### Command Line Arguments
+
+- `--system`: Choose platform (`snowflake`, `embucket`, `both`) - default: `both`
+- `--runs`: Number of benchmark runs - default: `3`
+- `--benchmark-type`: Benchmark suite (`tpch`, `clickbench`, `tpcds`) - default: `tpch`
+- `--dataset-path`: Override the `DATASET_PATH` environment variable
+- `--no-cache`: Force cache clearing (warehouse suspend for Snowflake, container restart for Embucket)
+
+### Benchmark Process
 
 The benchmark will:
-1. Connect to the configured platform (Embucket by default, or both if modified)
-2.
Execute each query derived from TPC-H with cache-clearing operations: - - **Snowflake**: Warehouse suspend/resume before each query - - **Embucket**: Docker container restart before each query +1. Connect to the configured platform(s) +2. Execute each query from the selected benchmark suite with cache-clearing operations: + - **Snowflake**: Warehouse suspend/resume before each query (if `--no-cache`) + - **Embucket**: Docker container restart before each query (if `--no-cache`) 3. Collect performance metrics from query history 4. Display results and comparisons (if both platforms are run) 5. Save detailed results to CSV files -6. Calculate averages after 3 runs are completed +6. Calculate averages after all runs are completed ## Embucket Container Restart Functionality @@ -95,8 +140,8 @@ For Embucket benchmarks, the system automatically restarts the Docker container - Creates a fresh database connection and executes the query **Requirements:** -- `EMBUCKET_SQL_HOST` set to your EC2 instance IP -- `EMBUCKET_INSTANCE` and `EMBUCKET_DATASET` for result organization +- `EMBUCKET_HOST` set to your EC2 instance IP +- `EMBUCKET_INSTANCE` for result organization - `SSH_KEY_PATH` pointing to your private key (default: `~/.ssh/id_rsa`) - SSH access to the EC2 instance running Embucket @@ -115,16 +160,19 @@ The benchmark provides: - **Total Times**: Aggregated compilation and execution times **File Organization:** -- Snowflake results: `snowflake_tpch_results/{schema}/{warehouse}/` -- Embucket results: `embucket_tpch_results/{dataset}/{instance}/` +- Snowflake results: `snowflake_{benchmark_type}_results/{schema}/{warehouse}/` +- Embucket results: `embucket_{benchmark_type}_results/{dataset}/{instance}/` + +Where `{benchmark_type}` is one of: `tpch`, `clickbench`, or `tpcds` ## Files - `benchmark.py` - Main benchmark script with restart functionality - `docker_manager.py` - Docker container management for Embucket restarts - `utils.py` - Connection utilities for Snowflake and Embucket -- `tpch_queries.py` - Query definitions derived from TPC-H -- `tpcds_queries.py` - Query definitions derived from TPC-DS (for future use) +- `tpch/` - TPC-H benchmark utilities package (queries, DDL, table names) +- `clickbench/` - ClickBench benchmark utilities package (queries, DDL, table names) +- `tpcds/` - TPC-DS benchmark utilities package (queries, DDL, table names) - `calculate_average.py` - Result averaging and analysis - `config.py` - Configuration utilities - `data_preparation.py` - Data preparation utilities @@ -132,20 +180,27 @@ The benchmark provides: - `env_example` - Example environment configuration file - `infrastructure/` - Terraform infrastructure for EC2/Embucket deployment - `tpch-datagen/` - TPC-H data generation infrastructure -- `tpch/` - TPC-H benchmark utilities package (queries, DDL, table names) -- `tpcds_ddl/` - TPC-DS table definitions for Embucket -## Customizing Benchmark Behavior +## Benchmark Types -**Default**: The benchmark runs only Embucket tests for 3 iterations. +### TPC-H (Default) +Derived from the TPC-H decision support benchmark. Includes 22 complex analytical queries testing various aspects of data warehousing performance. -**To run both Snowflake and Embucket with comparisons**: Modify the `__main__` section in `benchmark.py`: -```python -if __name__ == "__main__": - for i in range(3): - print(f"Run {i + 1} of 3") - run_benchmark(i + 1) # Change from run_embucket_benchmark(i + 1) -``` +### ClickBench +Single-table analytical benchmark focusing on aggregation performance. 
Uses the `hits` table with web analytics data.
+
+### TPC-DS
+Derived from the TPC-DS decision support benchmark. More complex than TPC-H, with 99 queries testing advanced analytical scenarios. Not yet implemented in this tool: selecting it currently raises `NotImplementedError` in `benchmark.py`.
+
+## Environment Variables
+
+The benchmark behavior can be controlled through environment variables in your `.env` file:
+
+- `BENCHMARK_TYPE`: Default benchmark type (`tpch`, `clickbench`, `tpcds`)
+- `DATASET_PATH`: Path within the S3 bucket for the dataset location
+- `DATASET_S3_BUCKET`: S3 bucket containing benchmark datasets
+- `EMBUCKET_HOST`: EC2 instance IP for the Embucket connection
+- `SSH_KEY_PATH`: Path to the SSH private key for container restarts
 
 ## Requirements
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 6926e4f00..a1f39435f 100644
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -7,6 +7,7 @@ from utils import create_snowflake_connection
 from utils import create_embucket_connection
 from tpch import parametrize_tpch_queries
+from clickbench import parametrize_clickbench_queries
 from docker_manager import create_docker_manager
 from constants import SystemType
 
@@ -286,6 +287,8 @@ def get_queries_for_benchmark(benchmark_type: str, for_embucket: bool) -> List[T
     """Get appropriate queries based on the benchmark type."""
     if benchmark_type == "tpch":
         return parametrize_tpch_queries(fully_qualified_names_for_embucket=for_embucket)
+    elif benchmark_type == "clickbench":
+        return parametrize_clickbench_queries(fully_qualified_names_for_embucket=for_embucket)
     elif benchmark_type == "tpcds":
         raise NotImplementedError("TPC-DS benchmarks not yet implemented")
     else:
@@ -433,7 +436,7 @@ def parse_args():
     parser = argparse.ArgumentParser(description="Run benchmarks on Snowflake and/or Embucket")
     parser.add_argument("--system", choices=["snowflake", "embucket", "both"], default="both")
     parser.add_argument("--runs", type=int, default=3)
-    parser.add_argument("--benchmark-type", choices=["tpch", "tpcds"], default=os.environ.get("BENCHMARK_TYPE", "tpch"))
+    parser.add_argument("--benchmark-type", choices=["tpch", "clickbench", "tpcds"], default=os.environ.get("BENCHMARK_TYPE", "tpch"))
     parser.add_argument("--dataset-path", help="Override the DATASET_PATH environment variable")
     parser.add_argument("--no-cache", action="store_true", help="Disable caching (force warehouse suspend and USE_CACHED_RESULT=False for Snowflake, force container restart for Embucket)")
     return parser.parse_args()
diff --git a/benchmark/clickbench/__init__.py b/benchmark/clickbench/__init__.py
new file mode 100644
index 000000000..f62b41e5a
--- /dev/null
+++ b/benchmark/clickbench/__init__.py
@@ -0,0 +1,45 @@
+"""
+ClickBench benchmark utilities package.
+
+This package contains all ClickBench related functionality including:
+- Table name configuration and parametrization
+- Query definitions with parametrized table names
+- DDL statements with parametrized table names
+
+Main exports:
+- parametrize_clickbench_queries: Parametrize ClickBench queries (requires explicit parameter)
+- parametrize_clickbench_ddl: Parametrize ClickBench DDL statements (requires explicit parameter)
+- CLICKBENCH_TABLE_NAMES: Raw table name mappings
+- get_table_names: Get parametrized table names (requires explicit parameter)
+- parametrize_clickbench_statements: Generic parametrization function (requires explicit parameter)
+
+Note: All functions require an explicit fully_qualified_names_for_embucket parameter.
+No pre-computed constants are provided to enforce explicit parameter usage.
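+
+Example (illustrative usage; with the flag set to False the bare table name
+``hits`` is used, with True the EMBUCKET_DATABASE/EMBUCKET_SCHEMA prefix is added):
+
+    >>> from clickbench import parametrize_clickbench_queries
+    >>> queries = parametrize_clickbench_queries(fully_qualified_names_for_embucket=False)
+    >>> len(queries)
+    43
+    >>> queries[0][0]
+    'clickbench-q0'
+    >>> 'FROM hits' in queries[0][1]
+    True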
+""" + +from .clickbench_table_names import ( + CLICKBENCH_TABLE_NAMES, + get_table_names, + parametrize_clickbench_statements +) + +from .clickbench_queries import ( + parametrize_clickbench_queries, +) + +from .clickbench_ddl import ( + parametrize_clickbench_ddl, +) + +__all__ = [ + # Table names and core functions + 'CLICKBENCH_TABLE_NAMES', + 'get_table_names', + 'parametrize_clickbench_statements', + + # Query functions + 'parametrize_clickbench_queries', + + # DDL functions + 'parametrize_clickbench_ddl', +] \ No newline at end of file diff --git a/benchmark/clickbench/clickbench_ddl.py b/benchmark/clickbench/clickbench_ddl.py new file mode 100644 index 000000000..8125f909a --- /dev/null +++ b/benchmark/clickbench/clickbench_ddl.py @@ -0,0 +1,134 @@ +import os + +from .clickbench_table_names import parametrize_clickbench_statements + +# ClickBench DDL statement with parametrized table name +_CLICKBENCH_DDL_RAW = [ + ( + "hits", + """ + -- Snowflake-like DDL for ClickBench hits table + CREATE OR REPLACE TABLE {HITS_TABLE} ( + WatchID BIGINT, + JavaEnable SMALLINT, + Title VARCHAR, + GoodEvent SMALLINT, + EventTime BIGINT, + EventDate SMALLINT, + CounterID INTEGER, + ClientIP INTEGER, + RegionID INTEGER, + UserID BIGINT, + CounterClass SMALLINT, + OS SMALLINT, + UserAgent SMALLINT, + URL VARCHAR, + Referer VARCHAR, + IsRefresh SMALLINT, + RefererCategoryID SMALLINT, + RefererRegionID INTEGER, + URLCategoryID SMALLINT, + URLRegionID INTEGER, + ResolutionWidth SMALLINT, + ResolutionHeight SMALLINT, + ResolutionDepth SMALLINT, + FlashMajor SMALLINT, + FlashMinor SMALLINT, + FlashMinor2 VARCHAR, + NetMajor SMALLINT, + NetMinor SMALLINT, + UserAgentMajor SMALLINT, + UserAgentMinor VARCHAR, + CookieEnable SMALLINT, + JavascriptEnable SMALLINT, + IsMobile SMALLINT, + MobilePhone SMALLINT, + MobilePhoneModel VARCHAR, + Params VARCHAR, + IPNetworkID INTEGER, + TraficSourceID SMALLINT, + SearchEngineID SMALLINT, + SearchPhrase VARCHAR, + AdvEngineID SMALLINT, + IsArtifical SMALLINT, + WindowClientWidth SMALLINT, + WindowClientHeight SMALLINT, + ClientTimeZone SMALLINT, + ClientEventTime BIGINT, + SilverlightVersion1 SMALLINT, + SilverlightVersion2 SMALLINT, + SilverlightVersion3 INTEGER, + SilverlightVersion4 SMALLINT, + PageCharset VARCHAR, + CodeVersion INTEGER, + IsLink SMALLINT, + IsDownload SMALLINT, + IsNotBounce SMALLINT, + FUniqID BIGINT, + OriginalURL VARCHAR, + HID INTEGER, + IsOldCounter SMALLINT, + IsEvent SMALLINT, + IsParameter SMALLINT, + DontCountHits SMALLINT, + WithHash SMALLINT, + HitColor VARCHAR, + LocalEventTime BIGINT, + Age SMALLINT, + Sex SMALLINT, + Income SMALLINT, + Interests SMALLINT, + Robotness SMALLINT, + RemoteIP INTEGER, + WindowName INTEGER, + OpenerName INTEGER, + HistoryLength SMALLINT, + BrowserLanguage VARCHAR, + BrowserCountry VARCHAR, + SocialNetwork VARCHAR, + SocialAction VARCHAR, + HTTPError SMALLINT, + SendTiming INTEGER, + DNSTiming INTEGER, + ConnectTiming INTEGER, + ResponseStartTiming INTEGER, + ResponseEndTiming INTEGER, + FetchTiming INTEGER, + SocialSourceNetworkID SMALLINT, + SocialSourcePage VARCHAR, + ParamPrice BIGINT, + ParamOrderID VARCHAR, + ParamCurrency VARCHAR, + ParamCurrencyID SMALLINT, + OpenstatServiceName VARCHAR, + OpenstatCampaignID VARCHAR, + OpenstatAdID VARCHAR, + OpenstatSourceID VARCHAR, + UTMSource VARCHAR, + UTMMedium VARCHAR, + UTMCampaign VARCHAR, + UTMContent VARCHAR, + UTMTerm VARCHAR, + FromTag VARCHAR, + HasGCLID SMALLINT, + RefererHash BIGINT, + URLHash BIGINT, + CLID INTEGER + ); + """ + ), +] + + +def 
parametrize_clickbench_ddl(fully_qualified_names_for_embucket): + """ + Replace table name placeholders in ClickBench DDL statements with actual table names. + + Args: + fully_qualified_names_for_embucket (bool): Required. If True, use EMBUCKET_DATABASE.EMBUCKET_SCHEMA.tablename format. + If False, use just the default table names. + + Returns: + list: A list of (table_name, parametrized_ddl) tuples. + """ + return parametrize_clickbench_statements(_CLICKBENCH_DDL_RAW, fully_qualified_names_for_embucket) diff --git a/benchmark/clickbench/clickbench_queries.py b/benchmark/clickbench/clickbench_queries.py new file mode 100644 index 000000000..0d6460022 --- /dev/null +++ b/benchmark/clickbench/clickbench_queries.py @@ -0,0 +1,328 @@ +import os +import re + +from .clickbench_table_names import parametrize_clickbench_statements + +# Original ClickBench queries with parametrized table names +_CLICKBENCH_QUERIES_RAW = [ + ( + "clickbench-q0", + """ + SELECT COUNT(*) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q1", + """ + SELECT COUNT(*) FROM {HITS_TABLE} WHERE AdvEngineID <> 0; + """ + ), + ( + "clickbench-q2", + """ + SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q3", + """ + SELECT AVG(UserID) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q4", + """ + SELECT COUNT(DISTINCT UserID) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q5", + """ + SELECT COUNT(DISTINCT SearchPhrase) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q6", + """ + SELECT MIN(EventDate), MAX(EventDate) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q7", + """ + SELECT AdvEngineID, COUNT(*) FROM {HITS_TABLE} WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; + """ + ), + ( + "clickbench-q8", + """ + SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM {HITS_TABLE} GROUP BY RegionID ORDER BY u DESC LIMIT 10; + """ + ), + ( + "clickbench-q9", + """ + SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM {HITS_TABLE} GROUP BY RegionID ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q10", + """ + SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM {HITS_TABLE} WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; + """ + ), + ( + "clickbench-q11", + """ + SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM {HITS_TABLE} WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; + """ + ), + ( + "clickbench-q12", + """ + SELECT SearchPhrase, COUNT(*) AS c FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q13", + """ + SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; + """ + ), + ( + "clickbench-q14", + """ + SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q15", + """ + SELECT UserID, COUNT(*) FROM {HITS_TABLE} GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; + """ + ), + ( + "clickbench-q16", + """ + SELECT UserID, SearchPhrase, COUNT(*) FROM {HITS_TABLE} GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; + """ + ), + ( + "clickbench-q17", + """ + SELECT UserID, SearchPhrase, COUNT(*) FROM {HITS_TABLE} GROUP BY UserID, SearchPhrase LIMIT 10; + """ + ), + ( + "clickbench-q18", + """ + SELECT + UserID, + 
minute(to_timestamp(from_unixtime(EventTime))) AS m, + SearchPhrase, + COUNT(*) AS cnt + FROM {HITS_TABLE} + GROUP BY UserID, m, SearchPhrase + ORDER BY cnt DESC + LIMIT 10; + """ + ), + ( + "clickbench-q19", + """ + SELECT UserID FROM {HITS_TABLE} WHERE UserID = 435090932899640449; + """ + ), + ( + "clickbench-q20", + """ + SELECT COUNT(*) FROM {HITS_TABLE} WHERE URL LIKE '%google%'; + """ + ), + ( + "clickbench-q21", + """ + SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM {HITS_TABLE} WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q22", + """ + SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM {HITS_TABLE} WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q23", + """ + SELECT * FROM {HITS_TABLE} WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; + """ + ), + ( + "clickbench-q24", + """ + SELECT SearchPhrase FROM {HITS_TABLE} WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; + """ + ), + ( + "clickbench-q25", + """ + SELECT SearchPhrase FROM {HITS_TABLE} WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; + """ + ), + ( + "clickbench-q26", + """ + SELECT SearchPhrase FROM {HITS_TABLE} WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; + """ + ), + ( + "clickbench-q27", + """ + SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM {HITS_TABLE} WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; + """ + ), + ( + "clickbench-q28", + """ + SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM {HITS_TABLE} WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; + """ + ), + ( + "clickbench-q29", + """ + SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), 
SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q30", + """ + SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q31", + """ + SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q32", + """ + SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM {HITS_TABLE} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q33", + """ + SELECT URL, COUNT(*) AS c FROM {HITS_TABLE} GROUP BY URL ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q34", + """ + SELECT 1, URL, COUNT(*) AS c FROM {HITS_TABLE} GROUP BY 1, URL ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q35", + """ + SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM {HITS_TABLE} GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q36", + """ + SELECT URL, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; + """ + ), + ( + "clickbench-q37", + """ + SELECT Title, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; + """ + ), + ( + "clickbench-q38", + """ + SELECT URL, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; + """ + ), + ( + "clickbench-q39", + """ + SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; + """ + ), + ( + "clickbench-q40", + """ + SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY 
PageViews DESC LIMIT 10 OFFSET 100; + """ + ), + ( + "clickbench-q41", + """ + SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; + """ + ), + ( + "clickbench-q42", + """ + WITH ts AS ( + SELECT + -- minute bucket + DATE_TRUNC('minute', + CAST( + FROM_UNIXTIME( + CAST( + CASE + WHEN EventTime > 20000000000 THEN FLOOR(EventTime / 1000) -- ms → s + ELSE EventTime -- seconds + END AS BIGINT + ) + ) AS TIMESTAMP + ) + ) AS m, + -- day for filtering + CAST( + DATE_TRUNC('day', + CAST( + FROM_UNIXTIME( + CAST( + CASE + WHEN EventTime > 20000000000 THEN FLOOR(EventTime / 1000) + ELSE EventTime + END AS BIGINT + ) + ) AS TIMESTAMP + ) + ) AS DATE + ) AS d + FROM {HITS_TABLE} + WHERE CounterID = 62 + AND IsRefresh = 0 + AND DontCountHits = 0 + ) + SELECT m, COUNT(*) AS PageViews + FROM ts + WHERE d BETWEEN DATE '2013-07-14' AND DATE '2013-07-15' + GROUP BY m + ORDER BY m + LIMIT 10 OFFSET 1000; + """ + ), +] + + +def parametrize_clickbench_queries(fully_qualified_names_for_embucket): + """ + Replace table name placeholders in ClickBench queries with actual table names. + + Args: + fully_qualified_names_for_embucket (bool): Required. If True, use EMBUCKET_DATABASE.EMBUCKET_SCHEMA.tablename format. + If False, use just the default table names. + + Returns: + list: A list of (query_name, parametrized_query) tuples. + """ + return parametrize_clickbench_statements(_CLICKBENCH_QUERIES_RAW, fully_qualified_names_for_embucket) diff --git a/benchmark/clickbench/clickbench_table_names.py b/benchmark/clickbench/clickbench_table_names.py new file mode 100644 index 000000000..3c1a2b9b2 --- /dev/null +++ b/benchmark/clickbench/clickbench_table_names.py @@ -0,0 +1,62 @@ +""" +ClickBench table names configuration. + +This module defines the single ClickBench table name and its +corresponding placeholder name used for parametrization. +""" + +# The single ClickBench table name with its parametrization placeholder +CLICKBENCH_TABLE_NAMES = { + 'HITS_TABLE': 'hits' +} + +def get_table_names(fully_qualified_names_for_embucket): + """ + Get table names dictionary with optional fully qualified naming. + + Args: + fully_qualified_names_for_embucket (bool): Required. If True, use EMBUCKET_DATABASE.EMBUCKET_SCHEMA.tablename format. + If False, use just the default table names. + + Returns: + dict: Dictionary mapping placeholder names to actual table names. + """ + import os + + table_names = CLICKBENCH_TABLE_NAMES.copy() + + if fully_qualified_names_for_embucket: + # Get database and schema from environment variables + database = os.environ['EMBUCKET_DATABASE'] + schema = os.environ['EMBUCKET_SCHEMA'] + + # Create fully qualified table names + for key, table_name in table_names.items(): + table_names[key] = f"{database}.{schema}.{table_name}" + + return table_names + + +def parametrize_clickbench_statements(statements_raw, fully_qualified_names_for_embucket): + """ + Generic function to parametrize ClickBench statements (queries or DDL) with table names. + + Args: + statements_raw (list): List of (name, statement_sql) tuples with placeholder table names. + fully_qualified_names_for_embucket (bool): Required. If True, use EMBUCKET_DATABASE.EMBUCKET_SCHEMA.tablename format. + If False, use just the default table names. 
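+
+    Example (illustrative; with ``False`` the bare table name is used):
+        >>> parametrize_clickbench_statements(
+        ...     [("q", "SELECT 1 FROM {HITS_TABLE}")], False)
+        [('q', 'SELECT 1 FROM hits')]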
+ + Returns: + list: A list of (name, parametrized_statement) tuples. + """ + # Get table names with appropriate formatting + table_names = get_table_names(fully_qualified_names_for_embucket) + + parametrized_statements = [] + + for name, statement_sql in statements_raw: + # Replace table name placeholders + parametrized_sql = statement_sql.format(**table_names) + parametrized_statements.append((name, parametrized_sql)) + + return parametrized_statements diff --git a/benchmark/data_preparation.py b/benchmark/data_preparation.py index b2cd9203c..7691846c4 100644 --- a/benchmark/data_preparation.py +++ b/benchmark/data_preparation.py @@ -1,31 +1,51 @@ import os import argparse from utils import create_embucket_connection, create_snowflake_connection -from tpch import parametrize_tpch_ddl, get_table_names +from tpch import parametrize_tpch_ddl, get_table_names as get_tpch_table_names +from clickbench import parametrize_clickbench_ddl, get_table_names as get_clickbench_table_names from dotenv import load_dotenv from constants import SystemType load_dotenv() -def create_tables(cursor, system): - """Create tables using the consolidated TPC-H DDL statements.""" - print(f"Creating tables for {system}...") - # Get DDL statements with fully qualified/unqualified names for Embucket/Snowflake - if system == SystemType.EMBUCKET: - tpch_ddl = parametrize_tpch_ddl(fully_qualified_names_for_embucket=True) - elif system == SystemType.SNOWFLAKE: - tpch_ddl = parametrize_tpch_ddl(fully_qualified_names_for_embucket=False) +def create_tables(cursor, system, benchmark_type): + """Create tables using the appropriate DDL statements based on benchmark type.""" + print(f"Creating tables for {system} ({benchmark_type})...") + + # Get DDL statements based on benchmark type and system + if benchmark_type == "tpch": + if system == SystemType.EMBUCKET: + ddl_statements = parametrize_tpch_ddl(fully_qualified_names_for_embucket=True) + elif system == SystemType.SNOWFLAKE: + ddl_statements = parametrize_tpch_ddl(fully_qualified_names_for_embucket=False) + else: + raise ValueError("Unsupported system") + elif benchmark_type == "clickbench": + if system == SystemType.EMBUCKET: + ddl_statements = parametrize_clickbench_ddl(fully_qualified_names_for_embucket=True) + elif system == SystemType.SNOWFLAKE: + ddl_statements = parametrize_clickbench_ddl(fully_qualified_names_for_embucket=False) + else: + raise ValueError("Unsupported system") else: - raise ValueError("Unsupported system") - for table_name, ddl_sql in tpch_ddl: + raise ValueError(f"Unsupported benchmark type: {benchmark_type}") + + for table_name, ddl_sql in ddl_statements: print(f"Creating table: {table_name}") cursor.execute(ddl_sql.strip()) -def upload_parquet_to_snowflake_tables(cursor, dataset_path): +def upload_parquet_to_snowflake_tables(cursor, dataset_path, benchmark_type): """Upload parquet files to Snowflake tables from S3 stage.""" - table_names = get_table_names(fully_qualified_names_for_embucket=False) + # Get table names based on benchmark type + if benchmark_type == "tpch": + table_names = get_tpch_table_names(fully_qualified_names_for_embucket=False) + elif benchmark_type == "clickbench": + table_names = get_clickbench_table_names(fully_qualified_names_for_embucket=False) + else: + raise ValueError(f"Unsupported benchmark type: {benchmark_type}") + for table_name in table_names.values(): print(f"Loading data into Snowflake table {table_name}...") s3_path = f"s3://embucket-testdata/{dataset_path}/{table_name}.parquet" @@ -43,10 +63,15 @@ def 
upload_parquet_to_snowflake_tables(cursor, dataset_path): -def upload_parquet_to_embucket_tables(cursor, dataset_path): +def upload_parquet_to_embucket_tables(cursor, dataset_path, benchmark_type): """Upload parquet files to Embucket tables using COPY INTO.""" - # Get fully qualified table names using the unified logic - table_names = get_table_names(fully_qualified_names_for_embucket=True) + # Get fully qualified table names based on benchmark type + if benchmark_type == "tpch": + table_names = get_tpch_table_names(fully_qualified_names_for_embucket=True) + elif benchmark_type == "clickbench": + table_names = get_clickbench_table_names(fully_qualified_names_for_embucket=True) + else: + raise ValueError(f"Unsupported benchmark type: {benchmark_type}") for placeholder, qualified_table_name in table_names.items(): # Extract bare table name for the S3 path (parquet files use bare names) @@ -57,49 +82,55 @@ def upload_parquet_to_embucket_tables(cursor, dataset_path): cursor.execute(copy_sql) -def prepare_data_for_embucket(dataset_path): +def prepare_data_for_embucket(dataset_path, benchmark_type): """Prepare data for Embucket: generate data, create tables, and load data.""" # Connect to Embucket cursor = create_embucket_connection().cursor() # Create tables - create_tables(cursor, SystemType.EMBUCKET) + create_tables(cursor, SystemType.EMBUCKET, benchmark_type) # Load data into Embucket tables - upload_parquet_to_embucket_tables(cursor, dataset_path) + upload_parquet_to_embucket_tables(cursor, dataset_path, benchmark_type) cursor.close() - print("Embucket data preparation completed successfully.") + print(f"Embucket data preparation completed successfully for {benchmark_type}.") -def prepare_data_for_snowflake(dataset_path): +def prepare_data_for_snowflake(dataset_path, benchmark_type): """Prepare data, create tables, and load data for Snowflake""" # Connect to Snowflake cursor = create_snowflake_connection().cursor() # Create tables - create_tables(cursor, SystemType.SNOWFLAKE) + create_tables(cursor, SystemType.SNOWFLAKE, benchmark_type) # Load data into Snowflake tables - upload_parquet_to_snowflake_tables(cursor, dataset_path) + upload_parquet_to_snowflake_tables(cursor, dataset_path, benchmark_type) cursor.close() - print("Snowflake data preparation completed successfully.") + print(f"Snowflake data preparation completed successfully for {benchmark_type}.") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Prepare data for Embucket/Snowflake benchmarks") parser.add_argument("--system", type=str, choices=["embucket", "snowflake", "both"], default="both", help="Which system to prepare data for") + parser.add_argument("--benchmark-type", type=str, choices=["tpch", "clickbench"], + default=os.environ.get("BENCHMARK_TYPE", "tpch"), + help="Benchmark type (default: from env or 'tpch')") parser.add_argument("--dataset-path", type=str, default=os.environ.get("DATASET_PATH", "tpch/1"), help="Dataset path in format 'dataset/scale' (default: from env or 'tpch/1')") args = parser.parse_args() - # Override environment variable if specified in args + # Override environment variables if specified in args if args.dataset_path: os.environ["DATASET_PATH"] = args.dataset_path + if args.benchmark_type != os.environ.get("BENCHMARK_TYPE", "tpch"): + os.environ["BENCHMARK_TYPE"] = args.benchmark_type - print(f"Preparing data for dataset path: {args.dataset_path}") + print(f"Preparing data for benchmark type: {args.benchmark_type}") + print(f"Dataset path: {args.dataset_path}") if 
args.system.lower() in ["embucket", "both"]: - prepare_data_for_embucket(args.dataset_path) + prepare_data_for_embucket(args.dataset_path, args.benchmark_type) if args.system.lower() in ["snowflake", "both"]: - prepare_data_for_snowflake(args.dataset_path) + prepare_data_for_snowflake(args.dataset_path, args.benchmark_type) diff --git a/benchmark/infrastructure/README.md b/benchmark/infrastructure/README.md index b65d1bd2e..1db00519f 100644 --- a/benchmark/infrastructure/README.md +++ b/benchmark/infrastructure/README.md @@ -29,6 +29,8 @@ All resources share the same 8-character random suffix (e.g., `a1b2c3d4`), makin **Required variables to set:** - `benchmark_s3_user_key_id` - Your AWS Access Key ID - `benchmark_s3_user_access_key` - Your AWS Secret Access Key + - `github_username` - Your GitHub username + - `github_token` - GitHub Personal Access Token with 'read:packages' permission - `private_key_path` / `public_key_path` - SSH key paths 3. **Deploy Infrastructure** @@ -126,6 +128,28 @@ Your AWS user needs a policy that allows access to S3 buckets with the `embucket This policy allows multiple team members to use the same credentials while each deployment creates its own unique bucket. +### GitHub Container Registry Authentication + +The infrastructure automatically authenticates with GitHub Container Registry (ghcr.io) to pull the Embucket Docker image. This requires: + +1. **GitHub Personal Access Token**: Create a token with `read:packages` permission + - Go to GitHub Settings → Developer settings → Personal access tokens → Tokens (classic) + - Generate new token with `read:packages` scope + - Copy the token value + +2. **Configuration**: Add your GitHub credentials to `terraform.tfvars`: + ```hcl + github_username = "your-github-username" + github_token = "ghp_your_personal_access_token" + ``` + +3. **Automatic Authentication**: During deployment, the bootstrap script will: + - Extract GitHub credentials from the `.env` file + - Authenticate with `ghcr.io` using `docker login` + - Pull the `ghcr.io/embucket/embucket:experimental` image + +**Note**: If GitHub credentials are not provided, the deployment will skip ghcr.io authentication and may fail when trying to pull the Embucket image. + ### Team Collaboration - **Shared Credentials**: All team members can use the same AWS user credentials @@ -176,9 +200,34 @@ Run validation manually: Once deployed, you can: 1. SSH to the instance: `$(terraform output -raw ssh_command)` 2. Run custom benchmarks against the Embucket API -3. Monitor performance using AWS CloudWatch +3. Monitor performance using AWS CloudWatch or the included monitoring script 4. Scale instance type up/down as needed +### System Monitoring + +The infrastructure includes a monitoring script to track system and Embucket container performance: + +```bash +# Real-time monitoring (default) +./monitor_ram.sh + +# Monitor every 5 seconds +./monitor_ram.sh -i 5 + +# Monitor for 60 seconds then exit +./monitor_ram.sh -t 60 + +# Single snapshot +./monitor_ram.sh -o +``` + +The monitoring script displays: +- **System RAM**: Memory usage percentage and absolute values +- **System CPU**: CPU usage percentage across all cores +- **Embucket Container**: Container-specific RAM and CPU usage + +This is particularly useful during benchmarking to ensure the system has adequate resources and to identify performance bottlenecks. 
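+
+If you prefer sampling from Python (for example, to log resource usage alongside benchmark results), a rough single-snapshot equivalent of the script can be built on `docker stats`. This is an illustrative sketch, assuming the docker CLI is on PATH and the container name matches `docker-compose.yml`:
+
+```python
+import subprocess
+
+def embucket_snapshot(container="embucket-benchmark"):
+    """Return one memory/CPU sample for the Embucket container."""
+    out = subprocess.run(
+        ["docker", "stats", "--no-stream", "--format",
+         "{{.MemUsage}} {{.CPUPerc}}", container],
+        capture_output=True, text=True, check=True,
+    )
+    return out.stdout.strip()  # e.g. "1.2GiB / 15.6GiB 37.5%"
+```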
+ ## Cleanup To destroy all resources: @@ -195,6 +244,7 @@ terraform destroy - `user_data.sh` - EC2 initialization script - `bootstrap.sh` - Embucket installation and startup script - `validate.sh` - Deployment validation script +- `monitor_ram.sh` - System and Embucket container monitoring script - `docker-compose.yml` - Embucket container configuration - `env.tpl` - Environment template for Embucket configuration - `docker/db_init.py` - Database initialization script @@ -209,7 +259,9 @@ terraform destroy 3. **Database initialization failed**: Verify S3 credentials and permissions 4. **S3 access denied**: Ensure your AWS user has the required S3 policy (see above) 5. **Empty credentials**: Make sure you've set `benchmark_s3_user_key_id` and `benchmark_s3_user_access_key` in `terraform.tfvars` -6. **Performance issues**: Consider upgrading instance type +6. **GitHub authentication failed**: Verify `github_username` and `github_token` are set correctly +7. **Docker image pull failed**: Check GitHub token has `read:packages` permission +8. **Performance issues**: Consider upgrading instance type ### Apple Silicon Troubleshooting @@ -237,26 +289,36 @@ docker-compose ps docker-compose logs embucket docker-compose logs db-init +# Check GitHub authentication +docker images | grep ghcr.io +docker login ghcr.io # Test manual login + # Check system resources htop df -h # If credentials weren't provided during deployment, you can set them up manually: -# Edit the .env file with your AWS credentials and restart Embucket +# Edit the .env file with your AWS and GitHub credentials and restart Embucket nano .env docker-compose up -d ``` ### Manual Credential Setup -If you didn't provide AWS credentials in `terraform.tfvars`, you can set them up after deployment: +If you didn't provide AWS or GitHub credentials in `terraform.tfvars`, you can set them up after deployment: 1. SSH to the instance: `$(terraform output -raw ssh_command)` -2. Edit the `.env` file to add your AWS credentials: +2. Edit the `.env` file to add your credentials: ```bash nano .env # Add or update these lines: # AWS_ACCESS_KEY_ID=your-access-key-id # AWS_SECRET_ACCESS_KEY=your-secret-access-key + # GITHUB_USERNAME=your-github-username + # GITHUB_TOKEN=your-github-token + ``` +3. Authenticate with GitHub Container Registry: + ```bash + echo "$GITHUB_TOKEN" | docker login ghcr.io -u "$GITHUB_USERNAME" --password-stdin ``` -3. Start Embucket: `docker-compose up -d` +4. Start Embucket: `docker-compose up -d` diff --git a/benchmark/infrastructure/bootstrap.sh b/benchmark/infrastructure/bootstrap.sh index ecd09367b..278536384 100644 --- a/benchmark/infrastructure/bootstrap.sh +++ b/benchmark/infrastructure/bootstrap.sh @@ -62,6 +62,23 @@ fi # Verify credentials are in .env file and not empty if grep -q "AWS_ACCESS_KEY_ID=" .env && [ "$(grep AWS_ACCESS_KEY_ID= .env | cut -d= -f2)" != "" ]; then echo "✅ AWS credentials found in .env file" + + # Authenticate with GitHub Container Registry if credentials are provided + if grep -q "GITHUB_TOKEN=" .env && [ "$(grep GITHUB_TOKEN= .env | cut -d= -f2)" != "" ]; then + echo "🔐 Authenticating with GitHub Container Registry..." + GITHUB_TOKEN=$(grep GITHUB_TOKEN= .env | cut -d= -f2) + GITHUB_USERNAME=$(grep GITHUB_USERNAME= .env | cut -d= -f2) + echo "$GITHUB_TOKEN" | sudo -u ec2-user docker login ghcr.io -u "$GITHUB_USERNAME" --password-stdin + if [ $? 
-eq 0 ]; then + echo "✅ Successfully authenticated with ghcr.io" + else + echo "❌ Failed to authenticate with ghcr.io" + exit 1 + fi + else + echo "⚠️ No GitHub credentials found - skipping ghcr.io authentication" + fi + echo "=========================================" echo "Starting Embucket with automatic database initialization..." echo "Running: docker-compose up -d" @@ -102,9 +119,36 @@ for i in {1..30}; do break fi echo "Attempt $i/30: Waiting for Embucket API..." + + # Show container logs if API is not ready after several attempts + if [ $i -eq 5 ] || [ $i -eq 15 ] || [ $i -eq 25 ]; then + echo "=========================================" + echo "🔍 Container status and logs (attempt $i):" + sudo -u ec2-user docker-compose ps + echo "" + echo "📋 Embucket container logs (last 20 lines):" + sudo -u ec2-user docker-compose logs --tail=20 embucket + echo "=========================================" + fi + sleep 10 done +# If health check failed, show detailed logs +if ! curl -s http://localhost:3000/health > /dev/null 2>&1; then + echo "❌ Embucket API health check failed after 30 attempts" + echo "=========================================" + echo "🔍 Final container status:" + sudo -u ec2-user docker-compose ps + echo "" + echo "📋 Full Embucket container logs:" + sudo -u ec2-user docker-compose logs embucket + echo "" + echo "📋 Database init container logs:" + sudo -u ec2-user docker-compose logs db-init + echo "=========================================" +fi + # Check if database initialization was successful echo "Checking database initialization..." sleep 30 diff --git a/benchmark/infrastructure/docker-compose.yml b/benchmark/infrastructure/docker-compose.yml index 35c4dcc9f..253ed4616 100644 --- a/benchmark/infrastructure/docker-compose.yml +++ b/benchmark/infrastructure/docker-compose.yml @@ -1,6 +1,6 @@ services: embucket: - image: embucket/embucket:0.2.0 + image: ghcr.io/embucket/embucket:experimental container_name: embucket-benchmark environment: # Iceberg Catalog settings diff --git a/benchmark/infrastructure/env.tpl b/benchmark/infrastructure/env.tpl index c31af4db3..1b6be4600 100644 --- a/benchmark/infrastructure/env.tpl +++ b/benchmark/infrastructure/env.tpl @@ -14,3 +14,7 @@ SCHEMA_NAME=benchmark_schema VOLUME_NAME=benchmark_volume EMBUCKET_HOST=localhost EMBUCKET_PORT=3000 + +# GitHub Container Registry credentials +GITHUB_TOKEN=${github_token} +GITHUB_USERNAME=${github_username} diff --git a/benchmark/infrastructure/main.tf b/benchmark/infrastructure/main.tf index 5897b3b81..f1fe99203 100644 --- a/benchmark/infrastructure/main.tf +++ b/benchmark/infrastructure/main.tf @@ -166,6 +166,8 @@ resource "local_file" "env_file" { vite_api_url = "http://${aws_instance.embucket_benchmark.public_ip}:3000" instance_public_ip = aws_instance.embucket_benchmark.public_ip private_key_path = var.private_key_path + github_token = var.github_token + github_username = var.github_username }) filename = "${path.module}/.env" } diff --git a/benchmark/infrastructure/terraform.tfvars.example b/benchmark/infrastructure/terraform.tfvars.example index e46e194d8..9a4dd3de7 100644 --- a/benchmark/infrastructure/terraform.tfvars.example +++ b/benchmark/infrastructure/terraform.tfvars.example @@ -28,3 +28,8 @@ environment = "benchmark" # that have S3 access to the benchmark bucket benchmark_s3_user_key_id = "" # Your existing AWS user's access key ID benchmark_s3_user_access_key = "" # Your existing AWS user's secret access key + +# GitHub Container Registry Authentication +# Required for pulling from 
ghcr.io/embucket/embucket:experimental +github_username = "" # Your GitHub username +github_token = "" # GitHub Personal Access Token with 'read:packages' permission diff --git a/benchmark/infrastructure/variables.tf b/benchmark/infrastructure/variables.tf index 7a0d14900..aecea80f0 100644 --- a/benchmark/infrastructure/variables.tf +++ b/benchmark/infrastructure/variables.tf @@ -60,3 +60,16 @@ variable "aws_profile" { type = string default = null } + +variable "github_token" { + description = "GitHub Personal Access Token for pulling from ghcr.io" + type = string + default = "" + sensitive = true +} + +variable "github_username" { + description = "GitHub username for ghcr.io authentication" + type = string + default = "" +}