From 76c5609684d3eea2a29ed52b2aec390bc5c5a938 Mon Sep 17 00:00:00 2001
From: andheroe
Date: Sun, 28 Sep 2025 09:47:13 +0100
Subject: [PATCH] Add ClickBench benchmark + use Embucket experimental build

---
 benchmark/README.md                         | 117 +++++--
 benchmark/benchmark.py                      |   5 +-
 benchmark/clickbench/__init__.py            |  45 +++
 benchmark/clickbench/clickbench_ddl.py      | 134 +++++++
 benchmark/clickbench/clickbench_queries.py  | 328 ++++++++++++++++++
 .../clickbench/clickbench_table_names.py    |  62 ++++
 benchmark/data_preparation.py               |  87 +++--
 benchmark/infrastructure/README.md          |  74 +++-
 benchmark/infrastructure/bootstrap.sh       |  44 +++
 benchmark/infrastructure/docker-compose.yml |   2 +-
 benchmark/infrastructure/env.tpl            |   4 +
 benchmark/infrastructure/main.tf            |   2 +
 .../infrastructure/terraform.tfvars.example |   5 +
 benchmark/infrastructure/variables.tf       |  13 +
 14 files changed, 855 insertions(+), 67 deletions(-)
 create mode 100644 benchmark/clickbench/__init__.py
 create mode 100644 benchmark/clickbench/clickbench_ddl.py
 create mode 100644 benchmark/clickbench/clickbench_queries.py
 create mode 100644 benchmark/clickbench/clickbench_table_names.py

diff --git a/benchmark/README.md b/benchmark/README.md
index 142485c42..29873a66b 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -1,6 +1,6 @@
 ## Overview
 
-This benchmark tool executes queries derived from TPC-H against both Snowflake and Embucket with cache-clearing operations to ensure clean, cache-free performance measurements. For Snowflake, it uses warehouse suspend/resume operations. For Embucket, it restarts the Docker container before each query to eliminate internal caching. It provides detailed timing metrics including compilation time, execution time, and total elapsed time.
+This benchmark tool executes queries from multiple benchmark suites (TPC-H and ClickBench, with TPC-DS scaffolded but not yet implemented) against both Snowflake and Embucket, using cache-clearing operations to ensure cold, cache-free performance measurements. For Snowflake, it uses warehouse suspend/resume operations. For Embucket, it restarts the Docker container before each query to eliminate internal caching. It provides detailed timing metrics including compilation time, execution time, and total elapsed time.
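+
+For example, a single cold (cache-free) query measurement can be sketched as follows. This is an illustrative sketch only, not the tool's actual code: `clear_cache` stands in for the warehouse suspend/resume (Snowflake) or container restart (Embucket) step, and `cursor` is a DB-API cursor from either system.
+
+```python
+import time
+
+def run_query_cold(cursor, name, sql, clear_cache=None):
+    if clear_cache:
+        clear_cache()  # suspend/resume the warehouse, or restart the container
+    start = time.perf_counter()
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    return {"query": name, "rows": len(rows),
+            "total_elapsed_s": time.perf_counter() - start}
+```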
## TPC Legal Considerations
@@ -14,9 +14,12 @@ Throughout this document and when talking about these benchmarks, you will see t
 ## Features
 
+- **Multiple Benchmark Types**: Supports the TPC-H and ClickBench benchmark suites, with TPC-DS scaffolded for future support
 - **Cache Isolation**:
   - **Snowflake**: Suspends and resumes warehouse before each query
   - **Embucket**: Restarts Docker container before each query to clear internal cache
+- **Flexible Caching Options**: Can run with or without cache clearing (`--no-cache` flag)
+- **Command Line Interface**: Full CLI support for system selection, benchmark type, and run configuration
 - **Result Cache Disabled**: Ensures no result caching affects benchmark results
 - **Comprehensive Metrics**: Tracks compilation time, execution time, and row counts
 - **CSV Export**: Saves results to CSV files for further analysis
@@ -51,37 +54,79 @@ SNOWFLAKE_WAREHOUSE=your_warehouse
 
 **For Embucket (when using infrastructure):**
 ```bash
-EMBUCKET_SQL_HOST=your_ec2_instance_ip
-EMBUCKET_SQL_PORT=3000
-EMBUCKET_SQL_PROTOCOL=http
+EMBUCKET_HOST=your_ec2_instance_ip
+EMBUCKET_PORT=3000
+EMBUCKET_PROTOCOL=http
 EMBUCKET_USER=embucket
 EMBUCKET_PASSWORD=embucket
 EMBUCKET_ACCOUNT=embucket
-EMBUCKET_DATABASE=embucket
-EMBUCKET_SCHEMA=public
+EMBUCKET_DATABASE=benchmark_database
+EMBUCKET_SCHEMA=benchmark_schema
 EMBUCKET_INSTANCE=your_instance_name
-EMBUCKET_DATASET=your_dataset_name
 SSH_KEY_PATH=~/.ssh/id_rsa
 ```
 
+**Benchmark Configuration:**
+```bash
+BENCHMARK_TYPE=tpch # Options: tpch, clickbench, tpcds
+DATASET_S3_BUCKET=embucket-testdata
+DATASET_PATH=tpch/01 # Path within S3 bucket
+SNOWFLAKE_WAREHOUSE_SIZE=XSMALL
+AWS_ACCESS_KEY_ID=your_aws_access_key_id
+AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key
+```
+
 ## Usage
 
-Run the benchmark:
+### Command Line Interface
+
+The benchmark supports comprehensive command-line options:
+
 ```bash
+# Run both Snowflake and Embucket with TPC-H (default)
 python benchmark.py
+
+# Run only Embucket with TPC-H
+python benchmark.py --system embucket
+
+# Run only Snowflake with TPC-H
+python benchmark.py --system snowflake
+
+# Run ClickBench on both systems
+python benchmark.py --benchmark-type clickbench
+
+# Run TPC-DS on Embucket only (currently raises NotImplementedError)
+python benchmark.py --system embucket --benchmark-type tpcds
+
+# Caching is enabled by default (no container restarts or warehouse suspends);
+# pass --no-cache to force cache clearing
+python benchmark.py --system embucket --no-cache
+
+# Custom number of runs and dataset path
+python benchmark.py --runs 5 --dataset-path tpch/100
 ```
 
-**Current Behavior**: By default, the benchmark runs **only Embucket** benchmarks for 3 iterations. To run both Snowflake and Embucket with comparisons, you need to modify the `__main__` section in `benchmark.py` to call `run_benchmark(i + 1)` instead of `run_embucket_benchmark(i + 1)`.
+### Command Line Arguments
+
+- `--system`: Choose platform (`snowflake`, `embucket`, `both`) - default: `both`
+- `--runs`: Number of benchmark runs - default: `3`
+- `--benchmark-type`: Benchmark suite (`tpch`, `clickbench`, `tpcds`) - default: `tpch`
+- `--dataset-path`: Override the `DATASET_PATH` environment variable
+- `--no-cache`: Force cache clearing (warehouse suspend for Snowflake, container restart for Embucket)
+
+### Benchmark Process
 
 The benchmark will:
-1. Connect to the configured platform (Embucket by default, or both if modified)
-2.
Execute each query derived from TPC-H with cache-clearing operations: - - **Snowflake**: Warehouse suspend/resume before each query - - **Embucket**: Docker container restart before each query +1. Connect to the configured platform(s) +2. Execute each query from the selected benchmark suite with cache-clearing operations: + - **Snowflake**: Warehouse suspend/resume before each query (if `--no-cache`) + - **Embucket**: Docker container restart before each query (if `--no-cache`) 3. Collect performance metrics from query history 4. Display results and comparisons (if both platforms are run) 5. Save detailed results to CSV files -6. Calculate averages after 3 runs are completed +6. Calculate averages after all runs are completed ## Embucket Container Restart Functionality @@ -95,8 +140,8 @@ For Embucket benchmarks, the system automatically restarts the Docker container - Creates a fresh database connection and executes the query **Requirements:** -- `EMBUCKET_SQL_HOST` set to your EC2 instance IP -- `EMBUCKET_INSTANCE` and `EMBUCKET_DATASET` for result organization +- `EMBUCKET_HOST` set to your EC2 instance IP +- `EMBUCKET_INSTANCE` for result organization - `SSH_KEY_PATH` pointing to your private key (default: `~/.ssh/id_rsa`) - SSH access to the EC2 instance running Embucket @@ -115,16 +160,19 @@ The benchmark provides: - **Total Times**: Aggregated compilation and execution times **File Organization:** -- Snowflake results: `snowflake_tpch_results/{schema}/{warehouse}/` -- Embucket results: `embucket_tpch_results/{dataset}/{instance}/` +- Snowflake results: `snowflake_{benchmark_type}_results/{schema}/{warehouse}/` +- Embucket results: `embucket_{benchmark_type}_results/{dataset}/{instance}/` + +Where `{benchmark_type}` is one of: `tpch`, `clickbench`, or `tpcds` ## Files - `benchmark.py` - Main benchmark script with restart functionality - `docker_manager.py` - Docker container management for Embucket restarts - `utils.py` - Connection utilities for Snowflake and Embucket -- `tpch_queries.py` - Query definitions derived from TPC-H -- `tpcds_queries.py` - Query definitions derived from TPC-DS (for future use) +- `tpch/` - TPC-H benchmark utilities package (queries, DDL, table names) +- `clickbench/` - ClickBench benchmark utilities package (queries, DDL, table names) +- `tpcds/` - TPC-DS benchmark utilities package (queries, DDL, table names) - `calculate_average.py` - Result averaging and analysis - `config.py` - Configuration utilities - `data_preparation.py` - Data preparation utilities @@ -132,20 +180,27 @@ The benchmark provides: - `env_example` - Example environment configuration file - `infrastructure/` - Terraform infrastructure for EC2/Embucket deployment - `tpch-datagen/` - TPC-H data generation infrastructure -- `tpch/` - TPC-H benchmark utilities package (queries, DDL, table names) -- `tpcds_ddl/` - TPC-DS table definitions for Embucket -## Customizing Benchmark Behavior +## Benchmark Types -**Default**: The benchmark runs only Embucket tests for 3 iterations. +### TPC-H (Default) +Derived from the TPC-H decision support benchmark. Includes 22 complex analytical queries testing various aspects of data warehousing performance. -**To run both Snowflake and Embucket with comparisons**: Modify the `__main__` section in `benchmark.py`: -```python -if __name__ == "__main__": - for i in range(3): - print(f"Run {i + 1} of 3") - run_benchmark(i + 1) # Change from run_embucket_benchmark(i + 1) -``` +### ClickBench +Single-table analytical benchmark focusing on aggregation performance. 
Uses the `hits` table with web analytics data.
+
+### TPC-DS
+Derived from the TPC-DS decision support benchmark. More complex than TPC-H, with 99 queries testing advanced analytical scenarios. Not yet implemented in this tool: selecting it currently raises `NotImplementedError` in `benchmark.py`.
+
+## Environment Variables
+
+The benchmark behavior can be controlled through environment variables in your `.env` file:
+
+- `BENCHMARK_TYPE`: Default benchmark type (`tpch`, `clickbench`, `tpcds`)
+- `DATASET_PATH`: Path within the S3 bucket for the dataset location
+- `DATASET_S3_BUCKET`: S3 bucket containing benchmark datasets
+- `EMBUCKET_HOST`: EC2 instance IP for the Embucket connection
+- `SSH_KEY_PATH`: Path to the SSH private key for container restarts
 
 ## Requirements
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 6926e4f00..a1f39435f 100644
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -7,6 +7,7 @@ from utils import create_snowflake_connection
 from utils import create_embucket_connection
 from tpch import parametrize_tpch_queries
+from clickbench import parametrize_clickbench_queries
 from docker_manager import create_docker_manager
 from constants import SystemType
 
@@ -286,6 +287,8 @@ def get_queries_for_benchmark(benchmark_type: str, for_embucket: bool) -> List[T
     """Get appropriate queries based on the benchmark type."""
     if benchmark_type == "tpch":
         return parametrize_tpch_queries(fully_qualified_names_for_embucket=for_embucket)
+    elif benchmark_type == "clickbench":
+        return parametrize_clickbench_queries(fully_qualified_names_for_embucket=for_embucket)
     elif benchmark_type == "tpcds":
         raise NotImplementedError("TPC-DS benchmarks not yet implemented")
     else:
@@ -433,7 +436,7 @@ def parse_args():
     parser = argparse.ArgumentParser(description="Run benchmarks on Snowflake and/or Embucket")
     parser.add_argument("--system", choices=["snowflake", "embucket", "both"], default="both")
     parser.add_argument("--runs", type=int, default=3)
-    parser.add_argument("--benchmark-type", choices=["tpch", "tpcds"], default=os.environ.get("BENCHMARK_TYPE", "tpch"))
+    parser.add_argument("--benchmark-type", choices=["tpch", "clickbench", "tpcds"], default=os.environ.get("BENCHMARK_TYPE", "tpch"))
     parser.add_argument("--dataset-path", help="Override the DATASET_PATH environment variable")
     parser.add_argument("--no-cache", action="store_true", help="Disable caching (force warehouse suspend and USE_CACHED_RESULT=False for Snowflake, force container restart for Embucket)")
     return parser.parse_args()
diff --git a/benchmark/clickbench/__init__.py b/benchmark/clickbench/__init__.py
new file mode 100644
index 000000000..f62b41e5a
--- /dev/null
+++ b/benchmark/clickbench/__init__.py
@@ -0,0 +1,45 @@
+"""
+ClickBench benchmark utilities package.
+
+This package contains all ClickBench related functionality including:
+- Table name configuration and parametrization
+- Query definitions with parametrized table names
+- DDL statements with parametrized table names
+
+Main exports:
+- parametrize_clickbench_queries: Parametrize ClickBench queries (requires explicit parameter)
+- parametrize_clickbench_ddl: Parametrize ClickBench DDL statements (requires explicit parameter)
+- CLICKBENCH_TABLE_NAMES: Raw table name mappings
+- get_table_names: Get parametrized table names (requires explicit parameter)
+- parametrize_clickbench_statements: Generic parametrization function (requires explicit parameter)
+
+Note: All functions require an explicit fully_qualified_names_for_embucket parameter.
+No pre-computed constants are provided to enforce explicit parameter usage.
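+
+Example (illustrative usage; with the flag set to False the bare table name
+``hits`` is used, with True the EMBUCKET_DATABASE/EMBUCKET_SCHEMA prefix is added):
+
+    >>> from clickbench import parametrize_clickbench_queries
+    >>> queries = parametrize_clickbench_queries(fully_qualified_names_for_embucket=False)
+    >>> len(queries)
+    43
+    >>> queries[0][0]
+    'clickbench-q0'
+    >>> 'FROM hits' in queries[0][1]
+    True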
+""" + +from .clickbench_table_names import ( + CLICKBENCH_TABLE_NAMES, + get_table_names, + parametrize_clickbench_statements +) + +from .clickbench_queries import ( + parametrize_clickbench_queries, +) + +from .clickbench_ddl import ( + parametrize_clickbench_ddl, +) + +__all__ = [ + # Table names and core functions + 'CLICKBENCH_TABLE_NAMES', + 'get_table_names', + 'parametrize_clickbench_statements', + + # Query functions + 'parametrize_clickbench_queries', + + # DDL functions + 'parametrize_clickbench_ddl', +] \ No newline at end of file diff --git a/benchmark/clickbench/clickbench_ddl.py b/benchmark/clickbench/clickbench_ddl.py new file mode 100644 index 000000000..8125f909a --- /dev/null +++ b/benchmark/clickbench/clickbench_ddl.py @@ -0,0 +1,134 @@ +import os + +from .clickbench_table_names import parametrize_clickbench_statements + +# ClickBench DDL statement with parametrized table name +_CLICKBENCH_DDL_RAW = [ + ( + "hits", + """ + -- Snowflake-like DDL for ClickBench hits table + CREATE OR REPLACE TABLE {HITS_TABLE} ( + WatchID BIGINT, + JavaEnable SMALLINT, + Title VARCHAR, + GoodEvent SMALLINT, + EventTime BIGINT, + EventDate SMALLINT, + CounterID INTEGER, + ClientIP INTEGER, + RegionID INTEGER, + UserID BIGINT, + CounterClass SMALLINT, + OS SMALLINT, + UserAgent SMALLINT, + URL VARCHAR, + Referer VARCHAR, + IsRefresh SMALLINT, + RefererCategoryID SMALLINT, + RefererRegionID INTEGER, + URLCategoryID SMALLINT, + URLRegionID INTEGER, + ResolutionWidth SMALLINT, + ResolutionHeight SMALLINT, + ResolutionDepth SMALLINT, + FlashMajor SMALLINT, + FlashMinor SMALLINT, + FlashMinor2 VARCHAR, + NetMajor SMALLINT, + NetMinor SMALLINT, + UserAgentMajor SMALLINT, + UserAgentMinor VARCHAR, + CookieEnable SMALLINT, + JavascriptEnable SMALLINT, + IsMobile SMALLINT, + MobilePhone SMALLINT, + MobilePhoneModel VARCHAR, + Params VARCHAR, + IPNetworkID INTEGER, + TraficSourceID SMALLINT, + SearchEngineID SMALLINT, + SearchPhrase VARCHAR, + AdvEngineID SMALLINT, + IsArtifical SMALLINT, + WindowClientWidth SMALLINT, + WindowClientHeight SMALLINT, + ClientTimeZone SMALLINT, + ClientEventTime BIGINT, + SilverlightVersion1 SMALLINT, + SilverlightVersion2 SMALLINT, + SilverlightVersion3 INTEGER, + SilverlightVersion4 SMALLINT, + PageCharset VARCHAR, + CodeVersion INTEGER, + IsLink SMALLINT, + IsDownload SMALLINT, + IsNotBounce SMALLINT, + FUniqID BIGINT, + OriginalURL VARCHAR, + HID INTEGER, + IsOldCounter SMALLINT, + IsEvent SMALLINT, + IsParameter SMALLINT, + DontCountHits SMALLINT, + WithHash SMALLINT, + HitColor VARCHAR, + LocalEventTime BIGINT, + Age SMALLINT, + Sex SMALLINT, + Income SMALLINT, + Interests SMALLINT, + Robotness SMALLINT, + RemoteIP INTEGER, + WindowName INTEGER, + OpenerName INTEGER, + HistoryLength SMALLINT, + BrowserLanguage VARCHAR, + BrowserCountry VARCHAR, + SocialNetwork VARCHAR, + SocialAction VARCHAR, + HTTPError SMALLINT, + SendTiming INTEGER, + DNSTiming INTEGER, + ConnectTiming INTEGER, + ResponseStartTiming INTEGER, + ResponseEndTiming INTEGER, + FetchTiming INTEGER, + SocialSourceNetworkID SMALLINT, + SocialSourcePage VARCHAR, + ParamPrice BIGINT, + ParamOrderID VARCHAR, + ParamCurrency VARCHAR, + ParamCurrencyID SMALLINT, + OpenstatServiceName VARCHAR, + OpenstatCampaignID VARCHAR, + OpenstatAdID VARCHAR, + OpenstatSourceID VARCHAR, + UTMSource VARCHAR, + UTMMedium VARCHAR, + UTMCampaign VARCHAR, + UTMContent VARCHAR, + UTMTerm VARCHAR, + FromTag VARCHAR, + HasGCLID SMALLINT, + RefererHash BIGINT, + URLHash BIGINT, + CLID INTEGER + ); + """ + ), +] + + +def 
parametrize_clickbench_ddl(fully_qualified_names_for_embucket): + """ + Replace table name placeholders in ClickBench DDL statements with actual table names. + + Args: + fully_qualified_names_for_embucket (bool): Required. If True, use EMBUCKET_DATABASE.EMBUCKET_SCHEMA.tablename format. + If False, use just the default table names. + + Returns: + list: A list of (table_name, parametrized_ddl) tuples. + """ + return parametrize_clickbench_statements(_CLICKBENCH_DDL_RAW, fully_qualified_names_for_embucket) diff --git a/benchmark/clickbench/clickbench_queries.py b/benchmark/clickbench/clickbench_queries.py new file mode 100644 index 000000000..0d6460022 --- /dev/null +++ b/benchmark/clickbench/clickbench_queries.py @@ -0,0 +1,328 @@ +import os +import re + +from .clickbench_table_names import parametrize_clickbench_statements + +# Original ClickBench queries with parametrized table names +_CLICKBENCH_QUERIES_RAW = [ + ( + "clickbench-q0", + """ + SELECT COUNT(*) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q1", + """ + SELECT COUNT(*) FROM {HITS_TABLE} WHERE AdvEngineID <> 0; + """ + ), + ( + "clickbench-q2", + """ + SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q3", + """ + SELECT AVG(UserID) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q4", + """ + SELECT COUNT(DISTINCT UserID) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q5", + """ + SELECT COUNT(DISTINCT SearchPhrase) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q6", + """ + SELECT MIN(EventDate), MAX(EventDate) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q7", + """ + SELECT AdvEngineID, COUNT(*) FROM {HITS_TABLE} WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; + """ + ), + ( + "clickbench-q8", + """ + SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM {HITS_TABLE} GROUP BY RegionID ORDER BY u DESC LIMIT 10; + """ + ), + ( + "clickbench-q9", + """ + SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM {HITS_TABLE} GROUP BY RegionID ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q10", + """ + SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM {HITS_TABLE} WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; + """ + ), + ( + "clickbench-q11", + """ + SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM {HITS_TABLE} WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; + """ + ), + ( + "clickbench-q12", + """ + SELECT SearchPhrase, COUNT(*) AS c FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q13", + """ + SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; + """ + ), + ( + "clickbench-q14", + """ + SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q15", + """ + SELECT UserID, COUNT(*) FROM {HITS_TABLE} GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; + """ + ), + ( + "clickbench-q16", + """ + SELECT UserID, SearchPhrase, COUNT(*) FROM {HITS_TABLE} GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; + """ + ), + ( + "clickbench-q17", + """ + SELECT UserID, SearchPhrase, COUNT(*) FROM {HITS_TABLE} GROUP BY UserID, SearchPhrase LIMIT 10; + """ + ), + ( + "clickbench-q18", + """ + SELECT + UserID, + 
minute(to_timestamp(from_unixtime(EventTime))) AS m, + SearchPhrase, + COUNT(*) AS cnt + FROM {HITS_TABLE} + GROUP BY UserID, m, SearchPhrase + ORDER BY cnt DESC + LIMIT 10; + """ + ), + ( + "clickbench-q19", + """ + SELECT UserID FROM {HITS_TABLE} WHERE UserID = 435090932899640449; + """ + ), + ( + "clickbench-q20", + """ + SELECT COUNT(*) FROM {HITS_TABLE} WHERE URL LIKE '%google%'; + """ + ), + ( + "clickbench-q21", + """ + SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM {HITS_TABLE} WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q22", + """ + SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM {HITS_TABLE} WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q23", + """ + SELECT * FROM {HITS_TABLE} WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; + """ + ), + ( + "clickbench-q24", + """ + SELECT SearchPhrase FROM {HITS_TABLE} WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; + """ + ), + ( + "clickbench-q25", + """ + SELECT SearchPhrase FROM {HITS_TABLE} WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; + """ + ), + ( + "clickbench-q26", + """ + SELECT SearchPhrase FROM {HITS_TABLE} WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; + """ + ), + ( + "clickbench-q27", + """ + SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM {HITS_TABLE} WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; + """ + ), + ( + "clickbench-q28", + """ + SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM {HITS_TABLE} WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; + """ + ), + ( + "clickbench-q29", + """ + SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), 
SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM {HITS_TABLE}; + """ + ), + ( + "clickbench-q30", + """ + SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q31", + """ + SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM {HITS_TABLE} WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q32", + """ + SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM {HITS_TABLE} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q33", + """ + SELECT URL, COUNT(*) AS c FROM {HITS_TABLE} GROUP BY URL ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q34", + """ + SELECT 1, URL, COUNT(*) AS c FROM {HITS_TABLE} GROUP BY 1, URL ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q35", + """ + SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM {HITS_TABLE} GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; + """ + ), + ( + "clickbench-q36", + """ + SELECT URL, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; + """ + ), + ( + "clickbench-q37", + """ + SELECT Title, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; + """ + ), + ( + "clickbench-q38", + """ + SELECT URL, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; + """ + ), + ( + "clickbench-q39", + """ + SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; + """ + ), + ( + "clickbench-q40", + """ + SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY 
PageViews DESC LIMIT 10 OFFSET 100; + """ + ), + ( + "clickbench-q41", + """ + SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM {HITS_TABLE} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; + """ + ), + ( + "clickbench-q42", + """ + WITH ts AS ( + SELECT + -- minute bucket + DATE_TRUNC('minute', + CAST( + FROM_UNIXTIME( + CAST( + CASE + WHEN EventTime > 20000000000 THEN FLOOR(EventTime / 1000) -- ms → s + ELSE EventTime -- seconds + END AS BIGINT + ) + ) AS TIMESTAMP + ) + ) AS m, + -- day for filtering + CAST( + DATE_TRUNC('day', + CAST( + FROM_UNIXTIME( + CAST( + CASE + WHEN EventTime > 20000000000 THEN FLOOR(EventTime / 1000) + ELSE EventTime + END AS BIGINT + ) + ) AS TIMESTAMP + ) + ) AS DATE + ) AS d + FROM {HITS_TABLE} + WHERE CounterID = 62 + AND IsRefresh = 0 + AND DontCountHits = 0 + ) + SELECT m, COUNT(*) AS PageViews + FROM ts + WHERE d BETWEEN DATE '2013-07-14' AND DATE '2013-07-15' + GROUP BY m + ORDER BY m + LIMIT 10 OFFSET 1000; + """ + ), +] + + +def parametrize_clickbench_queries(fully_qualified_names_for_embucket): + """ + Replace table name placeholders in ClickBench queries with actual table names. + + Args: + fully_qualified_names_for_embucket (bool): Required. If True, use EMBUCKET_DATABASE.EMBUCKET_SCHEMA.tablename format. + If False, use just the default table names. + + Returns: + list: A list of (query_name, parametrized_query) tuples. + """ + return parametrize_clickbench_statements(_CLICKBENCH_QUERIES_RAW, fully_qualified_names_for_embucket) diff --git a/benchmark/clickbench/clickbench_table_names.py b/benchmark/clickbench/clickbench_table_names.py new file mode 100644 index 000000000..3c1a2b9b2 --- /dev/null +++ b/benchmark/clickbench/clickbench_table_names.py @@ -0,0 +1,62 @@ +""" +ClickBench table names configuration. + +This module defines the single ClickBench table name and its +corresponding placeholder name used for parametrization. +""" + +# The single ClickBench table name with its parametrization placeholder +CLICKBENCH_TABLE_NAMES = { + 'HITS_TABLE': 'hits' +} + +def get_table_names(fully_qualified_names_for_embucket): + """ + Get table names dictionary with optional fully qualified naming. + + Args: + fully_qualified_names_for_embucket (bool): Required. If True, use EMBUCKET_DATABASE.EMBUCKET_SCHEMA.tablename format. + If False, use just the default table names. + + Returns: + dict: Dictionary mapping placeholder names to actual table names. + """ + import os + + table_names = CLICKBENCH_TABLE_NAMES.copy() + + if fully_qualified_names_for_embucket: + # Get database and schema from environment variables + database = os.environ['EMBUCKET_DATABASE'] + schema = os.environ['EMBUCKET_SCHEMA'] + + # Create fully qualified table names + for key, table_name in table_names.items(): + table_names[key] = f"{database}.{schema}.{table_name}" + + return table_names + + +def parametrize_clickbench_statements(statements_raw, fully_qualified_names_for_embucket): + """ + Generic function to parametrize ClickBench statements (queries or DDL) with table names. + + Args: + statements_raw (list): List of (name, statement_sql) tuples with placeholder table names. + fully_qualified_names_for_embucket (bool): Required. If True, use EMBUCKET_DATABASE.EMBUCKET_SCHEMA.tablename format. + If False, use just the default table names. 
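+
+    Example (illustrative; with ``False`` the bare table name is used):
+        >>> parametrize_clickbench_statements(
+        ...     [("q", "SELECT 1 FROM {HITS_TABLE}")], False)
+        [('q', 'SELECT 1 FROM hits')]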
+ + Returns: + list: A list of (name, parametrized_statement) tuples. + """ + # Get table names with appropriate formatting + table_names = get_table_names(fully_qualified_names_for_embucket) + + parametrized_statements = [] + + for name, statement_sql in statements_raw: + # Replace table name placeholders + parametrized_sql = statement_sql.format(**table_names) + parametrized_statements.append((name, parametrized_sql)) + + return parametrized_statements diff --git a/benchmark/data_preparation.py b/benchmark/data_preparation.py index b2cd9203c..7691846c4 100644 --- a/benchmark/data_preparation.py +++ b/benchmark/data_preparation.py @@ -1,31 +1,51 @@ import os import argparse from utils import create_embucket_connection, create_snowflake_connection -from tpch import parametrize_tpch_ddl, get_table_names +from tpch import parametrize_tpch_ddl, get_table_names as get_tpch_table_names +from clickbench import parametrize_clickbench_ddl, get_table_names as get_clickbench_table_names from dotenv import load_dotenv from constants import SystemType load_dotenv() -def create_tables(cursor, system): - """Create tables using the consolidated TPC-H DDL statements.""" - print(f"Creating tables for {system}...") - # Get DDL statements with fully qualified/unqualified names for Embucket/Snowflake - if system == SystemType.EMBUCKET: - tpch_ddl = parametrize_tpch_ddl(fully_qualified_names_for_embucket=True) - elif system == SystemType.SNOWFLAKE: - tpch_ddl = parametrize_tpch_ddl(fully_qualified_names_for_embucket=False) +def create_tables(cursor, system, benchmark_type): + """Create tables using the appropriate DDL statements based on benchmark type.""" + print(f"Creating tables for {system} ({benchmark_type})...") + + # Get DDL statements based on benchmark type and system + if benchmark_type == "tpch": + if system == SystemType.EMBUCKET: + ddl_statements = parametrize_tpch_ddl(fully_qualified_names_for_embucket=True) + elif system == SystemType.SNOWFLAKE: + ddl_statements = parametrize_tpch_ddl(fully_qualified_names_for_embucket=False) + else: + raise ValueError("Unsupported system") + elif benchmark_type == "clickbench": + if system == SystemType.EMBUCKET: + ddl_statements = parametrize_clickbench_ddl(fully_qualified_names_for_embucket=True) + elif system == SystemType.SNOWFLAKE: + ddl_statements = parametrize_clickbench_ddl(fully_qualified_names_for_embucket=False) + else: + raise ValueError("Unsupported system") else: - raise ValueError("Unsupported system") - for table_name, ddl_sql in tpch_ddl: + raise ValueError(f"Unsupported benchmark type: {benchmark_type}") + + for table_name, ddl_sql in ddl_statements: print(f"Creating table: {table_name}") cursor.execute(ddl_sql.strip()) -def upload_parquet_to_snowflake_tables(cursor, dataset_path): +def upload_parquet_to_snowflake_tables(cursor, dataset_path, benchmark_type): """Upload parquet files to Snowflake tables from S3 stage.""" - table_names = get_table_names(fully_qualified_names_for_embucket=False) + # Get table names based on benchmark type + if benchmark_type == "tpch": + table_names = get_tpch_table_names(fully_qualified_names_for_embucket=False) + elif benchmark_type == "clickbench": + table_names = get_clickbench_table_names(fully_qualified_names_for_embucket=False) + else: + raise ValueError(f"Unsupported benchmark type: {benchmark_type}") + for table_name in table_names.values(): print(f"Loading data into Snowflake table {table_name}...") s3_path = f"s3://embucket-testdata/{dataset_path}/{table_name}.parquet" @@ -43,10 +63,15 @@ def 
upload_parquet_to_snowflake_tables(cursor, dataset_path): -def upload_parquet_to_embucket_tables(cursor, dataset_path): +def upload_parquet_to_embucket_tables(cursor, dataset_path, benchmark_type): """Upload parquet files to Embucket tables using COPY INTO.""" - # Get fully qualified table names using the unified logic - table_names = get_table_names(fully_qualified_names_for_embucket=True) + # Get fully qualified table names based on benchmark type + if benchmark_type == "tpch": + table_names = get_tpch_table_names(fully_qualified_names_for_embucket=True) + elif benchmark_type == "clickbench": + table_names = get_clickbench_table_names(fully_qualified_names_for_embucket=True) + else: + raise ValueError(f"Unsupported benchmark type: {benchmark_type}") for placeholder, qualified_table_name in table_names.items(): # Extract bare table name for the S3 path (parquet files use bare names) @@ -57,49 +82,55 @@ def upload_parquet_to_embucket_tables(cursor, dataset_path): cursor.execute(copy_sql) -def prepare_data_for_embucket(dataset_path): +def prepare_data_for_embucket(dataset_path, benchmark_type): """Prepare data for Embucket: generate data, create tables, and load data.""" # Connect to Embucket cursor = create_embucket_connection().cursor() # Create tables - create_tables(cursor, SystemType.EMBUCKET) + create_tables(cursor, SystemType.EMBUCKET, benchmark_type) # Load data into Embucket tables - upload_parquet_to_embucket_tables(cursor, dataset_path) + upload_parquet_to_embucket_tables(cursor, dataset_path, benchmark_type) cursor.close() - print("Embucket data preparation completed successfully.") + print(f"Embucket data preparation completed successfully for {benchmark_type}.") -def prepare_data_for_snowflake(dataset_path): +def prepare_data_for_snowflake(dataset_path, benchmark_type): """Prepare data, create tables, and load data for Snowflake""" # Connect to Snowflake cursor = create_snowflake_connection().cursor() # Create tables - create_tables(cursor, SystemType.SNOWFLAKE) + create_tables(cursor, SystemType.SNOWFLAKE, benchmark_type) # Load data into Snowflake tables - upload_parquet_to_snowflake_tables(cursor, dataset_path) + upload_parquet_to_snowflake_tables(cursor, dataset_path, benchmark_type) cursor.close() - print("Snowflake data preparation completed successfully.") + print(f"Snowflake data preparation completed successfully for {benchmark_type}.") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Prepare data for Embucket/Snowflake benchmarks") parser.add_argument("--system", type=str, choices=["embucket", "snowflake", "both"], default="both", help="Which system to prepare data for") + parser.add_argument("--benchmark-type", type=str, choices=["tpch", "clickbench"], + default=os.environ.get("BENCHMARK_TYPE", "tpch"), + help="Benchmark type (default: from env or 'tpch')") parser.add_argument("--dataset-path", type=str, default=os.environ.get("DATASET_PATH", "tpch/1"), help="Dataset path in format 'dataset/scale' (default: from env or 'tpch/1')") args = parser.parse_args() - # Override environment variable if specified in args + # Override environment variables if specified in args if args.dataset_path: os.environ["DATASET_PATH"] = args.dataset_path + if args.benchmark_type != os.environ.get("BENCHMARK_TYPE", "tpch"): + os.environ["BENCHMARK_TYPE"] = args.benchmark_type - print(f"Preparing data for dataset path: {args.dataset_path}") + print(f"Preparing data for benchmark type: {args.benchmark_type}") + print(f"Dataset path: {args.dataset_path}") if 
args.system.lower() in ["embucket", "both"]: - prepare_data_for_embucket(args.dataset_path) + prepare_data_for_embucket(args.dataset_path, args.benchmark_type) if args.system.lower() in ["snowflake", "both"]: - prepare_data_for_snowflake(args.dataset_path) + prepare_data_for_snowflake(args.dataset_path, args.benchmark_type) diff --git a/benchmark/infrastructure/README.md b/benchmark/infrastructure/README.md index b65d1bd2e..1db00519f 100644 --- a/benchmark/infrastructure/README.md +++ b/benchmark/infrastructure/README.md @@ -29,6 +29,8 @@ All resources share the same 8-character random suffix (e.g., `a1b2c3d4`), makin **Required variables to set:** - `benchmark_s3_user_key_id` - Your AWS Access Key ID - `benchmark_s3_user_access_key` - Your AWS Secret Access Key + - `github_username` - Your GitHub username + - `github_token` - GitHub Personal Access Token with 'read:packages' permission - `private_key_path` / `public_key_path` - SSH key paths 3. **Deploy Infrastructure** @@ -126,6 +128,28 @@ Your AWS user needs a policy that allows access to S3 buckets with the `embucket This policy allows multiple team members to use the same credentials while each deployment creates its own unique bucket. +### GitHub Container Registry Authentication + +The infrastructure automatically authenticates with GitHub Container Registry (ghcr.io) to pull the Embucket Docker image. This requires: + +1. **GitHub Personal Access Token**: Create a token with `read:packages` permission + - Go to GitHub Settings → Developer settings → Personal access tokens → Tokens (classic) + - Generate new token with `read:packages` scope + - Copy the token value + +2. **Configuration**: Add your GitHub credentials to `terraform.tfvars`: + ```hcl + github_username = "your-github-username" + github_token = "ghp_your_personal_access_token" + ``` + +3. **Automatic Authentication**: During deployment, the bootstrap script will: + - Extract GitHub credentials from the `.env` file + - Authenticate with `ghcr.io` using `docker login` + - Pull the `ghcr.io/embucket/embucket:experimental` image + +**Note**: If GitHub credentials are not provided, the deployment will skip ghcr.io authentication and may fail when trying to pull the Embucket image. + ### Team Collaboration - **Shared Credentials**: All team members can use the same AWS user credentials @@ -176,9 +200,34 @@ Run validation manually: Once deployed, you can: 1. SSH to the instance: `$(terraform output -raw ssh_command)` 2. Run custom benchmarks against the Embucket API -3. Monitor performance using AWS CloudWatch +3. Monitor performance using AWS CloudWatch or the included monitoring script 4. Scale instance type up/down as needed +### System Monitoring + +The infrastructure includes a monitoring script to track system and Embucket container performance: + +```bash +# Real-time monitoring (default) +./monitor_ram.sh + +# Monitor every 5 seconds +./monitor_ram.sh -i 5 + +# Monitor for 60 seconds then exit +./monitor_ram.sh -t 60 + +# Single snapshot +./monitor_ram.sh -o +``` + +The monitoring script displays: +- **System RAM**: Memory usage percentage and absolute values +- **System CPU**: CPU usage percentage across all cores +- **Embucket Container**: Container-specific RAM and CPU usage + +This is particularly useful during benchmarking to ensure the system has adequate resources and to identify performance bottlenecks. 
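+
+If you prefer sampling from Python (for example, to log resource usage alongside benchmark results), a rough single-snapshot equivalent of the script can be built on `docker stats`. This is an illustrative sketch, assuming the docker CLI is on PATH and the container name matches `docker-compose.yml`:
+
+```python
+import subprocess
+
+def embucket_snapshot(container="embucket-benchmark"):
+    """Return one memory/CPU sample for the Embucket container."""
+    out = subprocess.run(
+        ["docker", "stats", "--no-stream", "--format",
+         "{{.MemUsage}} {{.CPUPerc}}", container],
+        capture_output=True, text=True, check=True,
+    )
+    return out.stdout.strip()  # e.g. "1.2GiB / 15.6GiB 37.5%"
+```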
+ ## Cleanup To destroy all resources: @@ -195,6 +244,7 @@ terraform destroy - `user_data.sh` - EC2 initialization script - `bootstrap.sh` - Embucket installation and startup script - `validate.sh` - Deployment validation script +- `monitor_ram.sh` - System and Embucket container monitoring script - `docker-compose.yml` - Embucket container configuration - `env.tpl` - Environment template for Embucket configuration - `docker/db_init.py` - Database initialization script @@ -209,7 +259,9 @@ terraform destroy 3. **Database initialization failed**: Verify S3 credentials and permissions 4. **S3 access denied**: Ensure your AWS user has the required S3 policy (see above) 5. **Empty credentials**: Make sure you've set `benchmark_s3_user_key_id` and `benchmark_s3_user_access_key` in `terraform.tfvars` -6. **Performance issues**: Consider upgrading instance type +6. **GitHub authentication failed**: Verify `github_username` and `github_token` are set correctly +7. **Docker image pull failed**: Check GitHub token has `read:packages` permission +8. **Performance issues**: Consider upgrading instance type ### Apple Silicon Troubleshooting @@ -237,26 +289,36 @@ docker-compose ps docker-compose logs embucket docker-compose logs db-init +# Check GitHub authentication +docker images | grep ghcr.io +docker login ghcr.io # Test manual login + # Check system resources htop df -h # If credentials weren't provided during deployment, you can set them up manually: -# Edit the .env file with your AWS credentials and restart Embucket +# Edit the .env file with your AWS and GitHub credentials and restart Embucket nano .env docker-compose up -d ``` ### Manual Credential Setup -If you didn't provide AWS credentials in `terraform.tfvars`, you can set them up after deployment: +If you didn't provide AWS or GitHub credentials in `terraform.tfvars`, you can set them up after deployment: 1. SSH to the instance: `$(terraform output -raw ssh_command)` -2. Edit the `.env` file to add your AWS credentials: +2. Edit the `.env` file to add your credentials: ```bash nano .env # Add or update these lines: # AWS_ACCESS_KEY_ID=your-access-key-id # AWS_SECRET_ACCESS_KEY=your-secret-access-key + # GITHUB_USERNAME=your-github-username + # GITHUB_TOKEN=your-github-token + ``` +3. Authenticate with GitHub Container Registry: + ```bash + echo "$GITHUB_TOKEN" | docker login ghcr.io -u "$GITHUB_USERNAME" --password-stdin ``` -3. Start Embucket: `docker-compose up -d` +4. Start Embucket: `docker-compose up -d` diff --git a/benchmark/infrastructure/bootstrap.sh b/benchmark/infrastructure/bootstrap.sh index ecd09367b..278536384 100644 --- a/benchmark/infrastructure/bootstrap.sh +++ b/benchmark/infrastructure/bootstrap.sh @@ -62,6 +62,23 @@ fi # Verify credentials are in .env file and not empty if grep -q "AWS_ACCESS_KEY_ID=" .env && [ "$(grep AWS_ACCESS_KEY_ID= .env | cut -d= -f2)" != "" ]; then echo "✅ AWS credentials found in .env file" + + # Authenticate with GitHub Container Registry if credentials are provided + if grep -q "GITHUB_TOKEN=" .env && [ "$(grep GITHUB_TOKEN= .env | cut -d= -f2)" != "" ]; then + echo "🔐 Authenticating with GitHub Container Registry..." + GITHUB_TOKEN=$(grep GITHUB_TOKEN= .env | cut -d= -f2) + GITHUB_USERNAME=$(grep GITHUB_USERNAME= .env | cut -d= -f2) + echo "$GITHUB_TOKEN" | sudo -u ec2-user docker login ghcr.io -u "$GITHUB_USERNAME" --password-stdin + if [ $? 
-eq 0 ]; then + echo "✅ Successfully authenticated with ghcr.io" + else + echo "❌ Failed to authenticate with ghcr.io" + exit 1 + fi + else + echo "⚠️ No GitHub credentials found - skipping ghcr.io authentication" + fi + echo "=========================================" echo "Starting Embucket with automatic database initialization..." echo "Running: docker-compose up -d" @@ -102,9 +119,36 @@ for i in {1..30}; do break fi echo "Attempt $i/30: Waiting for Embucket API..." + + # Show container logs if API is not ready after several attempts + if [ $i -eq 5 ] || [ $i -eq 15 ] || [ $i -eq 25 ]; then + echo "=========================================" + echo "🔍 Container status and logs (attempt $i):" + sudo -u ec2-user docker-compose ps + echo "" + echo "📋 Embucket container logs (last 20 lines):" + sudo -u ec2-user docker-compose logs --tail=20 embucket + echo "=========================================" + fi + sleep 10 done +# If health check failed, show detailed logs +if ! curl -s http://localhost:3000/health > /dev/null 2>&1; then + echo "❌ Embucket API health check failed after 30 attempts" + echo "=========================================" + echo "🔍 Final container status:" + sudo -u ec2-user docker-compose ps + echo "" + echo "📋 Full Embucket container logs:" + sudo -u ec2-user docker-compose logs embucket + echo "" + echo "📋 Database init container logs:" + sudo -u ec2-user docker-compose logs db-init + echo "=========================================" +fi + # Check if database initialization was successful echo "Checking database initialization..." sleep 30 diff --git a/benchmark/infrastructure/docker-compose.yml b/benchmark/infrastructure/docker-compose.yml index 35c4dcc9f..253ed4616 100644 --- a/benchmark/infrastructure/docker-compose.yml +++ b/benchmark/infrastructure/docker-compose.yml @@ -1,6 +1,6 @@ services: embucket: - image: embucket/embucket:0.2.0 + image: ghcr.io/embucket/embucket:experimental container_name: embucket-benchmark environment: # Iceberg Catalog settings diff --git a/benchmark/infrastructure/env.tpl b/benchmark/infrastructure/env.tpl index c31af4db3..1b6be4600 100644 --- a/benchmark/infrastructure/env.tpl +++ b/benchmark/infrastructure/env.tpl @@ -14,3 +14,7 @@ SCHEMA_NAME=benchmark_schema VOLUME_NAME=benchmark_volume EMBUCKET_HOST=localhost EMBUCKET_PORT=3000 + +# GitHub Container Registry credentials +GITHUB_TOKEN=${github_token} +GITHUB_USERNAME=${github_username} diff --git a/benchmark/infrastructure/main.tf b/benchmark/infrastructure/main.tf index 5897b3b81..f1fe99203 100644 --- a/benchmark/infrastructure/main.tf +++ b/benchmark/infrastructure/main.tf @@ -166,6 +166,8 @@ resource "local_file" "env_file" { vite_api_url = "http://${aws_instance.embucket_benchmark.public_ip}:3000" instance_public_ip = aws_instance.embucket_benchmark.public_ip private_key_path = var.private_key_path + github_token = var.github_token + github_username = var.github_username }) filename = "${path.module}/.env" } diff --git a/benchmark/infrastructure/terraform.tfvars.example b/benchmark/infrastructure/terraform.tfvars.example index e46e194d8..9a4dd3de7 100644 --- a/benchmark/infrastructure/terraform.tfvars.example +++ b/benchmark/infrastructure/terraform.tfvars.example @@ -28,3 +28,8 @@ environment = "benchmark" # that have S3 access to the benchmark bucket benchmark_s3_user_key_id = "" # Your existing AWS user's access key ID benchmark_s3_user_access_key = "" # Your existing AWS user's secret access key + +# GitHub Container Registry Authentication +# Required for pulling from 
ghcr.io/embucket/embucket:experimental +github_username = "" # Your GitHub username +github_token = "" # GitHub Personal Access Token with 'read:packages' permission diff --git a/benchmark/infrastructure/variables.tf b/benchmark/infrastructure/variables.tf index 7a0d14900..aecea80f0 100644 --- a/benchmark/infrastructure/variables.tf +++ b/benchmark/infrastructure/variables.tf @@ -60,3 +60,16 @@ variable "aws_profile" { type = string default = null } + +variable "github_token" { + description = "GitHub Personal Access Token for pulling from ghcr.io" + type = string + default = "" + sensitive = true +} + +variable "github_username" { + description = "GitHub username for ghcr.io authentication" + type = string + default = "" +}