Commit 79c88d8

refactor: update benchmark logic to use dataset path and warehouse size (#1771)

* refactor: update benchmark logic to use dataset path and warehouse size, update error catching for loading data to Snowflake, update env example
* refactor: remove odd logic in run_on_emb and fix path generation for Embucket

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent f2e5f81 commit 79c88d8

File tree: 4 files changed (+55, -57 lines)


benchmark/benchmark.py

Lines changed: 24 additions & 28 deletions
@@ -24,13 +24,14 @@
 logger = logging.getLogger(__name__)


-def get_results_path(system: SystemType, benchmark_type: str, scale_factor: str,
-                     warehouse_or_instance: str, run_number: Optional[int] = None) -> str:
+def get_results_path(system: SystemType, benchmark_type: str, dataset_path: str,
+                     instance: str, warehouse_size: str = None, run_number: Optional[int] = None) -> str:
     """Generate path for storing benchmark results."""
     if system == SystemType.SNOWFLAKE:
-        base_path = f"result/snowflake_{benchmark_type}_results/{scale_factor}/{warehouse_or_instance}"
+        # Use warehouse size in the path instead of warehouse name
+        base_path = f"result/snowflake_{benchmark_type}_results/{dataset_path}/{warehouse_size}"
     elif system == SystemType.EMBUCKET:
-        base_path = f"result/embucket_{benchmark_type}_results/{scale_factor}/{warehouse_or_instance}"
+        base_path = f"result/embucket_{benchmark_type}_results/{dataset_path}/{instance}"
     else:
         raise ValueError(f"Unsupported system: {system}")

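For orientation, a minimal sketch of what the two visible branches now produce, using the defaults from env_example in this same commit (DATASET_PATH=tpch/01, SNOWFLAKE_WAREHOUSE_SIZE=XSMALL, EMBUCKET_INSTANCE=c7i_2xlarge); how run_number is appended falls below the hunk and is not reproduced here:

    # Illustrative values only, taken from env_example in this commit.
    benchmark_type = "tpch"
    dataset_path = "tpch/01"      # DATASET_PATH
    warehouse_size = "XSMALL"     # SNOWFLAKE_WAREHOUSE_SIZE
    instance = "c7i_2xlarge"      # EMBUCKET_INSTANCE

    # Snowflake results are now keyed by dataset path and warehouse size:
    print(f"result/snowflake_{benchmark_type}_results/{dataset_path}/{warehouse_size}")
    # -> result/snowflake_tpch_results/tpch/01/XSMALL

    # Embucket results are keyed by dataset path and instance type:
    print(f"result/embucket_{benchmark_type}_results/{dataset_path}/{instance}")
    # -> result/embucket_tpch_results/tpch/01/c7i_2xlarge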

@@ -149,7 +150,7 @@ def run_on_sf(cursor, warehouse, tpch_queries):
     return results


-def run_on_emb(cursor, tpch_queries):
+def run_on_emb(tpch_queries):
     """Run TPCH queries on Embucket with container restart before each query."""
     docker_manager = create_docker_manager()
     executed_query_ids = []
@@ -271,11 +272,11 @@ def run_snowflake_benchmark(run_number: int):
     # Get benchmark configuration from environment variables
     benchmark_type = os.environ.get("BENCHMARK_TYPE", "tpch")
     warehouse = os.environ["SNOWFLAKE_WAREHOUSE"]
-    dataset = os.environ["DATASET_NAME"]
-    scale_factor = os.environ["DATASET_SCALE_FACTOR"]
+    warehouse_size = os.environ["SNOWFLAKE_WAREHOUSE_SIZE"]
+    dataset_path = os.environ["DATASET_PATH"]

     logger.info(f"Starting Snowflake {benchmark_type} benchmark run {run_number}")
-    logger.info(f"Dataset: {dataset}, Schema: {scale_factor}, Warehouse: {warehouse}")
+    logger.info(f"Dataset: {dataset_path}, Warehouse: {warehouse}, Size: {warehouse_size}")

     # Get queries and run benchmark
     queries = get_queries_for_benchmark(benchmark_type, for_embucket=False)
@@ -286,9 +287,9 @@ def run_snowflake_benchmark(run_number: int):
     # Disable query result caching for benchmark
     sf_cursor.execute("ALTER SESSION SET USE_CACHED_RESULT = FALSE;")

-    sf_results = run_on_sf(sf_cursor,warehouse, queries)
+    sf_results = run_on_sf(sf_cursor, warehouse, queries)

-    results_path = get_results_path(SystemType.SNOWFLAKE, benchmark_type, scale_factor, warehouse, run_number)
+    results_path = get_results_path(SystemType.SNOWFLAKE, benchmark_type, dataset_path, warehouse, warehouse_size, run_number)
     os.makedirs(os.path.dirname(results_path), exist_ok=True)
     save_results_to_csv(sf_results, filename=results_path, system=SystemType.SNOWFLAKE)

@@ -298,50 +299,49 @@ def run_snowflake_benchmark(run_number: int):
     sf_connection.close()

     # Check if we have 3 CSV files ready and calculate averages if so
-    results_dir = get_results_path(SystemType.SNOWFLAKE, benchmark_type, scale_factor, warehouse)
+    results_dir = get_results_path(SystemType.SNOWFLAKE, benchmark_type, dataset_path, warehouse, warehouse_size)
     csv_files = glob.glob(os.path.join(results_dir, "snowflake_results_run_*.csv"))
     if len(csv_files) == 3:
         logger.info("Found 3 CSV files. Calculating averages...")
         calculate_benchmark_averages(
-            scale_factor,
-            warehouse,
+            dataset_path,
+            warehouse_size,  # Pass warehouse size instead of name
             SystemType.SNOWFLAKE,
             benchmark_type
         )

     return sf_results


+
 def run_embucket_benchmark(run_number: int):
     """Run benchmark on Embucket with container restarts."""
     # Get benchmark configuration from environment variables
     benchmark_type = os.environ.get("BENCHMARK_TYPE", "tpch")
     instance = os.environ["EMBUCKET_INSTANCE"]
-    dataset = os.environ.get("EMBUCKET_DATASET", os.environ["DATASET_NAME"])
-    scale_factor = os.environ["DATASET_SCALE_FACTOR"]
+    dataset_path = os.environ.get("EMBUCKET_DATASET_PATH", os.environ["DATASET_PATH"])

     logger.info(f"Starting Embucket {benchmark_type} benchmark run {run_number}")
-    logger.info(f"Instance: {instance}, Dataset: {dataset}, Scale Factor: {scale_factor}")
+    logger.info(f"Instance: {instance}, Dataset: {dataset_path}")

     # Get queries and docker manager
     queries = get_queries_for_benchmark(benchmark_type, for_embucket=True)
-    docker_manager = create_docker_manager()

     # Run benchmark
-    emb_results = run_on_emb(docker_manager, queries)
+    emb_results = run_on_emb(queries)

-    results_path = get_results_path(SystemType.EMBUCKET, benchmark_type, scale_factor, instance, run_number)
+    results_path = get_results_path(SystemType.EMBUCKET, benchmark_type, dataset_path, instance, run_number=run_number)
     os.makedirs(os.path.dirname(results_path), exist_ok=True)
     save_results_to_csv(emb_results, filename=results_path, system=SystemType.EMBUCKET)
     logger.info(f"Embucket benchmark results saved to: {results_path}")

     # Check if we have 3 CSV files ready and calculate averages
-    results_dir = get_results_path(SystemType.EMBUCKET, benchmark_type, scale_factor, instance)
+    results_dir = get_results_path(SystemType.EMBUCKET, benchmark_type, dataset_path, instance)
     csv_files = glob.glob(os.path.join(results_dir, "embucket_results_run_*.csv"))
     if len(csv_files) == 3:
         logger.info("Found 3 CSV files. Calculating averages...")
         calculate_benchmark_averages(
-            scale_factor,
+            dataset_path,
             instance,
             SystemType.EMBUCKET,
             benchmark_type
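One call-site subtlety introduced by the optional warehouse_size parameter: the Snowflake path fills it positionally before run_number, while the Embucket path must pass run_number by keyword to skip it. A self-contained sketch of the hazard, using a stand-in with the same parameter order as the new signature:

    def get_results_path(system, benchmark_type, dataset_path,
                         instance, warehouse_size=None, run_number=None):
        """Stand-in with the same parameter order as the signature above."""
        return (system, benchmark_type, dataset_path, instance, warehouse_size, run_number)

    # Snowflake: warehouse_size occupies the fifth positional slot, then run_number.
    print(get_results_path("snowflake", "tpch", "tpch/01", "BENCHMARK_WH", "XSMALL", 1))

    # Embucket: no warehouse size, so run_number must go by keyword;
    # passed positionally it would land in warehouse_size instead.
    print(get_results_path("embucket", "tpch", "tpch/01", "c7i_2xlarge", run_number=1))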
@@ -398,8 +398,7 @@ def parse_args():
     parser.add_argument("--platform", choices=["snowflake", "embucket", "both"], default="both")
     parser.add_argument("--runs", type=int, default=3)
     parser.add_argument("--benchmark-type", choices=["tpch", "tpcds"], default=os.environ.get("BENCHMARK_TYPE", "tpch"))
-    parser.add_argument("--dataset-name", help="Override the DATASET_NAME environment variable")
-    parser.add_argument("--scale-factor", help="Override the DATASET_SCALE_FACTOR environment variable")
+    parser.add_argument("--dataset-path", help="Override the DATASET_PATH environment variable")
     return parser.parse_args()

@@ -410,11 +409,8 @@ def parse_args():
     if args.benchmark_type != os.environ.get("BENCHMARK_TYPE", "tpch"):
         os.environ["BENCHMARK_TYPE"] = args.benchmark_type

-    if args.dataset_name:
-        os.environ["DATASET_NAME"] = args.dataset_name
-
-    if args.scale_factor:
-        os.environ["DATASET_SCALE_FACTOR"] = args.scale_factor
+    if args.dataset_path:
+        os.environ["DATASET_PATH"] = args.dataset_path

     # Execute benchmarks based on platform selection
     if args.platform == "snowflake":
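The CLI now funnels everything through the single DATASET_PATH variable. A small sketch of the override flow (argument and variable names are the ones from this diff; the hard-coded argv stands in for something like `python benchmark.py --dataset-path tpch/01`):

    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset-path", help="Override the DATASET_PATH environment variable")
    args = parser.parse_args(["--dataset-path", "tpch/01"])

    if args.dataset_path:
        os.environ["DATASET_PATH"] = args.dataset_path  # downstream code reads only this
    print(os.environ["DATASET_PATH"])  # -> tpch/01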

benchmark/data_preparation.py

Lines changed: 21 additions & 23 deletions
@@ -23,13 +23,12 @@ def create_tables(cursor, system):
         cursor.execute(ddl_sql.strip())


-def upload_parquet_to_snowflake_tables(cursor, dataset, dataset_scale_factor):
+def upload_parquet_to_snowflake_tables(cursor, dataset_path):
     """Upload parquet files to Snowflake tables from S3 stage."""
     table_names = get_table_names(fully_qualified_names_for_embucket=False)
     for table_name in table_names.values():
         print(f"Loading data into Snowflake table {table_name}...")
-        # Load data directly from the S3 stage
-        s3_path = f"s3://embucket-testdata/{dataset}/{dataset_scale_factor}/{table_name}.parquet"
+        s3_path = f"s3://embucket-testdata/{dataset_path}/{table_name}.parquet"
         cursor.execute(f"""
             COPY INTO {table_name}
             FROM '{s3_path}'
@@ -38,9 +37,13 @@ def upload_parquet_to_snowflake_tables(cursor, dataset_path):
             FILE_FORMAT = (TYPE = PARQUET)
             MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE;
         """)
+        result = cursor.fetchall()
+        if result and result[0][0] == 'Copy executed with 0 files processed.':
+            raise RuntimeError(f"No files processed for {table_name}. Check S3 path: {s3_path}")


-def upload_parquet_to_embucket_tables(cursor, dataset, dataset_scale_factor):
+
+def upload_parquet_to_embucket_tables(cursor, dataset_path):
     """Upload parquet files to Embucket tables using COPY INTO."""
     # Get fully qualified table names using the unified logic
     table_names = get_table_names(fully_qualified_names_for_embucket=True)
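The new guard leans on Snowflake returning a single status row whose first column reads 'Copy executed with 0 files processed.' when a COPY INTO matches no files; fetching it right after execute() turns a silently empty load into a hard failure. The same check as a reusable helper (the helper itself is a sketch, not part of this commit):

    def assert_copy_loaded_files(cursor, table_name: str, s3_path: str) -> None:
        """Raise if the COPY INTO just executed on `cursor` processed zero files."""
        result = cursor.fetchall()
        if result and result[0][0] == 'Copy executed with 0 files processed.':
            raise RuntimeError(f"No files processed for {table_name}. Check S3 path: {s3_path}")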
@@ -50,31 +53,31 @@ def upload_parquet_to_embucket_tables(cursor, dataset_path):
         bare_table_name = qualified_table_name.split('.')[-1]
         print(f"Loading data into Embucket table {qualified_table_name}...")

-        copy_sql = f"COPY INTO {qualified_table_name} FROM 's3://embucket-testdata/{dataset}/{dataset_scale_factor}/{bare_table_name}.parquet' FILE_FORMAT = (TYPE = PARQUET)"
+        copy_sql = f"COPY INTO {qualified_table_name} FROM 's3://embucket-testdata/{dataset_path}/{bare_table_name}.parquet' FILE_FORMAT = (TYPE = PARQUET)"
         cursor.execute(copy_sql)


-def prepare_data_for_embucket(dataset, dataset_scale_factor):
+def prepare_data_for_embucket(dataset_path):
     """Prepare data for Embucket: generate data, create tables, and load data."""
     # Connect to Embucket
     cursor = create_embucket_connection().cursor()
     # Create tables
     create_tables(cursor, SystemType.EMBUCKET)
     # Load data into Embucket tables
-    upload_parquet_to_embucket_tables(cursor, dataset, dataset_scale_factor)
+    upload_parquet_to_embucket_tables(cursor, dataset_path)

     cursor.close()
     print("Embucket data preparation completed successfully.")


-def prepare_data_for_snowflake(dataset, dataset_scale_factor):
+def prepare_data_for_snowflake(dataset_path):
     """Prepare data, create tables, and load data for Snowflake"""
     # Connect to Snowflake
     cursor = create_snowflake_connection().cursor()
     # Create tables
     create_tables(cursor, SystemType.SNOWFLAKE)
     # Load data into Snowflake tables
-    upload_parquet_to_snowflake_tables(cursor, dataset, dataset_scale_factor)
+    upload_parquet_to_snowflake_tables(cursor, dataset_path)

     cursor.close()
     print("Snowflake data preparation completed successfully.")
@@ -84,24 +87,19 @@ def prepare_data_for_snowflake(dataset_path):
     parser = argparse.ArgumentParser(description="Prepare data for Embucket/Snowflake benchmarks")
     parser.add_argument("--system", type=str, choices=["embucket", "snowflake", "both"],
                         default="both", help="Which system to prepare data for")
-    parser.add_argument("--dataset", type=str, default=os.environ.get("DATASET_NAME", "tpch"),
-                        help="Dataset name (default: from env or 'tpch')")
-    parser.add_argument("--scale", type=str, default=os.environ.get("DATASET_SCALE_FACTOR", "01"),
-                        help="Dataset scale factor (default: from env or '1')")
+    parser.add_argument("--dataset-path", type=str, default=os.environ.get("DATASET_PATH", "tpch/01"),
+                        help="Dataset path in format 'dataset/scale' (default: from env or 'tpch/01')")

     args = parser.parse_args()

-    # Override environment variables if specified in args
-    if args.dataset is not None:
-        os.environ["DATASET_NAME"] = args.dataset
-
-    if args.scale is not None:
-        os.environ["DATASET_SCALE_FACTOR"] = args.scale
+    # Override environment variable if specified in args
+    if args.dataset_path:
+        os.environ["DATASET_PATH"] = args.dataset_path

-    print(f"Preparing data for dataset: {args.dataset}, scale: {args.scale}")
+    print(f"Preparing data for dataset path: {args.dataset_path}")

-    # if args.system.lower() in ["embucket", "both"]:
-    #     prepare_data_for_embucket(args.dataset, args.scale)
+    if args.system.lower() in ["embucket", "both"]:
+        prepare_data_for_embucket(args.dataset_path)

     if args.system.lower() in ["snowflake", "both"]:
-        prepare_data_for_snowflake(args.dataset, args.scale)
+        prepare_data_for_snowflake(args.dataset_path)
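With dataset name and scale factor collapsed into one DATASET_PATH segment, both loaders build the same flat S3 layout. A quick sketch using the env_example default (the table name is illustrative):

    dataset_path = "tpch/01"   # DATASET_PATH, per env_example
    table_name = "lineitem"    # illustrative TPC-H table
    print(f"s3://embucket-testdata/{dataset_path}/{table_name}.parquet")
    # -> s3://embucket-testdata/tpch/01/lineitem.parquet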

benchmark/env_example

Lines changed: 4 additions & 3 deletions
@@ -2,14 +2,15 @@ SNOWFLAKE_ACCOUNT=your_snowflake_account
 SNOWFLAKE_USER=your_snowflake_user
 SNOWFLAKE_PASSWORD=your_snowflake_password
 SNOWFLAKE_DATABASE=benchmark_db
-SNOWFLAKE_WAREHOUSE=BENCHMARK_WH_XS
+SNOWFLAKE_WAREHOUSE=BENCHMARK_WH

 EMBUCKET_INSTANCE=c7i_2xlarge
+SNOWFLAKE_WAREHOUSE_SIZE=XSMALL

 BENCHMARK_TYPE=tpch
 DATASET_S3_BUCKET=embucket-testdata
-DATASET_NAME=tpch_data
-DATASET_SCALE_FACTOR=sf_01
+#dataset and scale factor path inside the s3 bucket
+DATASET_PATH=tpch/01

 EMBUCKET_ACCOUNT=embucket
 EMBUCKET_USER=embucket
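Since this commit renames several variables, a pre-flight check like the following can catch a stale .env early (a sketch; the list of names comes from this diff, the check itself does not exist in the repo):

    import os

    REQUIRED = ["SNOWFLAKE_WAREHOUSE", "SNOWFLAKE_WAREHOUSE_SIZE",
                "DATASET_PATH", "EMBUCKET_INSTANCE"]
    missing = [name for name in REQUIRED if not os.environ.get(name)]
    if missing:
        raise SystemExit(f"Missing environment variables: {', '.join(missing)}")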

benchmark/utils.py

Lines changed: 6 additions & 3 deletions
@@ -35,14 +35,13 @@ def create_embucket_connection():
     return conn


-
 def create_snowflake_connection():
     """Create Snowflake connection with environment-based config."""
     user = os.environ["SNOWFLAKE_USER"]
     password = os.environ["SNOWFLAKE_PASSWORD"]
     account = os.environ["SNOWFLAKE_ACCOUNT"]
-    database = os.environ["DATASET_NAME"]
-    schema = os.environ["DATASET_SCALE_FACTOR"]
+    database = os.environ["SNOWFLAKE_DATABASE"]
+    schema = os.environ["SNOWFLAKE_SCHEMA"]
     warehouse = os.environ["SNOWFLAKE_WAREHOUSE"]

     if not all([user, password, account, database, schema, warehouse]):
@@ -59,6 +58,9 @@ def create_snowflake_connection():

     conn = sf.connect(**connect_args)

+    conn.cursor().execute(f"CREATE OR REPLACE WAREHOUSE {warehouse} WITH WAREHOUSE_SIZE = '{os.environ['SNOWFLAKE_WAREHOUSE_SIZE']}';")
+    conn.cursor().execute(f"USE WAREHOUSE {warehouse};")
+
     conn.cursor().execute(f"CREATE DATABASE IF NOT EXISTS {database}")
     conn.cursor().execute(f"CREATE SCHEMA IF NOT EXISTS {schema}")
     conn.cursor().execute(f"USE SCHEMA {schema}")
@@ -67,3 +69,4 @@ def create_snowflake_connection():
     conn.cursor().execute("CREATE OR REPLACE TEMPORARY STAGE sf_prep_stage FILE_FORMAT = sf_parquet_format;")

     return conn
+
