2424logger  =  logging .getLogger (__name__ )
2525
2626
27- def  get_results_path (system : SystemType , benchmark_type : str , scale_factor : str ,
28-                      warehouse_or_instance : str , run_number : Optional [int ] =  None ) ->  str :
27+ def  get_results_path (system : SystemType , benchmark_type : str , dataset_path : str ,
28+                      instance : str ,  warehouse_size :  str   =   None , run_number : Optional [int ] =  None ) ->  str :
2929    """Generate path for storing benchmark results.""" 
3030    if  system  ==  SystemType .SNOWFLAKE :
31-         base_path  =  f"result/snowflake_{ benchmark_type }  _results/{ scale_factor }  /{ warehouse_or_instance }  " 
31+         # Use warehouse size in the path instead of warehouse name 
32+         base_path  =  f"result/snowflake_{ benchmark_type }  _results/{ dataset_path }  /{ warehouse_size }  " 
3233    elif  system  ==  SystemType .EMBUCKET :
33-         base_path  =  f"result/embucket_{ benchmark_type }  _results/{ scale_factor }  /{ warehouse_or_instance }  " 
34+         base_path  =  f"result/embucket_{ benchmark_type }  _results/{ dataset_path }  /{ instance }  " 
3435    else :
3536        raise  ValueError (f"Unsupported system: { system }  " )
3637
@@ -149,7 +150,7 @@ def run_on_sf(cursor, warehouse, tpch_queries):
149150    return  results 
150151
151152
152- def  run_on_emb (cursor ,  tpch_queries ):
153+ def  run_on_emb (tpch_queries ):
153154    """Run TPCH queries on Embucket with container restart before each query.""" 
154155    docker_manager  =  create_docker_manager ()
155156    executed_query_ids  =  []
@@ -271,11 +272,11 @@ def run_snowflake_benchmark(run_number: int):
271272    # Get benchmark configuration from environment variables 
272273    benchmark_type  =  os .environ .get ("BENCHMARK_TYPE" , "tpch" )
273274    warehouse  =  os .environ ["SNOWFLAKE_WAREHOUSE" ]
274-     dataset  =  os .environ ["DATASET_NAME " ]
275-     scale_factor  =  os .environ ["DATASET_SCALE_FACTOR " ]
275+     warehouse_size  =  os .environ ["SNOWFLAKE_WAREHOUSE_SIZE " ]
276+     dataset_path  =  os .environ ["DATASET_PATH " ]
276277
277278    logger .info (f"Starting Snowflake { benchmark_type }   benchmark run { run_number }  " )
278-     logger .info (f"Dataset: { dataset }  , Schema : { scale_factor }  , Warehouse : { warehouse }  " )
279+     logger .info (f"Dataset: { dataset_path }  , Warehouse : { warehouse }  , Size : { warehouse_size }  " )
279280
280281    # Get queries and run benchmark 
281282    queries  =  get_queries_for_benchmark (benchmark_type , for_embucket = False )
@@ -286,9 +287,9 @@ def run_snowflake_benchmark(run_number: int):
286287    # Disable query result caching for benchmark 
287288    sf_cursor .execute ("ALTER SESSION SET USE_CACHED_RESULT = FALSE;" )
288289
289-     sf_results  =  run_on_sf (sf_cursor ,warehouse , queries )
290+     sf_results  =  run_on_sf (sf_cursor ,  warehouse , queries )
290291
291-     results_path  =  get_results_path (SystemType .SNOWFLAKE , benchmark_type , scale_factor , warehouse , run_number )
292+     results_path  =  get_results_path (SystemType .SNOWFLAKE , benchmark_type , dataset_path , warehouse ,  warehouse_size , run_number )
292293    os .makedirs (os .path .dirname (results_path ), exist_ok = True )
293294    save_results_to_csv (sf_results , filename = results_path , system = SystemType .SNOWFLAKE )
294295
@@ -298,50 +299,49 @@ def run_snowflake_benchmark(run_number: int):
298299    sf_connection .close ()
299300
300301    # Check if we have 3 CSV files ready and calculate averages if so 
301-     results_dir  =  get_results_path (SystemType .SNOWFLAKE , benchmark_type , scale_factor , warehouse )
302+     results_dir  =  get_results_path (SystemType .SNOWFLAKE , benchmark_type , dataset_path , warehouse ,  warehouse_size )
302303    csv_files  =  glob .glob (os .path .join (results_dir , "snowflake_results_run_*.csv" ))
303304    if  len (csv_files ) ==  3 :
304305        logger .info ("Found 3 CSV files. Calculating averages..." )
305306        calculate_benchmark_averages (
306-             scale_factor ,
307-             warehouse , 
307+             dataset_path ,
308+             warehouse_size ,   # Pass warehouse size instead of name 
308309            SystemType .SNOWFLAKE ,
309310            benchmark_type 
310311        )
311312
312313    return  sf_results 
313314
314315
316+ 
315317def  run_embucket_benchmark (run_number : int ):
316318    """Run benchmark on Embucket with container restarts.""" 
317319    # Get benchmark configuration from environment variables 
318320    benchmark_type  =  os .environ .get ("BENCHMARK_TYPE" , "tpch" )
319321    instance  =  os .environ ["EMBUCKET_INSTANCE" ]
320-     dataset  =  os .environ .get ("EMBUCKET_DATASET" , os .environ ["DATASET_NAME" ])
321-     scale_factor  =  os .environ ["DATASET_SCALE_FACTOR" ]
322+     dataset_path  =  os .environ .get ("EMBUCKET_DATASET_PATH" , os .environ ["DATASET_PATH" ])
322323
323324    logger .info (f"Starting Embucket { benchmark_type }   benchmark run { run_number }  " )
324-     logger .info (f"Instance: { instance }  , Dataset: { dataset } , Scale Factor:  { scale_factor }  " )
325+     logger .info (f"Instance: { instance }  , Dataset: { dataset_path }  " )
325326
326327    # Get queries (the docker manager is created inside run_on_emb) 
327328    queries  =  get_queries_for_benchmark (benchmark_type , for_embucket = True )
328-     docker_manager  =  create_docker_manager ()
329329
330330    # Run benchmark 
331-     emb_results  =  run_on_emb (docker_manager ,  queries )
331+     emb_results  =  run_on_emb (queries )
332332
333-     results_path  =  get_results_path (SystemType .EMBUCKET , benchmark_type , scale_factor , instance , run_number )
333+     results_path  =  get_results_path (SystemType .EMBUCKET , benchmark_type , dataset_path , instance , run_number = run_number )
334334    os .makedirs (os .path .dirname (results_path ), exist_ok = True )
335335    save_results_to_csv (emb_results , filename = results_path , system = SystemType .EMBUCKET )
336336    logger .info (f"Embucket benchmark results saved to: { results_path }  " )
337337
338338    # Check if we have 3 CSV files ready and calculate averages 
339-     results_dir  =  get_results_path (SystemType .EMBUCKET , benchmark_type , scale_factor , instance )
339+     results_dir  =  get_results_path (SystemType .EMBUCKET , benchmark_type , dataset_path , instance )
340340    csv_files  =  glob .glob (os .path .join (results_dir , "embucket_results_run_*.csv" ))
341341    if  len (csv_files ) ==  3 :
342342        logger .info ("Found 3 CSV files. Calculating averages..." )
343343        calculate_benchmark_averages (
344-             scale_factor ,
344+             dataset_path ,
345345            instance ,
346346            SystemType .EMBUCKET ,
347347            benchmark_type 
@@ -398,8 +398,7 @@ def parse_args():
398398    parser .add_argument ("--platform" , choices = ["snowflake" , "embucket" , "both" ], default = "both" )
399399    parser .add_argument ("--runs" , type = int , default = 3 )
400400    parser .add_argument ("--benchmark-type" , choices = ["tpch" , "tpcds" ], default = os .environ .get ("BENCHMARK_TYPE" , "tpch" ))
401-     parser .add_argument ("--dataset-name" , help = "Override the DATASET_NAME environment variable" )
402-     parser .add_argument ("--scale-factor" , help = "Override the DATASET_SCALE_FACTOR environment variable" )
401+     parser .add_argument ("--dataset-path" , help = "Override the DATASET_PATH environment variable" )
403402    return  parser .parse_args ()
404403
405404
@@ -410,11 +409,8 @@ def parse_args():
410409    if  args .benchmark_type  !=  os .environ .get ("BENCHMARK_TYPE" , "tpch" ):
411410        os .environ ["BENCHMARK_TYPE" ] =  args .benchmark_type 
412411
413-     if  args .dataset_name :
414-         os .environ ["DATASET_NAME" ] =  args .dataset_name 
415- 
416-     if  args .scale_factor :
417-         os .environ ["DATASET_SCALE_FACTOR" ] =  args .scale_factor 
412+     if  args .dataset_path :
413+         os .environ ["DATASET_PATH" ] =  args .dataset_path 
418414
419415    # Execute benchmarks based on platform selection 
420416    if  args .platform  ==  "snowflake" :
0 commit comments