1+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
15+ import os
16+ import logging
17+ from typing import Dict , Any , List
18+
19+ from benchmark_framework import ModelBenchmark
20+
21+ import torch
22+
# Hub download acceleration and tokenizer parallelism for the benchmark runs.
os.environ.update({
    "HF_HUB_ENABLE_HF_TRANSFER": "1",
    "TOKENIZERS_PARALLELISM": "1",
})
# "high" permits TF32 matmuls: trades a little float32 precision for speed on CUDA.
torch.set_float32_matmul_precision("high")
26+
class LLaMABenchmark(ModelBenchmark):
    """Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""

    def __init__(self, logger: logging.Logger):
        super().__init__(logger)
        # Custom prompt for LLaMA, used when a scenario does not supply one.
        self._default_prompt = "Why dogs are so cute?"

    def get_scenario_configs(self) -> List[Dict[str, Any]]:
        """
        Get LLaMA-specific scenario configurations.

        Returns:
            List of scenario configuration dictionaries, one per benchmark
            variant (eager, compiled, kernelized).
        """
        return [
            # Eager variants
            {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
            # Compiled variants
            {"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
            # Kernelized variant (if available)
            {"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
        ]

    def _is_kernelization_available(self) -> bool:
        """Check if kernelization is available for LLaMA (optional `kernels` package)."""
        try:
            # The import itself is the availability probe; the names are unused here.
            from kernels import Mode, kernelize  # noqa: F401
            return True
        except ImportError:
            self.logger.debug("Kernelization not available: kernels module not found")
            return False

    def get_default_generation_config(self) -> Dict[str, Any]:
        """Get LLaMA-specific generation configuration (greedy decoding settings)."""
        return {
            "do_sample": False,
            "top_p": 1.0,
            "temperature": 1.0,
            "repetition_penalty": 1.0,
            "max_new_tokens": None,  # Will be set per scenario
        }

    def get_model_init_kwargs(self, config) -> Dict[str, Any]:
        """Get LLaMA-specific model initialization kwargs.

        Args:
            config: Benchmark configuration; must expose ``torch_dtype`` (the
                string name of a torch dtype, e.g. "float16") and
                ``attn_implementation``.
        """
        # NOTE: removed an unused local `from benchmark_framework import BenchmarkConfig`
        # import that served no purpose here.
        return {
            "torch_dtype": getattr(torch, config.torch_dtype),
            "attn_implementation": config.attn_implementation,
            "use_cache": True,
        }

    def get_default_torch_dtype(self) -> str:
        """Get default torch dtype for LLaMA."""
        return "float16"  # LLaMA works well with float16

    def get_default_device(self) -> str:
        """Get default device for LLaMA."""
        return "cuda"  # LLaMA prefers CUDA
89+
90+
def run_llama(logger, output_dir, **kwargs):
    """
    Run LLaMA benchmark with the given configuration.

    Args:
        logger: Logger instance
        output_dir: Output directory for results
        **kwargs: Additional configuration options

    Returns:
        Path to output file if successful
    """
    from benchmark_framework import BenchmarkRunner

    # Resolve the run configuration, falling back to these defaults for any
    # option the caller did not supply.
    defaults = {
        'model_id': 'meta-llama/Llama-2-7b-hf',
        'warmup_iterations': 3,
        'measurement_iterations': 5,
        'num_tokens_to_generate': 100,
        'include_sdpa_variants': True,
        'device': 'cuda',
        'torch_dtype': 'float16',
        'batch_size': 1,
    }
    cfg = {key: kwargs.get(key, default) for key, default in defaults.items()}
    commit_id = kwargs.get('commit_id', None)

    logger.info(f"Starting LLaMA benchmark for model: {cfg['model_id']}")
    logger.info(
        f"Configuration: warmup={cfg['warmup_iterations']}, "
        f"measurement={cfg['measurement_iterations']}, tokens={cfg['num_tokens_to_generate']}"
    )

    try:
        # Build the benchmark and its scenario list.
        benchmark = LLaMABenchmark(logger)
        scenarios = benchmark.create_scenarios(**cfg)
        logger.info(f"Created {len(scenarios)} benchmark scenarios")

        # Execute all scenarios through the shared runner.
        runner = BenchmarkRunner(logger, output_dir)
        results = runner.run_benchmark(benchmark, scenarios, commit_id=commit_id)
        if not results:
            logger.warning("No successful benchmark results")
            return None

        # Persist results under the bare model name (last path component of the ID).
        model_name = cfg['model_id'].split('/')[-1]
        output_file = runner.save_results(model_name, results)
        logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
        return output_file

    except Exception as e:
        logger.error(f"LLaMA benchmark failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        raise