@@ -142,6 +142,104 @@ def create_plot(
     print(f"Saved plot: {output_path}")
 
 
+def create_efficiency_plot(
+    deployment_results: Dict, plots_dir: Path, output_tokens: int = 200
+) -> None:
+    """
+    Create an efficiency plot showing tok/s/gpu vs tok/s/user with concurrency as labeled points.
+
+    Args:
+        deployment_results: Dict of deployment_type -> results
+        plots_dir: Directory to save plots
+        output_tokens: Average output tokens per request (default 200)
+    """
+    plt.figure(figsize=(12, 8))
+
+    colors = {"agg": "#1f77b4", "disagg": "#ff7f0e", "vanilla": "#2ca02c"}
+    markers = {"agg": "o", "disagg": "s", "vanilla": "^"}
+
+    for deployment_type, results in deployment_results.items():
+        tok_s_per_user = []
+        tok_s_per_gpu = []
+        concurrency_levels = []
+
+        for concurrency, metrics in results:
+            try:
+                # Get request throughput (requests/sec)
+                request_throughput = metrics["request_throughput"]["avg"]
+
+                # Calculate total tokens per second
+                total_tok_s = request_throughput * output_tokens
+
+                # Calculate tok/s per user and per GPU
+                tok_s_user = total_tok_s / concurrency
+                tok_s_gpu = total_tok_s  # Assuming 1 GPU per deployment
+
+                tok_s_per_user.append(tok_s_user)
+                tok_s_per_gpu.append(tok_s_gpu)
+                concurrency_levels.append(concurrency)
+
+            except KeyError as e:
+                print(
+                    f"Warning: Missing metric for {deployment_type} concurrency {concurrency}: {e}"
+                )
+                continue
+
+        if tok_s_per_user and tok_s_per_gpu:
+            # Plot points
+            color = colors.get(deployment_type, "#888888")
+            marker = markers.get(deployment_type, "o")
+
+            plt.scatter(
+                tok_s_per_user,
+                tok_s_per_gpu,
+                c=color,
+                marker=marker,
+                s=120,
+                alpha=0.8,
+                label=deployment_type.title(),
+                edgecolors="black",
+                linewidth=1.5,
+            )
+
+            # Add concurrency labels
+            for i, (x, y, c) in enumerate(
+                zip(tok_s_per_user, tok_s_per_gpu, concurrency_levels)
+            ):
+                plt.annotate(
+                    f"{c}",
+                    (x, y),
+                    xytext=(8, 8),
+                    textcoords="offset points",
+                    fontsize=10,
+                    fontweight="bold",
+                    ha="left",
+                )
+
+    plt.title("GPU Efficiency vs User Experience", fontsize=14, fontweight="bold")
+    plt.xlabel("Tokens/sec per User", fontsize=12)
+    plt.ylabel("Tokens/sec per GPU", fontsize=12)
+    plt.grid(True, alpha=0.3)
+
+    # Add a note about what the numbers represent
+    plt.figtext(
+        0.02,
+        0.02,
+        "Note: Numbers on dots indicate concurrency level",
+        fontsize=10,
+        style="italic",
+        alpha=0.7,
+    )
+
+    plt.legend()
+
+    plt.tight_layout()
+    output_path = plots_dir / "efficiency_tok_s_gpu_vs_user.png"
+    plt.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close()
+    print(f"Saved efficiency plot: {output_path}")
+
+
 def generate_plots(base_output_dir: Path) -> None:
     """
     Generate performance plots from benchmark results.
@@ -248,6 +346,9 @@ def generate_plots(base_output_dir: Path) -> None:
         log_scale_x=True,
     )
 
+    # 5. Efficiency plot: tok/s/gpu vs tok/s/user
+    create_efficiency_plot(deployment_results, plots_dir)
+
     # Generate summary
     summary_lines = [
         "Benchmark Results Summary",
@@ -273,6 +374,7 @@ def generate_plots(base_output_dir: Path) -> None:
             " - avg_inter_token_latency_vs_concurrency.png",
             " - request_throughput_vs_concurrency.png",
             " - avg_time_to_first_token_vs_concurrency.png",
+            " - efficiency_tok_s_gpu_vs_user.png",
         ]
     )
 
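For reference, a minimal sketch of how the new helper might be driven. The shape of `deployment_results` (deployment type mapped to a list of (concurrency, metrics) pairs, with `metrics["request_throughput"]["avg"]` in requests/sec) is inferred from the loop in `create_efficiency_plot`; the sample values and the `plots_dir` path below are hypothetical, not taken from the benchmark harness. With 2.5 req/s and 200 output tokens per request, the function plots 500 tok/s per GPU and, at concurrency 10, 50 tok/s per user.

    from pathlib import Path

    # Hypothetical sample data; real values come from the benchmark results files.
    deployment_results = {
        "agg": [
            (1, {"request_throughput": {"avg": 0.8}}),   # 160 tok/s/gpu, 160 tok/s/user
            (10, {"request_throughput": {"avg": 2.5}}),  # 500 tok/s/gpu, 50 tok/s/user
        ],
        "disagg": [
            (10, {"request_throughput": {"avg": 3.0}}),  # 600 tok/s/gpu, 60 tok/s/user
        ],
    }

    plots_dir = Path("plots")
    plots_dir.mkdir(exist_ok=True)
    create_efficiency_plot(deployment_results, plots_dir, output_tokens=200)
    # Writes plots/efficiency_tok_s_gpu_vs_user.png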