llamastack · yanxi0830 · Nov 15, 2024 · Nov 14, 2024 · Nov 15, 2024 · Nov 15, 2024
diff --git a/src/llama_stack_client/lib/cli/common/utils.py b/src/llama_stack_client/lib/cli/common/utils.py
@@ -3,15 +3,41 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from tabulate import tabulate
+from rich.console import Console
+from rich.table import Table
 
 
-def print_table_from_response(response, headers=()):
-    if not headers:
-        headers = sorted(response[0].__dict__.keys())
+def create_bar_chart(data, labels, title=""):
+    """Print a horizontal bar chart of ``data`` as a Rich table.
+
+    Each row pairs a label with a bar scaled against the largest value and a
+    ``value/total`` caption. Empty ``data`` prints just the table header.
+
+    Args:
+        data: Sequence of numbers, one per label (iterated more than once).
+        labels: Row labels paired positionally with ``data``; converted with ``str``
+            because Rich cannot render bare ints or floats as table cells.
+        title: Optional title shown above the table.
+    """
 
-    rows = []
-    for spec in response:
-        rows.append([spec.__dict__[headers[i]] for i in range(len(headers))])
+    bar_width = 20  # Width of a full-length bar, in characters
+    console = Console()
+    table = Table(title=title)
+    table.add_column("Score")
+    table.add_column("Count")
 
-    print(tabulate(rows, headers=headers, tablefmt="grid"))
+    # default=0 keeps an empty ``data`` from raising ValueError
+    max_value = max(data, default=0)
+    total_count = sum(data)
+
+    # Define a list of colors to cycle through
+    colors = ["green", "blue", "red", "yellow", "magenta", "cyan"]
+
+    for i, (label, value) in enumerate(zip(labels, data)):
+        # Guard the division: an all-zero data set has max_value == 0
+        bar_length = int((value / max_value) * bar_width) if max_value > 0 else 0
+        bar = "█" * bar_length + " " * (bar_width - bar_length)
+        color = colors[i % len(colors)]
+        table.add_row(str(label), f"[{color}]{bar}[/] {value}/{total_count}")
+
+    console.print(table)
diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py
@@ -9,8 +9,11 @@
 from typing import Optional
 
 import click
+from rich import print as rprint
 from tqdm.rich import tqdm
 
+from ..common.utils import create_bar_chart
+
 
 @click.command("run_benchmark")
 @click.argument("eval-task-ids", nargs=-1, required=True)
@@ -28,9 +31,20 @@
 @click.option(
     "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None
 )
+@click.option(
+    "--visualize",
+    is_flag=True,
+    default=False,
+    help="Visualize evaluation results after completion",
+)
 @click.pass_context
 def run_benchmark(
-    ctx, eval_task_ids: tuple[str, ...], eval_task_config: str, output_dir: str, num_examples: Optional[int]
+    ctx,
+    eval_task_ids: tuple[str, ...],
+    eval_task_config: str,
+    output_dir: str,
+    num_examples: Optional[int],
+    visualize: bool,
 ):
     """Run a evaluation benchmark"""
 
@@ -79,4 +93,19 @@ def run_benchmark(
         with open(output_file, "w") as f:
             json.dump(output_res, f, indent=2)
 
-        print(f"Results saved to: {output_file}")
+        rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n")
+
+        if visualize:
+            for scoring_fn in ["llm-as-judge::llm_as_judge_base"]:
+                # The scoring function name is hard-coded, so a run may not have produced
+                # it; skip with a notice rather than fail after the results were saved.
+                res = output_res[scoring_fn] if scoring_fn in output_res else []
+                scores = [r["score"] for r in res if "score" in r]
+                if not scores:
+                    rprint(f"[yellow]No scores to visualize for {scoring_fn}[/yellow]")
+                    continue
+                unique_scores = sorted(set(scores))
+                counts = [scores.count(s) for s in unique_scores]
+                # Rich table cells must be renderables, so numeric scores become strings
+                labels = [str(s) for s in unique_scores]
+                create_bar_chart(counts, labels, title=f"ScoringFunction = {scoring_fn}")
diff --git a/src/llama_stack_client/lib/cli/llama_stack_client.py b/src/llama_stack_client/lib/cli/llama_stack_client.py
@@ -57,7 +57,7 @@ def cli(ctx, endpoint: str, config: str | None):
         base_url=endpoint,
         provider_data={
             "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
-            "togethers_api_key": os.environ.get("TOGETHERS_API_KEY", ""),
+            "together_api_key": os.environ.get("TOGETHER_API_KEY", ""),
         },
     )
     ctx.obj = {"client": client}