9 changes: 9 additions & 0 deletions src/llama_stack_client/lib/cli/inference/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .inference import inference

__all__ = ["inference"]
54 changes: 54 additions & 0 deletions src/llama_stack_client/lib/cli/inference/inference.py
@@ -0,0 +1,54 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Optional

import click
import yaml
from rich.console import Console
from rich.table import Table

from ..common.utils import handle_client_errors


@click.group()
def inference():
    """Query details about available inference endpoints on the distribution."""
    pass


@click.command("chat-completion")
@click.option("--message", required=True, help="Message")
@click.option("--stream", is_flag=True, help="Streaming", default=False)
@click.option("--model-id", required=False, help="Model ID")
@click.pass_context
@handle_client_errors("inference chat-completion")
def chat_completion(ctx, message: str, stream: bool, model_id: Optional[str]):
    """Run a chat completion against the distribution's inference endpoint."""
    client = ctx.obj["client"]
    console = Console()

    if not model_id:
        available_models = [model.identifier for model in client.models.list()]
        model_id = available_models[0]

    response = client.inference.chat_completion(
        model_id=model_id,
        messages=[{"role": "user", "content": message}],
        stream=stream,
    )
    if not stream:
        console.print(response)
    else:
        for chunk in response:
            if chunk.event.event_type == "complete":
                console.print(chunk.event.delta)
            elif chunk.event.event_type == "progress":
                console.print(chunk.event.delta, end="")


# Register subcommands
inference.add_command(chat_completion)
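
For context, the new `chat-completion` subcommand is a thin wrapper over the client library, so the same request can be issued directly in Python. A minimal sketch, assuming a distribution is serving locally (the `base_url` value here is illustrative, not part of this PR):

```python
from llama_stack_client import LlamaStackClient

# Illustrative endpoint; point this at a running distribution.
client = LlamaStackClient(base_url="http://localhost:5000")

# Mirror the subcommand's fallback: use the first registered model
# when no --model-id is supplied.
model_id = client.models.list()[0].identifier

response = client.inference.chat_completion(
    model_id=model_id,
    messages=[{"role": "user", "content": "hello"}],
    stream=False,
)
print(response)
```

With `stream=True` the call returns an iterator of chunks instead, which is why the command prints `progress` deltas with `end=""` so partial tokens stay on one line.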
3 changes: 2 additions & 1 deletion src/llama_stack_client/lib/cli/llama_stack_client.py
@@ -16,12 +16,12 @@
from .datasets import datasets
from .eval import eval
from .eval_tasks import eval_tasks
from .inference import inference
from .memory_banks import memory_banks
from .models import models

from .providers import providers
from .scoring_functions import scoring_functions

from .shields import shields


@@ -73,6 +73,7 @@ def cli(ctx, endpoint: str, config: str | None):
cli.add_command(configure, "configure")
cli.add_command(scoring_functions, "scoring_functions")
cli.add_command(eval, "eval")
cli.add_command(inference, "inference")


def main():
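
Once registered, the group is reachable as `llama-stack-client inference chat-completion --message "hello" [--stream] [--model-id ID]`. One way to smoke-test the wiring in-process is click's test runner; a rough sketch, assuming the `cli` group exposes its `endpoint` parameter as `--endpoint` (inferred from the signature above, not confirmed by this diff):

```python
from click.testing import CliRunner

from llama_stack_client.lib.cli.llama_stack_client import cli

runner = CliRunner()
# --endpoint value is a placeholder; a reachable distribution is
# required for the request itself to succeed.
result = runner.invoke(
    cli,
    ["--endpoint", "http://localhost:5000",
     "inference", "chat-completion", "--message", "hello"],
)
print(result.exit_code, result.output)
```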