fix: Make cli example more like dynamo-run

grahamking · grahamking · commit 5a33bce27e8f · 2025-07-08T17:01:58.000-04:00
diff --git a/examples/cli/cli.py b/examples/cli/cli.py
@@ -1,10 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-# Example cli using the Python bindings.
-# Usage: `python cli.py text mistralrs --model-path <your-model>`.
-# If `--model-path` not provided defaults to Qwen3 0.6B.
-# Must be in a virtualenv with the bindings (or wheel) installed.
+# Example cli using the Python bindings, similar to `dynamo-run`.
+# Usage: `python cli.py in=text out=mistralrs <your-model>`.
+# Must be in a virtualenv with the Dynamo bindings (or wheel) installed.
 
 import argparse
 import asyncio
@@ -18,34 +17,36 @@
 
 
 def parse_args():
-    """
-    Parses command-line arguments for the program.
-    """
+    in_mode = "text"
+    out_mode = "echo"
+    batch_file = None  # Specific to in_mode="batch"
+
+    # List to hold arguments that argparse will process (flags and model path)
+    argparse_args = []
+
+    # --- Step 1: Manual Pre-parsing for 'in=' and 'out=' ---
+    # Iterate through sys.argv[1:] to extract in= and out=
+    # and collect remaining arguments for argparse.
+    for arg in sys.argv[1:]:
+        if arg.startswith("in="):
+            in_val = arg[len("in=") :]
+            if in_val.startswith("batch:"):
+                in_mode = "batch"
+                batch_file = in_val[len("batch:") :]
+            else:
+                in_mode = in_val
+        elif arg.startswith("out="):
+            out_mode = arg[len("out=") :]
+        else:
+            # This argument is not 'in=' or 'out=', so it's either a flag or the model path
+            argparse_args.append(arg)
+
+    # --- Step 2: Argparse for flags and the model path ---
     parser = argparse.ArgumentParser(
-        description="Run a Dynamo LLM engine with configurable parameters.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,  # Show default values in help
+        description="Dynamo CLI: Connect inputs to an engine",
+        formatter_class=argparse.RawTextHelpFormatter,  # To preserve multi-line help formatting
     )
 
-    # Positional arguments (replacing sys.argv[1] and sys.argv[2])
-    parser.add_argument(
-        "input_source",
-        type=str,
-        help="Input source for the engine: 'text', 'http', 'stdin', 'batch:file.jsonl', 'dyn://<name>'",
-    )
-    parser.add_argument(
-        "output_type",
-        type=str,
-        help="Output type (engine type): 'echo', 'mistralrs', 'llamacpp', 'dyn'",
-    )
-
-    # Optional arguments corresponding to EntrypointArgs fields
-    # model_path: Option<PathBuf>
-    parser.add_argument(
-        "--model-path",
-        type=Path,
-        default=Path("Qwen/Qwen3-0.6B"),
-        help="Path to the model directory.",
-    )
     # model_name: Option<String>
     parser.add_argument("--model-name", type=str, help="Name of the model to load.")
     # model_config: Option<PathBuf>
@@ -69,8 +70,45 @@ def parse_args():
     # http_port: Option<u16>
     parser.add_argument("--http-port", type=int, help="HTTP port for the engine (u16).")
 
-    args = parser.parse_args()
-    return args
+    # TODO: Not yet used here
+    parser.add_argument(
+        "--tensor-parallel-size",
+        type=int,
+        help="Tensor parallel size for the model (e.g., 4).",
+    )
+
+    # Add the positional model argument.
+    # It's made optional (nargs='?') because its requirement depends on 'out_mode',
+    # which is handled in post-parsing validation.
+    parser.add_argument(
+        "model",
+        nargs="?",  # Make it optional for argparse, we'll validate manually
+        help="Path to the model (e.g., Qwen/Qwen3-0.6B).\n" "Required unless out=dyn.",
+    )
+
+    # Parse the arguments that were not 'in=' or 'out='
+    flags = parser.parse_args(argparse_args)
+
+    # --- Step 3: Post-parsing Validation and Final Assignment ---
+
+    # Validate 'batch' mode requires a file path
+    if in_mode == "batch" and not batch_file:
+        parser.error("Batch mode requires a file path: in=batch:FILE")
+
+    # Validate model path requirement based on 'out_mode'
+    if out_mode != "dyn" and flags.model is None:
+        parser.error("Model path is required unless out=dyn.")
+
+    # Consolidate all parsed arguments into a dictionary
+    parsed_args = {
+        "in_mode": in_mode,
+        "out_mode": out_mode,
+        "batch_file": batch_file,  # Will be None if in_mode is not "batch"
+        "model_path": flags.model,
+        "flags": flags,
+    }
+
+    return parsed_args
 
 
 async def run():
@@ -79,39 +117,39 @@ async def run():
 
     args = parse_args()
 
-    input = args.input_source
-    output = args.output_type
-
     engine_type_map = {
         "echo": EngineType.Echo,
         "mistralrs": EngineType.MistralRs,
         "llamacpp": EngineType.LlamaCpp,
         "dyn": EngineType.Dynamic,
     }
-    engine_type = engine_type_map.get(output)
+    out_mode = args["out_mode"]
+    engine_type = engine_type_map.get(out_mode)
     if engine_type is None:
-        print(f"Unsupported output type: {output}")
+        print(f"Unsupported output type: {out_mode}")
         sys.exit(1)
 
     # TODO: The "vllm", "sglang" and "trtllm" cases should call Python directly
 
-    entrypoint_kwargs = {"model_path": args.model_path}
-    if args.model_name is not None:
-        entrypoint_kwargs["model_name"] = args.model_name
-    if args.model_config is not None:
-        entrypoint_kwargs["model_config"] = args.model_config
-    if args.context_length is not None:
-        entrypoint_kwargs["context_length"] = args.context_length
-    if args.template_file is not None:
-        entrypoint_kwargs["template_file"] = args.template_file
-    if args.kv_cache_block_size is not None:
-        entrypoint_kwargs["kv_cache_block_size"] = args.kv_cache_block_size
-    if args.http_port is not None:
-        entrypoint_kwargs["http_port"] = args.http_port
+    entrypoint_kwargs = {"model_path": args["model_path"]}
+
+    flags = args["flags"]
+    if flags.model_name is not None:
+        entrypoint_kwargs["model_name"] = flags.model_name
+    if flags.model_config is not None:
+        entrypoint_kwargs["model_config"] = flags.model_config
+    if flags.context_length is not None:
+        entrypoint_kwargs["context_length"] = flags.context_length
+    if flags.template_file is not None:
+        entrypoint_kwargs["template_file"] = flags.template_file
+    if flags.kv_cache_block_size is not None:
+        entrypoint_kwargs["kv_cache_block_size"] = flags.kv_cache_block_size
+    if flags.http_port is not None:
+        entrypoint_kwargs["http_port"] = flags.http_port
 
     e = EntrypointArgs(engine_type, **entrypoint_kwargs)
     engine = await make_engine(runtime, e)
-    await run_input(runtime, input, engine)
+    await run_input(runtime, args["in_mode"], engine)
 
 
 if __name__ == "__main__":
diff --git a/lib/bindings/python/README.md b/lib/bindings/python/README.md
@@ -49,9 +49,11 @@ maturin develop --uv
 5. Experimental: To allow using mistral.rs and llama.cpp via the bindings, build with feature flags:
 
 ```
-maturin develop --features mistralrs,llamacpp
+maturin develop --features mistralrs,llamacpp --release
 ```
 
+`--release` is optional. It builds slower but the resulting library is significantly faster.
+
 See `examples/cli/cli.py` for usage.
 
 They will both be built for CUDA by default. If you see a runtime error `CUDA_ERROR_STUB_LIBRARY` this is because