feat(frontend): dynamo frontend alternative to dynamo-run

grahamking · grahamking · commit 4b8c89080df6 · 2025-07-11T09:54:09.000-04:00
Instead of `dynamo-run in=http out=dyn` we can now do either:
- `dynamo frontend [--http-port 8080]` OR
- `python frontend.py`

No need to build or install the `dynamo-run` Rust binary. It is bundled
in `pip install ai-dynamo`.

This will have the same performance as `dynamo-run` because it uses the Rust
library.

In time we could split the HTTP server, pre-processor and router into
separate bindings and call them here, allowing easy customization.
diff --git a/deploy/sdk/src/dynamo/sdk/cli/cli.py b/deploy/sdk/src/dynamo/sdk/cli/cli.py
@@ -24,6 +24,7 @@
 
 from dynamo.sdk.cli.build import build
 from dynamo.sdk.cli.env import env
+from dynamo.sdk.cli.frontend import frontend
 from dynamo.sdk.cli.run import run
 from dynamo.sdk.cli.serve import serve
 
@@ -75,6 +76,10 @@ def main(
     add_help_option=False,
 )(run)
 cli.command()(build)
+# Register `dynamo frontend`. With these context settings Click lets
+# unrecognised options and extra arguments through instead of rejecting them,
+# and no automatic `--help` option is added to the command.
+cli.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    add_help_option=False,
+)(frontend)
 
 if __name__ == "__main__":
     cli()
diff --git a/deploy/sdk/src/dynamo/sdk/cli/frontend.py b/deploy/sdk/src/dynamo/sdk/cli/frontend.py
@@ -0,0 +1,68 @@
+#  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+
+# Start a frontend node. This runs:
+# - OpenAI HTTP server.
+# - Auto-discovery: Watches etcd for engine/worker registration (via `register_llm`).
+# - Pre-processor: Prompt templating and tokenization.
+# - Router, defaulting to round-robin (TODO: Add flags to enable KV routing).
+
+import argparse
+import asyncio
+from typing import Optional
+
+import typer
+import uvloop
+
+from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
+from dynamo.runtime import DistributedRuntime
+
+
+# Called as `dynamo frontend`
+def frontend(
+    ctx: typer.Context, http_port: int = 8080, kv_cache_block_size: Optional[int] = None
+):
+    """Start an OpenAI compatible HTTP server"""
+
+    uvloop.run(main(http_port=http_port, kv_cache_block_size=kv_cache_block_size))
+
+
+# Called as `python frontend.py`
+def cli():
+    """Start an OpenAI compatible HTTP server"""
+    parser = argparse.ArgumentParser(
+        description="Dynamo Frontend: HTTP+Pre-processor+Router",
+        formatter_class=argparse.RawTextHelpFormatter,  # To preserve multi-line help formatting
+    )
+    parser.add_argument(
+        "--kv-cache-block-size", type=int, help="KV cache block size (u32)."
+    )
+    parser.add_argument("--http-port", type=int, help="HTTP port for the engine (u16).")
+    flags = parser.parse_args()
+
+    uvloop.run(
+        main(http_port=flags.http_port, kv_cache_block_size=flags.kv_cache_block_size)
+    )
+
+
+async def main(http_port=8080, kv_cache_block_size=None):
+    """Common entry point"""
+
+    # If we find cases where subprocess does not stop we may need this. Seem OK so far.
+    # atexit.register(cleanup_subprocess)
+
+    runtime = DistributedRuntime(asyncio.get_running_loop(), False)
+
+    entrypoint_kwargs = {"http_port": http_port}
+    if kv_cache_block_size:
+        entrypoint_kwargs["kv_cache_block_size"] = kv_cache_block_size
+
+    # out=dyn
+    e = EntrypointArgs(EngineType.Dynamic, **entrypoint_kwargs)
+    engine = await make_engine(runtime, e)
+    # in=http
+    await run_input(runtime, "http", engine)
+
+
+# Script entry point: running this file directly goes through the argparse-based `cli()`.
+if __name__ == "__main__":
+    cli()