feat(frontend): python frontend.py alternative to dynamo-run

grahamking · grahamking · commit bd71f35f93b6 · 2025-07-11T13:02:43.000-04:00
No need to build or install the `dynamo-run` Rust binary.

This will have the same performance as `dynamo-run`, because it uses the Rust
library.

In time we could split the HTTP server, pre-processor and router into
separate bindings and call them here, to allow easy customization.
diff --git a/components/frontend/README b/components/frontend/README
@@ -0,0 +1,6 @@
+Dynamo ingress / frontend node.
+
+This runs an OpenAI-compliant HTTP server, a pre-processor, and a router in a single process. Engines / workers are auto-discovered when they call `register_llm`.
+
+Requires `etcd` and `nats-server -js`.
+
diff --git a/components/frontend/main.py b/components/frontend/main.py
@@ -0,0 +1,57 @@
+#  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+
+# Start a frontend node. This runs:
+# - OpenAI HTTP server.
+# - Auto-discovery: Watches etcd for engine/worker registration (via `register_llm`).
+# - Pre-processor: Prompt templating and tokenization.
+# - Router, defaulting to round-robin (TODO: Add flags to enable KV routing).
+
+import argparse
+import asyncio
+
+import uvloop
+
+from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
+from dynamo.runtime import DistributedRuntime
+
+
+def _bounded_int(name, upper):
+    """Build an argparse `type` callable accepting integers in [0, upper]."""
+
+    def convert(text):
+        try:
+            value = int(text)
+        except ValueError:
+            raise argparse.ArgumentTypeError(
+                f"{name} must be an integer, got {text!r}"
+            ) from None
+        if not 0 <= value <= upper:
+            raise argparse.ArgumentTypeError(
+                f"{name} must be between 0 and {upper}, got {value}"
+            )
+        return value
+
+    return convert
+
+
+def parse_args(argv=None):
+    """Parse the frontend command-line flags into keyword arguments.
+
+    Args:
+        argv: Optional list of argument strings. When None (the default),
+            argparse reads the arguments from `sys.argv[1:]`.
+
+    Returns:
+        A dict holding only the flags that have a value, keyed by
+        `http_port` and `kv_cache_block_size`. `http_port` is always present
+        because it defaults to 8080.
+
+    Raises:
+        SystemExit: Raised by argparse for `--help`, for unknown flags, and
+            for values outside the documented u16 / u32 ranges.
+    """
+    parser = argparse.ArgumentParser(
+        description="Dynamo Frontend: HTTP+Pre-processor+Router",
+        formatter_class=argparse.RawTextHelpFormatter,  # To preserve multi-line help formatting
+    )
+    # The help text documents these flags as u32 / u16, so reject values outside
+    # those ranges here with a usage error instead of passing them on.
+    parser.add_argument(
+        "--kv-cache-block-size",
+        type=_bounded_int("--kv-cache-block-size", 2**32 - 1),
+        help="KV cache block size (u32).",
+    )
+    parser.add_argument(
+        "--http-port",
+        type=_bounded_int("--http-port", 2**16 - 1),
+        default=8080,
+        help="HTTP port for the engine (u16).",
+    )
+    flags = parser.parse_args(argv)
+
+    # Drop unset flags so that the consumer of these kwargs applies its own defaults.
+    return {name: value for name, value in vars(flags).items() if value is not None}
+
+
+async def main():
+    """Run the frontend node until the HTTP input finishes or is cancelled.
+
+    Parses the command-line flags, creates the distributed runtime on the
+    running event loop, builds an `EngineType.Dynamic` engine from the flags
+    and hands it to the "http" input.
+    """
+    # Parse flags before creating the runtime so that `--help` and invalid
+    # arguments exit immediately, without first setting up the runtime.
+    flags = parse_args()
+    # NOTE(review): the meaning of the second positional argument (False) is
+    # not visible here - confirm against the DistributedRuntime bindings.
+    runtime = DistributedRuntime(asyncio.get_running_loop(), False)
+
+    # out=dyn
+    entrypoint_args = EntrypointArgs(EngineType.Dynamic, **flags)
+    engine = await make_engine(runtime, entrypoint_args)
+
+    # in=http
+    try:
+        await run_input(runtime, "http", engine)
+    except asyncio.exceptions.CancelledError:
+        # Deliberately swallowed: cancellation (presumably shutdown / Ctrl-C)
+        # is treated as a normal exit rather than an error.
+        pass
+
+
+# Script entry point: run the main() coroutine via uvloop.run() when this file
+# is executed directly (not when it is imported).
+if __name__ == "__main__":
+    uvloop.run(main())