diff --git a/components/ingress/README b/components/ingress/README new file mode 100644 index 0000000000..93f6f33045 --- /dev/null +++ b/components/ingress/README @@ -0,0 +1,9 @@ +# Dynamo ingress / frontend node. + +Usage: `python -m dynamo.ingress [--http-port ]`. Port defaults to 8080. + +This runs an OpenAI compliant HTTP server, a pre-processor, and a router in a single process. Engines / workers are auto-discovered when they call `register_llm`. + +Requires `etcd` and `nats-server -js`. + +This is the same as `dynamo-run in=http out=dyn`. diff --git a/components/ingress/src/dynamo/ingress/__init__.py b/components/ingress/src/dynamo/ingress/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/components/ingress/src/dynamo/ingress/__main__.py b/components/ingress/src/dynamo/ingress/__main__.py new file mode 100644 index 0000000000..6e39e4ddd8 --- /dev/null +++ b/components/ingress/src/dynamo/ingress/__main__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from dynamo.ingress.main import main + +if __name__ == "__main__": + main() diff --git a/components/ingress/src/dynamo/ingress/main.py b/components/ingress/src/dynamo/ingress/main.py new file mode 100644 index 0000000000..a4b8c9ee3e --- /dev/null +++ b/components/ingress/src/dynamo/ingress/main.py @@ -0,0 +1,61 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Usage: `python -m dynamo.ingress [args]` +# +# Start a frontend node. This runs: +# - OpenAI HTTP server. +# - Auto-discovery: Watches etcd for engine/worker registration (via `register_llm`). +# - Pre-processor: Prompt templating and tokenization. +# - Router, defaulting to round-robin (TODO: Add flags to enable KV routing). + +import argparse +import asyncio + +import uvloop + +from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input +from dynamo.runtime import DistributedRuntime + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Dynamo Frontend: HTTP+Pre-processor+Router", + formatter_class=argparse.RawTextHelpFormatter, # To preserve multi-line help formatting + ) + parser.add_argument( + "--kv-cache-block-size", type=int, help="KV cache block size (u32)." + ) + parser.add_argument( + "--http-port", type=int, default=8080, help="HTTP port for the engine (u16)." + ) + flags = parser.parse_args() + + kwargs = {"http_port": flags.http_port} + if flags.kv_cache_block_size is not None: + kwargs["kv_cache_block_size"] = flags.kv_cache_block_size + + return kwargs + + +async def async_main(): + runtime = DistributedRuntime(asyncio.get_running_loop(), False) + flags = parse_args() + + # out=dyn + e = EntrypointArgs(EngineType.Dynamic, **flags) + engine = await make_engine(runtime, e) + + # in=http + try: + await run_input(runtime, "http", engine) + except asyncio.exceptions.CancelledError: + pass + + +def main(): + uvloop.run(async_main()) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index aa623d8dfb..e9b12b8ac8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,7 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo"] +packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/ingress/src/dynamo"] # This section is for including the binaries in the wheel package # but doesn't make them executable scripts in the venv bin directory diff --git a/tests/serve/test_dynamo_serve.py b/tests/serve/test_dynamo_serve.py index f422757798..dbbc246d62 100644 --- a/tests/serve/test_dynamo_serve.py +++ b/tests/serve/test_dynamo_serve.py @@ -285,7 +285,6 @@ def __init__( (f"http://localhost:{port}/v1/models", self._check_model) ] health_check_ports = [port] - env = None self.port = port self.graph = graph @@ -305,7 +304,6 @@ def __init__( "from multiprocessing.spawn", ], log_dir=request.node.name, - env=env, # Pass the environment variables ) def _check_model(self, response):