File tree Expand file tree Collapse file tree 3 files changed +4
-8
lines changed
components/backends/sglang Expand file tree Collapse file tree 3 files changed +4
-8
lines changed Original file line number Diff line number Diff line change @@ -115,11 +115,11 @@ Dynamo provides a simple way to spin up a local set of inference components incl
115115
116116```
117117# Start an OpenAI compatible HTTP server, a pre-processor (prompt templating and tokenization) and a router:
118- python -m dynamo.frontend [ --http-port 8080]
118+ python -m dynamo.frontend --http-port 8080
119119
120120# Start the SGLang engine, connecting to NATS and etcd to receive requests. You can run several of these,
121121# both for the same model and for multiple models. The frontend node will discover them.
122- python -m dynamo.sglang.worker deepseek-ai/DeepSeek-R1-Distill-Llama-8B
122+ python -m dynamo.sglang.worker --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --skip-tokenizer-init
123123```
124124
125125#### Send a Request
Original file line number Diff line number Diff line change @@ -67,8 +67,6 @@ docker run \
6767``` bash
6868# run ingress
6969python3 -m dynamo.frontend --http-port=8000 &
70- # optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below)
71- python3 utils/sgl_http_server.py --ns dynamo &
7270# run prefill worker
7371SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \
7472MC_TE_METRIC=true \
@@ -82,15 +80,14 @@ NCCL_CUMEM_ENABLE=1 \
8280SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
8381SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
8482PYTHONUNBUFFERED=1 \
85- python3 components/worker.py \
83+ python3 -m dynamo.sglang.worker \
8684 --served-model-name deepseek-ai/DeepSeek-R1 \
8785 --model-path /model/ \
8886 --skip-tokenizer-init \
8987 --trust-remote-code \
9088 --disaggregation-mode prefill \
9189 --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \
9290 --disaggregation-bootstrap-port 30001 \
93- --disaggregation-transfer-backend nixl \
9491 --nnodes 2 \
9592 --node-rank 0 \
9693 --tp-size 8 \
@@ -134,7 +131,7 @@ NCCL_CUMEM_ENABLE=1 \
134131SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
135132SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
136133PYTHONUNBUFFERED=1 \
137- python3 components/decode_worker.py \
134+ python3 -m dynamo.sglang.decode_worker \
138135 --served-model-name deepseek-ai/DeepSeek-R1 \
139136 --model-path /model/ \
140137 --skip-tokenizer-init \
Original file line number Diff line number Diff line change @@ -94,7 +94,6 @@ if [ "$mode" = "prefill" ]; then
9494 --disaggregation-mode prefill \
9595 --dist-init-addr "$HOST_IP:$PORT" \
9696 --disaggregation-bootstrap-port 30001 \
97- --disaggregation-transfer-backend nixl \
9897 --nnodes "$TOTAL_NODES" \
9998 --node-rank "$RANK" \
10099 --tp-size "$TOTAL_GPUS" \
You can’t perform that action at this time.
0 commit comments