fix: readme instructions for worker running (#2266)

ishandhanani · web-flow · commit 053ac33e1c31 · 2025-08-04T09:59:02.000-07:00
diff --git a/README.md b/README.md
@@ -115,11 +115,11 @@ Dynamo provides a simple way to spin up a local set of inference components incl
 
 ```
 # Start an OpenAI compatible HTTP server, a pre-processor (prompt templating and tokenization) and a router:
-python -m dynamo.frontend [--http-port 8080]
+python -m dynamo.frontend --http-port 8080
 
 # Start the SGLang engine, connecting to NATS and etcd to receive requests. You can run several of these,
 # both for the same model and for multiple models. The frontend node will discover them.
-python -m dynamo.sglang.worker deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+python -m dynamo.sglang.worker --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --skip-tokenizer-init
 ```
 
 #### Send a Request
diff --git a/components/backends/sglang/docs/dsr1-wideep-gb200.md b/components/backends/sglang/docs/dsr1-wideep-gb200.md
@@ -67,8 +67,6 @@ docker run \
 ```bash
 # run ingress
 python3 -m dynamo.frontend --http-port=8000 &
-# optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below)
-python3 utils/sgl_http_server.py --ns dynamo &
 # run prefill worker
 SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \
 MC_TE_METRIC=true \
@@ -82,15 +80,14 @@ NCCL_CUMEM_ENABLE=1 \
 SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
 SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
 PYTHONUNBUFFERED=1 \
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
   --served-model-name deepseek-ai/DeepSeek-R1 \
   --model-path /model/ \
   --skip-tokenizer-init \
   --trust-remote-code \
   --disaggregation-mode prefill \
   --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \
   --disaggregation-bootstrap-port 30001 \
-  --disaggregation-transfer-backend nixl \
   --nnodes 2 \
   --node-rank 0 \
   --tp-size 8 \
@@ -134,7 +131,7 @@ NCCL_CUMEM_ENABLE=1 \
 SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
 SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
 PYTHONUNBUFFERED=1 \
-python3 components/decode_worker.py \
+python3 -m dynamo.sglang.decode_worker \
   --served-model-name deepseek-ai/DeepSeek-R1 \
   --model-path /model/ \
   --skip-tokenizer-init \
diff --git a/components/backends/sglang/slurm_jobs/scripts/gb200.sh b/components/backends/sglang/slurm_jobs/scripts/gb200.sh
@@ -94,7 +94,6 @@ if [ "$mode" = "prefill" ]; then
             --disaggregation-mode prefill \
             --dist-init-addr "$HOST_IP:$PORT" \
             --disaggregation-bootstrap-port 30001 \
-            --disaggregation-transfer-backend nixl \
             --nnodes "$TOTAL_NODES" \
             --node-rank "$RANK" \
             --tp-size "$TOTAL_GPUS" \