diff --git a/examples/deepspeed-multinode/sky.yaml b/examples/deepspeed-multinode/sky.yaml
index 378992d66a4..37d7445a2a1 100644
--- a/examples/deepspeed-multinode/sky.yaml
+++ b/examples/deepspeed-multinode/sky.yaml
@@ -18,8 +18,15 @@ resources:
   # accelerators: A100-80GB:1 # Azure, GCP, SCP
   # accelerators: A10G:1 # AWS. Will OOM for (1) single_node/run_1.3b_lora.sh (2) multi_node/run_66b.sh.
   # accelerators: T4:1 # AWS, Azure, GCP. Will OOM for (1) single_node/run_1.3b_lora.sh (2) multi_node/run_66b.sh.
+
 num_nodes: 2
 
+envs:
+  MY_VAR_1: "hello"
+  MY_VAR_2: "world"
+  # Comma-separated list of env vars to propagate to all nodes in deepspeed. If you add an env above, add it to this list.
+  DEEPSPEED_ENVS: "MY_VAR_1,MY_VAR_2,SKYPILOT_NODE_RANK"
+
 setup: |
   git clone https://github.com/microsoft/DeepSpeedExamples.git || true
   cd DeepSpeedExamples
@@ -60,6 +67,11 @@ run: |
   HOSTFILE_PATH=/tmp/hostfile.${SKYPILOT_TASK_ID}
   python -c "import os;n_gpus=os.environ['SKYPILOT_NUM_GPUS_PER_NODE'];print('\n'.join([f'{ip} slots={n_gpus}' for ip in os.environ['SKYPILOT_NODE_IPS'].splitlines()]))" > ${HOSTFILE_PATH}
 
+  # Generate .deepspeed_env to propagate env vars to all workers spawned by DeepSpeed.
+  # Each name listed in DEEPSPEED_ENVS is written as NAME="value" (empty if unset); blank or missing entries are skipped.
+  echo "Generating .deepspeed_env"
+  python3 -c 'import os; names = [v.strip() for v in os.getenv("DEEPSPEED_ENVS", "").split(",") if v.strip()]; f = open(".deepspeed_env", "w"); f.write("".join(["{}=\"{}\"\n".format(v, os.getenv(v, "")) for v in names])); f.close()'
+
   echo "*******************************************"
   echo "Hostfile: ${HOSTFILE_PATH}"
   cat ${HOSTFILE_PATH}