pp
NouamaneTazi committed Dec 27, 2024
1 parent 4e075ab commit 67c5ebb
Showing 14 changed files with 849 additions and 423 deletions.
4 changes: 2 additions & 2 deletions examples/config_tiny_llama_bench.yaml
@@ -24,8 +24,8 @@ general:
 lighteval: null
 logging:
   iteration_step_info_interval: 1
-  log_level: info
-  log_level_replica: info
+  log_level: warning
+  log_level_replica: warning
 model:
   ddp_bucket_cap_mb: 25
   dtype: bfloat16
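The hunk above drops the benchmark config's logging from info to warning on every rank, so multi-node benchmark logs are not flooded with per-replica status messages. A quick check of the effective values, as a sketch (assumes the three keys sit directly under logging: as shown in the hunk):

# Print the logging block of the benchmark config:
grep -A3 '^logging:' examples/config_tiny_llama_bench.yaml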
25 changes: 18 additions & 7 deletions run_multinode.sh
@@ -1,10 +1,11 @@
 #!/bin/bash
 
-#SBATCH --job-name=smolm2-bench # Job name
+#SBATCH --job-name=smolm2-bench  # Job name
 #SBATCH --time=00:02:00
 #SBATCH --partition=hopper-prod
 #SBATCH --qos=high
-#SBATCH --reservation=huggingface_37
+#SBATCH --reservation=nouamane_weekend_32
+#SBATCH --exclude=ip-26-0-160-192,ip-26-0-171-102
 
 #SBATCH -o /fsx/nouamane/projects/nanotron/logs/%j-%x.out
@@ -13,6 +14,7 @@
 #SBATCH --cpus-per-task=60 # CPU cores per task
 #SBATCH --gres=gpu:8 # Number of GPUs per node
 #SBATCH --exclusive # Exclusive use of nodes
+#SBATCH --wait-all-nodes=1 # fail if any node is not ready
 
 set -x -e
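--wait-all-nodes=1 tells Slurm not to start the batch script until every allocated node is booted and ready, so a straggler or unhealthy node surfaces as an early failure rather than a stalled rendezvous. Submission itself is unchanged; a usage sketch (script path assumed):

# Submit the benchmark job; execution begins only once all nodes are ready:
sbatch run_multinode.sh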

@@ -37,10 +39,18 @@ export WORLD_SIZE=$(($NNODES * $GPUS_PER_NODE))
 
 # Set some environment variables for better distributed training
 export CUDA_DEVICE_MAX_CONNECTIONS=1
-# export NCCL_DEBUG=INFO
+export NCCL_DEBUG=WARN # INFO
 
 # Nanotron specific
 export NANOTRON_BENCHMARK=1
+# Disable wandb
+export WANDB_MODE=disabled
+
+
+# Print GPU topology information
+echo "=== GPU Topology ==="
+nvidia-smi topo -m
+echo "=================="
 
 
 # Print some debugging information
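Setting NCCL_DEBUG=WARN silences NCCL's verbose per-communicator setup logs during benchmarking (the old INFO level is kept in the trailing comment for easy re-enabling), and nvidia-smi topo -m records the GPU interconnect matrix (NVLink/PCIe paths) in the job log before training starts. The same matrix can also be inspected by hand; a sketch, assuming access to the same partition:

# Print the GPU-to-GPU interconnect matrix on one allocated node:
srun --partition=hopper-prod --gres=gpu:8 nvidia-smi topo -m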
@@ -49,12 +59,13 @@ echo "All nodes: $NODELIST"
 echo "World size: $WORLD_SIZE"
 
 # Launch the training script using srun
-srun torchrun \
+srun --wait=0 --kill-on-bad-exit=1 torchrun \
     --nnodes=$NNODES \
     --nproc_per_node=$GPUS_PER_NODE \
     --rdzv_id=$SLURM_JOB_ID \
     --rdzv_backend=c10d \
     --rdzv_endpoint=$MASTER_NODE:$MASTER_PORT \
-    stress_test.py \
-    # run_train.py \
-    # --config-file examples/config_tiny_llama.yaml
+    --max_restarts 0 \
+    --rdzv_conf timeout=60 \
+    /fsx/nouamane/projects/nanotron/run_train.py \
+    --config-file examples/config_tiny_llama.yaml
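This hunk switches the entrypoint back from stress_test.py to run_train.py and makes failures fail fast: --max_restarts 0 stops torchrun from respawning crashed workers, --rdzv_conf timeout=60 bounds how long ranks wait at the c10d rendezvous, and srun --kill-on-bad-exit=1 tears down the whole step as soon as any task dies, so a hung rank cannot burn the reservation. A single-node smoke test of the same entrypoint, as a sketch (relative paths assumed; the flags are standard torchrun options):

# Launch the same trainer on one 8-GPU node without Slurm:
torchrun --standalone --nproc_per_node=8 --max_restarts 0 \
    run_train.py --config-file examples/config_tiny_llama.yaml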