#!/bin/bash
# SLURM batch script: set up the GCC/CUDA/MVAPICH2 + conda environment and
# launch distributed GPT pretraining (Megatron-LM) across 2 nodes x 2 GPUs.
# Output (head-node IP and job logs) is appended to $OUTFILE.
#
# NOTE: #SBATCH directives must each be on their own line directly after the
# shebang, otherwise SLURM ignores them.
#SBATCH -t 2:00:00
#SBATCH -N 2
#SBATCH -p a100
#SBATCH --gpus-per-node=2

# Toolchain versions used for both module loads and install paths.
GCC_VERSION="10.3.0"
CUDA_VERSION="11.7"

module purge
module load gcc/"$GCC_VERSION"
module load cmake/3.22.2
module load cuda/"$CUDA_VERSION"

export CUDA_HOME=/opt/cuda/$CUDA_VERSION
export CPATH=$CUDA_HOME/include:$CPATH
export CUDNN_LIB_DIR=/home/gulhane.2/cuda/lib64
export CUDNN_INCLUDE_DIR=/home/gulhane.2/cuda/include

# MVAPICH2-GDR installation (CUDA-aware MPI).
export MV2_HOME=/home/gulhane.2/mvapich2-installation/nvidia/gdr2.3.7_cuda11.7_gcc10.3.0_latest
source "$MV2_HOME/setup.sh"
# Re-load CUDA after sourcing setup.sh — presumably setup.sh touches the
# module environment; TODO confirm this second load is still required.
module load cuda/"$CUDA_VERSION"

export PATH=$MV2_HOME/bin:$PATH
export LD_LIBRARY_PATH=$MV2_HOME/lib:$LD_LIBRARY_PATH
# Force the MVAPICH2 MPI library in ahead of any other MPI the conda env links.
export LD_PRELOAD=$MV2_HOME/lib/libmpi.so

source "$HOME/miniconda3/bin/activate"
# conda activate megatron-mcr-dl
conda activate PyTorch_2.0.1
# megatron-mcr-dl -> pytorch is not installed from source
# PyTorch_2.0.1   -> pytorch is installed from source

OUTFILE="/home/gulhane.2/Megatron-LM-MCR-DL/sbatch_scripts/distributed_run_pretrain_gpt.log"

# Single-node variant, kept for reference:
# cd /home/gulhane.2/Megatron-LM-MCR-DL/Megatron-LM/
# sh examples/pretrain_gpt.sh >> "$OUTFILE" 2>&1

# Resolve the allocated hostnames; the first one acts as the rendezvous head.
# (The original `node_array=($nodes)` re-split only the first element of the
# array — reading nodes[0] directly is equivalent and correct.)
mapfile -t nodes < <(scontrol show hostnames "$SLURM_JOB_NODELIST")
head_node=${nodes[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "$head_node_ip" >> "$OUTFILE" 2>&1

# Abort rather than launching srun from the wrong directory.
cd /home/gulhane.2/Megatron-LM-MCR-DL/Megatron-LM/ || exit 1
srun sh examples/pretrain_gpt_distributed.sh "$head_node_ip" >> "$OUTFILE" 2>&1