Add multidataset example with deepspeed support #316

Open · wants to merge 1 commit into base: main
2 changes: 2 additions & 0 deletions examples/multidataset_deepspeed/.gitignore
@@ -0,0 +1,2 @@
./logs
*.out
64 changes: 64 additions & 0 deletions examples/multidataset_deepspeed/base.json
@@ -0,0 +1,64 @@
{
    "Verbosity": {
        "level": 2
    },
    "NeuralNetwork": {
        "Architecture": {
            "model_type": "EGNN",
            "equivariance": true,
            "radius": 5.0,
            "max_neighbours": 100000,
            "num_gaussians": 50,
            "envelope_exponent": 5,
            "int_emb_size": 64,
            "basis_emb_size": 8,
            "out_emb_size": 128,
            "num_after_skip": 2,
            "num_before_skip": 1,
            "num_radial": 6,
            "num_spherical": 7,
            "num_filters": 126,
            "edge_features": ["length"],
            "hidden_dim": 50,
            "num_conv_layers": 3,
            "output_heads": {
                "graph": {
                    "num_sharedlayers": 2,
                    "dim_sharedlayers": 50,
                    "num_headlayers": 2,
                    "dim_headlayers": [50, 25]
                },
                "node": {
                    "num_headlayers": 2,
                    "dim_headlayers": [200, 200],
                    "type": "mlp"
                }
            },
            "task_weights": [1.0, 1.0]
        },
        "Variables_of_interest": {
            "input_node_features": [0, 1, 2, 3],
            "output_names": ["energy", "force"],
            "output_index": [0, 2],
            "output_dim": [1, 3],
            "type": ["graph", "node"]
        },
        "Training": {
            "num_epoch": 50,
            "EarlyStopping": true,
            "perc_train": 0.9,
            "loss_function_type": "mae",
            "batch_size": 32,
            "continue": 0,
            "Optimizer": {
                "type": "AdamW",
                "learning_rate": 1e-3
            }
        }
    },
    "Visualization": {
        "plot_init_solution": true,
        "plot_hist_solution": false,
        "create_plots": true
    }
}
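
Note: the two entries in `output_heads` pair one-to-one with the tasks declared under `Variables_of_interest`: a graph-level `energy` scalar (`output_dim` 1) and a per-node `force` vector (`output_dim` 3), weighted equally through `task_weights`. A minimal sketch (not part of this example; the file path and checks are illustrative only) of a consistency check one might run on such a config:

```python
import json

# Load the example config (assumes the working directory is examples/multidataset_deepspeed).
with open("base.json") as f:
    cfg = json.load(f)["NeuralNetwork"]

voi = cfg["Variables_of_interest"]
arch = cfg["Architecture"]

# The number of declared outputs should match the per-task metadata and weights.
n_tasks = len(voi["output_names"])           # ["energy", "force"] -> 2
assert len(voi["output_dim"]) == n_tasks     # [1, 3]
assert len(voi["type"]) == n_tasks           # ["graph", "node"]
assert len(arch["task_weights"]) == n_tasks  # [1.0, 1.0]

# Every task type ("graph", "node") should have a matching head definition.
for head_type in set(voi["type"]):
    assert head_type in arch["output_heads"], f"missing head for {head_type}"

print("config OK:", dict(zip(voi["output_names"], voi["output_dim"])))
```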
116 changes: 116 additions & 0 deletions examples/multidataset_deepspeed/job-perlmutter-batch.sh
@@ -0,0 +1,116 @@
#!/bin/bash
#SBATCH -A m4716
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32

# Retrieve the number of nodes set via `sbatch -N` or in the script
echo "Number of nodes allocated: $SLURM_NNODES"

## Remove write permission for others on newly created files and dirs
umask 002

## Load Basic Envs
module reset
module load pytorch/2.0.1

module use -a /global/cfs/cdirs/m4133/jyc/perlmutter/sw/modulefiles
module load hydragnn/pytorch2.0.1-v2
module use -a /global/cfs/cdirs/m4133/c8l/sw/modulefiles
module load deepspeed

## MPI Envs
export MPICH_ENV_DISPLAY=0
export MPICH_VERSION_DISPLAY=0
export MPICH_GPU_SUPPORT_ENABLED=0

## HYDRAGNN Envs
HYDRAGNN_DIR=/global/cfs/cdirs/m4716/c8l/HydraGNN
export PYTHONPATH=$HYDRAGNN_DIR:$PYTHONPATH

export HYDRAGNN_NUM_WORKERS=0
export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
export HYDRAGNN_AGGR_BACKEND=mpi
export HYDRAGNN_VALTEST=1
export HYDRAGNN_TRACE_LEVEL=0

## Dataset Envs
DATASET_PATH="/global/cfs/projectdirs/m4716/mlupopa/HydraGNN/examples/multidataset_hpo/dataset"
DATASET_LIST="MPTrj-v3,ANI1x-v3,OC2020-20M-v3,OC2022-v3,qm7x-v3"

## Task 1: Outer loop WIDTH, Inner loop DEPTH, fixed DS, ZERO, and CKPT
for WIDTH in 800 1100 1700 2500; do
for DEPTH in 4 5 6; do
LOG_NAME="exp-${DEPTH}_depth-${WIDTH}_width-0.6_TB_data-${SLURM_NNODES}_nodes"

## Calculate batch size and num_samples
BS=$((32 * 32 / SLURM_NNODES)) # Dynamic calculation of batch size
NS=$(echo "scale=0; 285715 / 1.2 * 0.6 * 32 / $SLURM_NNODES" | bc) # Fixed DS=0.6

## Handle optional arguments
EXTRA_ARGS="--zero_opt"

## Run script
set -x

srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
--inputfile=base.json \
--dataset_path=$DATASET_PATH \
--multi \
--multi_model_list=$DATASET_LIST \
--num_epoch=10 \
--everyone --ddstore \
--log=$LOG_NAME \
--hidden_dim=${WIDTH} \
--num_conv_layers=${DEPTH} \
--full_test \
--batch_size=${BS} \
--num_samples=${NS} \
${EXTRA_ARGS}

set +x
done
done

## Task 2: Outer loop WIDTH, Inner loop DS, fixed DEPTH and ZERO, varying CKPT
for WIDTH in 2500 5400; do
for DS in 0.2 0.6 1.2; do
LOG_NAME="exp-3_depth-${WIDTH}_width-${DS}_TB_data-${SLURM_NNODES}_nodes"

## Calculate batch size and num_samples
BS=$((32 * 32 / SLURM_NNODES)) # Dynamic calculation of batch size
NS=$(echo "scale=0; 285715 / 1.2 * ${DS} * 32 / $SLURM_NNODES" | bc) # Dynamic DS

## Handle optional arguments
EXTRA_ARGS="--zero_opt"
if [ "$WIDTH" = "5400" ]; then
EXTRA_ARGS+=" --conv_checkpointing"
fi

## Run script
set -x

srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
--inputfile=base.json \
--dataset_path=$DATASET_PATH \
--multi \
--multi_model_list=$DATASET_LIST \
--num_epoch=10 \
--everyone --ddstore \
--log=$LOG_NAME \
--hidden_dim=${WIDTH} \
--num_conv_layers=3 \
--full_test \
--batch_size=${BS} \
--num_samples=${NS} \
${EXTRA_ARGS}

set +x
done
done
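
The `BS`/`NS` arithmetic above takes 32 nodes with 32 samples per GPU per batch, and 285715 samples per GPU for the full 1.2 TB dataset, as the reference point, then rescales both per-GPU quantities as the node count and dataset size (DS) change. A small sketch of the same arithmetic in plain Python (illustrative only; float rounding can differ from `bc`'s truncation by a few samples):

```python
# Reproduces the BS/NS expressions from the job scripts for a few node counts.
def per_gpu_settings(nnodes: int, ds_tb: float) -> tuple[int, int]:
    bs = 32 * 32 // nnodes                        # BS=$((32 * 32 / SLURM_NNODES))
    ns = int(285715 / 1.2 * ds_tb * 32 / nnodes)  # NS from the bc expression
    return bs, ns

for nnodes in (8, 16, 32):
    for ds_tb in (0.2, 0.6, 1.2):
        bs, ns = per_gpu_settings(nnodes, ds_tb)
        print(f"{nnodes:2d} nodes, DS={ds_tb} TB: batch_size={bs}, num_samples={ns}")
```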
86 changes: 86 additions & 0 deletions examples/multidataset_deepspeed/job-perlmutter.sh
@@ -0,0 +1,86 @@
#!/bin/bash
#SBATCH -A m4716
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32

# Retrieve the number of nodes set via `sbatch -N` or in the script
echo "Number of nodes allocated: $SLURM_NNODES"

WIDTH=${1:-50} # Default to 50 hidden_dim if not specified
DEPTH=${2:-3} # Default to 3 num_conv_layers if not specified
DS=${3:-1.2} # Default to 1.2TB data if not specified
ZERO=${4:-False} # Default to False if not specified
CKPT=${5:-False} # Default to False if not specified

## Remove write permission for others on newly created files and dirs
umask 002

## Load Basic Envs
module reset
module load pytorch/2.0.1

module use -a /global/cfs/cdirs/m4133/jyc/perlmutter/sw/modulefiles
module load hydragnn/pytorch2.0.1-v2
module use -a /global/cfs/cdirs/m4133/c8l/sw/modulefiles
module load deepspeed

## MPI Envs
export MPICH_ENV_DISPLAY=0
export MPICH_VERSION_DISPLAY=0
export MPICH_GPU_SUPPORT_ENABLED=0

## HYDRAGNN Envs
HYDRAGNN_DIR=/global/cfs/cdirs/m4716/c8l/HydraGNN
export PYTHONPATH=$HYDRAGNN_DIR:$PYTHONPATH

export HYDRAGNN_NUM_WORKERS=0
export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
export HYDRAGNN_AGGR_BACKEND=mpi
export HYDRAGNN_VALTEST=1
export HYDRAGNN_TRACE_LEVEL=0

## Dataset Envs
DATASET_PATH="/global/cfs/projectdirs/m4716/mlupopa/HydraGNN/examples/multidataset_hpo/dataset"
DATASET_LIST="MPTrj-v3,ANI1x-v3,OC2020-20M-v3,OC2022-v3,qm7x-v3"

## Log Envs
LOG_NAME="exp-${DEPTH}_depth-${WIDTH}_width-${DS}_TB_data-${SLURM_NNODES}_nodes"

## Calculate batch size and num_samples
BS=$((32 * 32 / SLURM_NNODES)) # Per-GPU batch size; reference setting: 32 nodes with batch size 32 per GPU
NS=$(echo "scale=0; 285715 / 1.2 * ${DS} * 32 / $SLURM_NNODES" | bc) # Per-GPU num_samples; reference setting: 32 nodes with 285715 samples per GPU at the full 1.2 TB dataset

## Handle optional arguments
EXTRA_ARGS=""
if [ "$ZERO" = "True" ]; then
EXTRA_ARGS+=" --zero_opt"
fi
if [ "$CKPT" = "True" ]; then
EXTRA_ARGS+=" --conv_checkpointing"
fi

## Run script
set -x

srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
--inputfile=base.json \
--dataset_path=$DATASET_PATH \
--multi \
--multi_model_list=$DATASET_LIST \
--num_epoch=10 \
--everyone --ddstore \
--log=$LOG_NAME \
--hidden_dim=${WIDTH} \
--num_conv_layers=${DEPTH} \
--full_test \
--batch_size=${BS} \
--num_samples=${NS} \
${EXTRA_ARGS}

set +x
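
For context, `--zero_opt` is expected to enable ZeRO stage 1 and `--conv_checkpointing` activation checkpointing inside `train.py` (see the argument help in the launch helper below). A hedged sketch, not the PR's `train.py`, of what a generic ZeRO stage-1 DeepSpeed setup looks like:

```python
# Illustration only: a minimal ZeRO stage-1 DeepSpeed setup, meant to be launched under
# srun/deepspeed with the usual distributed environment variables set.
import torch
import deepspeed

model = torch.nn.Linear(8, 1)  # stand-in for the actual HydraGNN model

ds_config = {
    "train_micro_batch_size_per_gpu": 32,                    # mirrors --batch_size above
    "zero_optimization": {"stage": 1},                       # what --zero_opt is expected to toggle
    "optimizer": {"type": "AdamW", "params": {"lr": 1e-3}},  # matches base.json's Optimizer
}

model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config
)
```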
29 changes: 29 additions & 0 deletions examples/multidataset_deepspeed/launch_helper.py
@@ -0,0 +1,29 @@
import subprocess
import argparse

def submit_job(nodes, width, depth, dataset_size, zero=False, ckpt=False):
    # Build the sbatch command; job-perlmutter.sh takes WIDTH, DEPTH, DS, ZERO, CKPT as positional arguments
    command = ["sbatch", "-N", str(nodes), "job-perlmutter.sh", str(width), str(depth), str(dataset_size), str(zero), str(ckpt)]
    # Run the command and capture output
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise RuntimeError(f"sbatch failed: {stderr.strip()}")
    # Extract the job ID from sbatch's confirmation line, e.g. "Submitted batch job 1234567"
    job_id = int(stdout.strip().split()[-1])
    return job_id

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Submit jobs with varying parameters.")
    parser.add_argument("--width", type=int, required=True, help="Width (hidden_dim) of the model.")
    parser.add_argument("--depth", type=int, required=True, help="Depth (num_conv_layers) of the model.")
    parser.add_argument("--zero", action="store_true", help="Enable the ZeRO optimizer (stage 1).")
    parser.add_argument("--ckpt", action="store_true", help="Enable checkpointing for conv layers.")

    args = parser.parse_args()

    # Scaling series: each dataset size (in TB) is paired with a node count.
    dataset_size_list = [0.1, 0.2, 0.4, 0.6]
    nodes_list = [8, 16, 32, 32]

    for dataset_size, nodes in zip(dataset_size_list, nodes_list):
        job_id = submit_job(nodes, args.width, args.depth, dataset_size, args.zero, args.ckpt)
        print(job_id)
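
Typical usage (illustrative values; requires `sbatch` on the target system): `python launch_helper.py --width 2500 --depth 3 --zero` submits one job per `(dataset_size, nodes)` pair above. Calling the helper from Python is equivalent, e.g.:

```python
# Assumes launch_helper.py is importable from the current directory and sbatch is available.
from launch_helper import submit_job

# Equivalent to: sbatch -N 8 job-perlmutter.sh 2500 3 0.1 True False
job_id = submit_job(nodes=8, width=2500, depth=3, dataset_size=0.1, zero=True, ckpt=False)
print("submitted:", job_id)
```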