Add multidataset example with deepspeed support #316

Open · wants to merge 1 commit into base: main
2 changes: 2 additions & 0 deletions examples/multidataset_deepspeed/.gitignore
@@ -0,0 +1,2 @@
./logs
*.out
64 changes: 64 additions & 0 deletions examples/multidataset_deepspeed/base.json
@@ -0,0 +1,64 @@
{
    "Verbosity": {
        "level": 2
    },
    "NeuralNetwork": {
        "Architecture": {
            "model_type": "EGNN",
            "equivariance": true,
            "radius": 5.0,
            "max_neighbours": 100000,
            "num_gaussians": 50,
            "envelope_exponent": 5,
            "int_emb_size": 64,
            "basis_emb_size": 8,
            "out_emb_size": 128,
            "num_after_skip": 2,
            "num_before_skip": 1,
            "num_radial": 6,
            "num_spherical": 7,
            "num_filters": 126,
            "edge_features": ["length"],
            "hidden_dim": 50,
            "num_conv_layers": 3,
            "output_heads": {
                "graph": {
                    "num_sharedlayers": 2,
                    "dim_sharedlayers": 50,
                    "num_headlayers": 2,
                    "dim_headlayers": [50, 25]
                },
                "node": {
                    "num_headlayers": 2,
                    "dim_headlayers": [200, 200],
                    "type": "mlp"
                }
            },
            "task_weights": [1.0, 1.0]
        },
        "Variables_of_interest": {
            "input_node_features": [0, 1, 2, 3],
            "output_names": ["energy", "force"],
            "output_index": [0, 2],
            "output_dim": [1, 3],
            "type": ["graph", "node"]
        },
        "Training": {
            "num_epoch": 50,
            "EarlyStopping": true,
            "perc_train": 0.9,
            "loss_function_type": "mae",
            "batch_size": 32,
            "continue": 0,
            "Optimizer": {
                "type": "AdamW",
                "learning_rate": 1e-3
            }
        }
    },
    "Visualization": {
        "plot_init_solution": true,
        "plot_hist_solution": false,
        "create_plots": true
    }
}
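
Note: the two entries in `output_heads` pair one-to-one with the tasks declared under `Variables_of_interest`: a graph-level `energy` scalar (`output_dim` 1) and a per-node `force` vector (`output_dim` 3), weighted equally through `task_weights`. A minimal sketch (not part of this example; the file path and checks are illustrative only) of a consistency check one might run on such a config:

```python
import json

# Load the example config (assumes the working directory is examples/multidataset_deepspeed).
with open("base.json") as f:
    cfg = json.load(f)["NeuralNetwork"]

voi = cfg["Variables_of_interest"]
arch = cfg["Architecture"]

# The number of declared outputs should match the per-task metadata and weights.
n_tasks = len(voi["output_names"])           # ["energy", "force"] -> 2
assert len(voi["output_dim"]) == n_tasks     # [1, 3]
assert len(voi["type"]) == n_tasks           # ["graph", "node"]
assert len(arch["task_weights"]) == n_tasks  # [1.0, 1.0]

# Every task type ("graph", "node") should have a matching head definition.
for head_type in set(voi["type"]):
    assert head_type in arch["output_heads"], f"missing head for {head_type}"

print("config OK:", dict(zip(voi["output_names"], voi["output_dim"])))
```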
116 changes: 116 additions & 0 deletions examples/multidataset_deepspeed/job-perlmutter-batch.sh
@@ -0,0 +1,116 @@
#!/bin/bash
#SBATCH -A m4716
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32

# Retrieve the number of nodes set via `sbatch -N` or in the script
echo "Number of nodes allocated: $SLURM_NNODES"

## Remove write permission for others on newly created files and dirs
umask 002

## Load Basic Envs
module reset
module load pytorch/2.0.1

module use -a /global/cfs/cdirs/m4133/jyc/perlmutter/sw/modulefiles
module load hydragnn/pytorch2.0.1-v2
module use -a /global/cfs/cdirs/m4133/c8l/sw/modulefiles
module load deepspeed

## MPI Envs
export MPICH_ENV_DISPLAY=0
export MPICH_VERSION_DISPLAY=0
export MPICH_GPU_SUPPORT_ENABLED=0

## HYDRAGNN Envs
HYDRAGNN_DIR=/global/cfs/cdirs/m4716/c8l/HydraGNN
export PYTHONPATH=$HYDRAGNN_DIR:$PYTHONPATH

export HYDRAGNN_NUM_WORKERS=0
export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
export HYDRAGNN_AGGR_BACKEND=mpi
export HYDRAGNN_VALTEST=1
export HYDRAGNN_TRACE_LEVEL=0

## Dataset Envs
DATASET_PATH="/global/cfs/projectdirs/m4716/mlupopa/HydraGNN/examples/multidataset_hpo/dataset"
DATASET_LIST="MPTrj-v3,ANI1x-v3,OC2020-20M-v3,OC2022-v3,qm7x-v3"

## Task 1: Outer loop WIDTH, Inner loop DEPTH, fixed DS, ZERO, and CKPT
for WIDTH in 800 1100 1700 2500; do
for DEPTH in 4 5 6; do
LOG_NAME="exp-${DEPTH}_depth-${WIDTH}_width-0.6_TB_data-${SLURM_NNODES}_nodes"

## Calculate batch size and num_samples
BS=$((32 * 32 / SLURM_NNODES)) # Dynamic calculation of batch size
NS=$(echo "scale=0; 285715 / 1.2 * 0.6 * 32 / $SLURM_NNODES" | bc) # Fixed DS=0.6

## Handle optional arguments
EXTRA_ARGS="--zero_opt"

## Run script
set -x

srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
--inputfile=base.json \
--dataset_path=$DATASET_PATH \
--multi \
--multi_model_list=$DATASET_LIST \
--num_epoch=10 \
--everyone --ddstore \
--log=$LOG_NAME \
--hidden_dim=${WIDTH} \
--num_conv_layers=${DEPTH} \
--full_test \
--batch_size=${BS} \
--num_samples=${NS} \
${EXTRA_ARGS}

set +x
done
done

## Task 2: Outer loop WIDTH, Inner loop DS, fixed DEPTH and ZERO, varying CKPT
for WIDTH in 2500 5400; do
for DS in 0.2 0.6 1.2; do
LOG_NAME="exp-3_depth-${WIDTH}_width-${DS}_TB_data-${SLURM_NNODES}_nodes"

## Calculate batch size and num_samples
BS=$((32 * 32 / SLURM_NNODES)) # Dynamic calculation of batch size
NS=$(echo "scale=0; 285715 / 1.2 * ${DS} * 32 / $SLURM_NNODES" | bc) # Dynamic DS

## Handle optional arguments
EXTRA_ARGS="--zero_opt"
if [ "$WIDTH" = "5400" ]; then
EXTRA_ARGS+=" --conv_checkpointing"
fi

## Run script
set -x

srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
--inputfile=base.json \
--dataset_path=$DATASET_PATH \
--multi \
--multi_model_list=$DATASET_LIST \
--num_epoch=10 \
--everyone --ddstore \
--log=$LOG_NAME \
--hidden_dim=${WIDTH} \
--num_conv_layers=3 \
--full_test \
--batch_size=${BS} \
--num_samples=${NS} \
${EXTRA_ARGS}

set +x
done
done
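
The `BS`/`NS` arithmetic above takes 32 nodes with 32 samples per GPU per batch, and 285715 samples per GPU for the full 1.2 TB dataset, as the reference point, then rescales both per-GPU quantities as the node count and dataset size (DS) change. A small sketch of the same arithmetic in plain Python (illustrative only; float rounding can differ from `bc`'s truncation by a few samples):

```python
# Reproduces the BS/NS expressions from the job scripts for a few node counts.
def per_gpu_settings(nnodes: int, ds_tb: float) -> tuple[int, int]:
    bs = 32 * 32 // nnodes                        # BS=$((32 * 32 / SLURM_NNODES))
    ns = int(285715 / 1.2 * ds_tb * 32 / nnodes)  # NS from the bc expression
    return bs, ns

for nnodes in (8, 16, 32):
    for ds_tb in (0.2, 0.6, 1.2):
        bs, ns = per_gpu_settings(nnodes, ds_tb)
        print(f"{nnodes:2d} nodes, DS={ds_tb} TB: batch_size={bs}, num_samples={ns}")
```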
86 changes: 86 additions & 0 deletions examples/multidataset_deepspeed/job-perlmutter.sh
@@ -0,0 +1,86 @@
#!/bin/bash
#SBATCH -A m4716
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32

# Retrieve the number of nodes set via `sbatch -N` or in the script
echo "Number of nodes allocated: $SLURM_NNODES"

WIDTH=${1:-50} # Default to 50 hidden_dim if not specified
DEPTH=${2:-3} # Default to 3 num_conv_layers if not specified
DS=${3:-1.2} # Default to 1.2TB data if not specified
ZERO=${4:-False} # Default to False if not specified
CKPT=${5:-False} # Default to False if not specified

## Remove write permission for others on newly created files and dirs
umask 002

## Load Basic Envs
module reset
module load pytorch/2.0.1

module use -a /global/cfs/cdirs/m4133/jyc/perlmutter/sw/modulefiles
module load hydragnn/pytorch2.0.1-v2
module use -a /global/cfs/cdirs/m4133/c8l/sw/modulefiles
module load deepspeed

## MPI Envs
export MPICH_ENV_DISPLAY=0
export MPICH_VERSION_DISPLAY=0
export MPICH_GPU_SUPPORT_ENABLED=0

## HYDRAGNN Envs
HYDRAGNN_DIR=/global/cfs/cdirs/m4716/c8l/HydraGNN
export PYTHONPATH=$HYDRAGNN_DIR:$PYTHONPATH

export HYDRAGNN_NUM_WORKERS=0
export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
export HYDRAGNN_AGGR_BACKEND=mpi
export HYDRAGNN_VALTEST=1
export HYDRAGNN_TRACE_LEVEL=0

## Dataset Envs
DATASET_PATH="/global/cfs/projectdirs/m4716/mlupopa/HydraGNN/examples/multidataset_hpo/dataset"
DATASET_LIST="MPTrj-v3,ANI1x-v3,OC2020-20M-v3,OC2022-v3,qm7x-v3"

## Log Envs
LOG_NAME="exp-${DEPTH}_depth-${WIDTH}_width-${DS}_TB_data-${SLURM_NNODES}_nodes"

## Calculate batch size and num_samples
BS=$((32 * 32 / SLURM_NNODES)) # Per-GPU batch size; reference setting: 32 nodes with batch size 32 per GPU
NS=$(echo "scale=0; 285715 / 1.2 * ${DS} * 32 / $SLURM_NNODES" | bc) # Per-GPU num_samples; reference setting: 32 nodes with 285715 samples per GPU at the full 1.2 TB dataset

## Handle optional arguments
EXTRA_ARGS=""
if [ "$ZERO" = "True" ]; then
EXTRA_ARGS+=" --zero_opt"
fi
if [ "$CKPT" = "True" ]; then
EXTRA_ARGS+=" --conv_checkpointing"
fi

## Run script
set -x

srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
--inputfile=base.json \
--dataset_path=$DATASET_PATH \
--multi \
--multi_model_list=$DATASET_LIST \
--num_epoch=10 \
--everyone --ddstore \
--log=$LOG_NAME \
--hidden_dim=${WIDTH} \
--num_conv_layers=${DEPTH} \
--full_test \
--batch_size=${BS} \
--num_samples=${NS} \
${EXTRA_ARGS}

set +x
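
For context, `--zero_opt` is expected to enable ZeRO stage 1 and `--conv_checkpointing` activation checkpointing inside `train.py` (see the argument help in the launch helper below). A hedged sketch, not the PR's `train.py`, of what a generic ZeRO stage-1 DeepSpeed setup looks like:

```python
# Illustration only: a minimal ZeRO stage-1 DeepSpeed setup, meant to be launched under
# srun/deepspeed with the usual distributed environment variables set.
import torch
import deepspeed

model = torch.nn.Linear(8, 1)  # stand-in for the actual HydraGNN model

ds_config = {
    "train_micro_batch_size_per_gpu": 32,                    # mirrors --batch_size above
    "zero_optimization": {"stage": 1},                       # what --zero_opt is expected to toggle
    "optimizer": {"type": "AdamW", "params": {"lr": 1e-3}},  # matches base.json's Optimizer
}

model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config
)
```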
29 changes: 29 additions & 0 deletions examples/multidataset_deepspeed/launch_helper.py
@@ -0,0 +1,29 @@
import subprocess
import argparse

def submit_job(nodes, width, depth, dataset_size, zero=False, ckpt=False):
    # Build the sbatch command; job-perlmutter.sh takes WIDTH, DEPTH, DS, ZERO, CKPT as positional arguments
    command = ["sbatch", "-N", str(nodes), "job-perlmutter.sh", str(width), str(depth), str(dataset_size), str(zero), str(ckpt)]
    # Run the command and capture output
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise RuntimeError(f"sbatch failed: {stderr.strip()}")
    # Extract the job ID from sbatch's confirmation line, e.g. "Submitted batch job 1234567"
    job_id = int(stdout.strip().split()[-1])
    return job_id

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Submit jobs with varying parameters.")
    parser.add_argument("--width", type=int, required=True, help="Width (hidden_dim) of the model.")
    parser.add_argument("--depth", type=int, required=True, help="Depth (num_conv_layers) of the model.")
    parser.add_argument("--zero", action="store_true", help="Enable the ZeRO optimizer (stage 1).")
    parser.add_argument("--ckpt", action="store_true", help="Enable checkpointing for conv layers.")

    args = parser.parse_args()

    # Scaling series: each dataset size (in TB) is paired with a node count.
    dataset_size_list = [0.1, 0.2, 0.4, 0.6]
    nodes_list = [8, 16, 32, 32]

    for dataset_size, nodes in zip(dataset_size_list, nodes_list):
        job_id = submit_job(nodes, args.width, args.depth, dataset_size, args.zero, args.ckpt)
        print(job_id)
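
Typical usage (illustrative values; requires `sbatch` on the target system): `python launch_helper.py --width 2500 --depth 3 --zero` submits one job per `(dataset_size, nodes)` pair above. Calling the helper from Python is equivalent, e.g.:

```python
# Assumes launch_helper.py is importable from the current directory and sbatch is available.
from launch_helper import submit_job

# Equivalent to: sbatch -N 8 job-perlmutter.sh 2500 3 0.1 True False
job_id = submit_job(nodes=8, width=2500, depth=3, dataset_size=0.1, zero=True, ckpt=False)
print("submitted:", job_id)
```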