Skip to content

Commit

Permalink
Fix use of deprecated Ray Tune environment variable (jpata#338)
Browse files Browse the repository at this point in the history
* chore: update raytune search space, utils and startscript

* fix: raytune deprecated env var for storage_path

Also add num samples to draw in HPO as cmd line arg

* chore: update clic config file for jureap57

* feat: script to build python env from scratch

* chore: update startscripts for raytrain and raytune

* fix CMS model path for ACAT2022

* MLPF datasets v2.0.0: track pythia-level genjets, genmet in datasets; add per-particle ispu flag (jpata#332)

* generate ttbar nopu events

* up

* update postprocessing

* small sample generation

* v3_1 run

* updates for CMSSW 14 generation

* [skip ci] cleanup postprocessing

* [skip ci] update pu gen

* update postprocessing with new truth definition based only on caloparticles

* remove pdb, switch genjet to energy

* [skip ci] prepare for v3_3

* [skip ci] fix flag

* added time and mem limits

* pu files from scratch

* 20240702_cptruthdef submission

* ttbar nopu v2

* up

* added genjet, genmet to clic postprocessing

* remove delphes

* update tests

* add postprocessing jobs

* update torch

* update dataset version

* propagate genjets, genmet

* shared memory error

* training on v2.0.0 for cms

* fix occasional root file load bug

* add jmenano

* fix qq

* clic training

* up

* CMS training instructions (jpata#336)

* CMS training instructions

* Update pyg-clic.yaml

* Update pyg-clic.yaml

* fix: black formatting

* Enable CI/CD test of HPO workflow

* fix: typo in test script

---------

Co-authored-by: Joosep Pata <joosep.pata@gmail.com>
  • Loading branch information
2 people authored and farakiko committed Aug 26, 2024
1 parent d5db3bf commit e5d8942
Show file tree
Hide file tree
Showing 9 changed files with 290 additions and 75 deletions.
26 changes: 6 additions & 20 deletions mlpf/pyg/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,8 @@ def train_ray_trial(config, args, outdir=None):

if outdir is None:
outdir = ray.train.get_context().get_trial_dir()
if not os.path.exists(outdir):
os.makedirs(outdir)

use_cuda = args.gpus > 0

Expand Down Expand Up @@ -970,11 +972,6 @@ def run_ray_training(config, args, outdir):
from ray import tune
from ray.train.torch import TorchTrainer

# create ray cache for intermediate storage of trials
tmp_ray_cache = TemporaryDirectory()
os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = tmp_ray_cache.name
_logger.info(f"RAY_AIR_LOCAL_CACHE_DIR: {os.environ['RAY_AIR_LOCAL_CACHE_DIR']}")

if not args.local:
ray.init(address="auto")

Expand Down Expand Up @@ -1027,9 +1024,6 @@ def run_ray_training(config, args, outdir):
_logger.info("Final val_reg_loss: {}".format(result.metrics["val_reg_loss"]), color="bold")
# _logger.info("Final val_charge_loss: {}".format(result.metrics["val_charge_loss"]), color="bold")

# clean up ray cache
tmp_ray_cache.cleanup()


def set_searchspace_and_run_trial(search_space, config, args):
import ray
Expand Down Expand Up @@ -1066,25 +1060,21 @@ def run_hpo(config, args):
from raytune.pt_search_space import raytune_num_samples, search_space
from raytune.utils import get_raytune_schedule, get_raytune_search_alg

# create ray cache for intermediate storage of trials
tmp_ray_cache = TemporaryDirectory()
os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = tmp_ray_cache.name
_logger.info(f"RAY_AIR_LOCAL_CACHE_DIR: {os.environ['RAY_AIR_LOCAL_CACHE_DIR']}")
if args.raytune_num_samples:
raytune_num_samples = args.raytune_num_samples # noqa: F811

name = args.hpo # name of Ray Tune experiment directory

os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1" # don't crash if a metric is missing
if isinstance(config["raytune"]["local_dir"], type(None)):
raise TypeError("Please specify a local_dir in the raytune section of the config file.")
trd = config["raytune"]["local_dir"] + "/tune_result_dir"
os.environ["TUNE_RESULT_DIR"] = trd

expdir = Path(config["raytune"]["local_dir"]) / name
expdir.mkdir(parents=True, exist_ok=True)
dirname = Path(config["raytune"]["local_dir"]) / name
shutil.copy(
"mlpf/raytune/search_space.py",
str(dirname / "search_space.py"),
"mlpf/raytune/pt_search_space.py",
str(dirname / "pt_search_space.py"),
) # Copy the search space definition file to the train dir for later reference
# Save config for later reference. Note that saving happens after parameters are overwritten by cmd line args.
with open((dirname / "config.yaml"), "w") as file:
Expand All @@ -1095,7 +1085,6 @@ def run_hpo(config, args):
ray.init(
address=os.environ["ip_head"],
_node_ip_address=os.environ["head_node_ip"],
# _temp_dir="/p/project/raise-ctp2/cern/tmp_ray",
)
_logger.info("Done.")

Expand Down Expand Up @@ -1158,6 +1147,3 @@ def run_hpo(config, args):

logging.info("Total time of Tuner.fit(): {}".format(end - start))
logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config))

# clean up ray cache
tmp_ray_cache.cleanup()
1 change: 1 addition & 0 deletions mlpf/pyg_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
parser.add_argument("--local", action="store_true", default=None, help="perform HPO locally, without a Ray cluster")
parser.add_argument("--ray-cpus", type=int, default=None, help="CPUs per trial for HPO")
parser.add_argument("--ray-gpus", type=int, default=None, help="GPUs per trial for HPO")
parser.add_argument("--raytune-num-samples", type=int, default=None, help="Number of samples to draw from search space")
parser.add_argument("--comet", action="store_true", help="use comet ml logging")
parser.add_argument("--comet-offline", action="store_true", help="save comet logs locally")
parser.add_argument("--comet-step-freq", type=int, default=None, help="step frequency for saving comet metrics")
Expand Down
57 changes: 25 additions & 32 deletions mlpf/raytune/pt_search_space.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,44 @@
from ray.tune import grid_search # grid_search, choice, loguniform, quniform
from ray.tune import choice # grid_search, choice, loguniform, quniform

raytune_num_samples = 1 # Number of random samples to draw from search space. Set to 1 for grid search.
samp = grid_search
raytune_num_samples = 400 # Number of random samples to draw from search space. Set to 1 for grid search.
samp = choice

# gnn scan
search_space = {
# dataset parameters
"ntrain": samp([500]),
# "ntrain": samp([500]),
# "ntest": samp([10000]),
"nvalid": samp([500]),
"num_epochs": samp([10]),
# "nvalid": samp([500]),
# "num_epochs": samp([10]),
# optimizer parameters
"lr": samp([1e-4, 3e-4, 1e-3, 3e-3]),
"lr_schedule": samp(["onecycle"]),
"pct_start": samp([0.05]),
# "gpu_batch_multiplier": samp([1, 4, 8, 16]),
"lr": samp([1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3]),
# "lr_schedule": samp(["onecycle"]),
# "pct_start": samp([0.0, 0.05, 0.1]),
"gpu_batch_multiplier": samp([1, 4, 8, 16]),
# "patience": samp([9999]),
# model arch parameters
# "activation": samp(["elu", "relu", "relu6", "leakyrelu"]),
"conv_type": samp(["attention"]), # can be "gnn_lsh", "gravnet", "attention"
"activation": samp(["elu", "relu", "relu6", "leakyrelu"]),
# "conv_type": samp(["attention"]), # can be "gnn_lsh", "gravnet", "attention"
# "embedding_dim": samp([32, 64, 128, 252, 512, 1024]),
# "width": samp([32, 64, 128, 256, 512, 1024]),
# "num_convs": samp([1, 2, 3, 4, 5, 6]),
"num_convs": samp([1, 2, 3, 4, 5]),
# "dropout": samp([0.0, 0.01, 0.1, 0.4]),
# only for gravnet
# "k": samp([8, 16, 32]),
# "propagate_dimensions": samp([8, 16, 32, 64, 128]),
# "space_dimensions": samp([4]),
# only for gnn-lsh
# "bin_size": samp([160, 320, 640]),
# "bin_size": samp([80, 160, 320, 640]),
# "max_num_bins": samp([200]),
# "distance_dim": samp([16, 32, 64, 128, 256]),
# "distance_dim": samp([128]),
# "layernorm": samp([True, False]),
# "num_node_messages": samp([1, 2, 3, 4, 5]),
# "ffn_dist_hidden_dim": samp([16, 32, 64, 128, 256]),
# "ffn_dist_num_layers": samp([1, 2, 3, 4, 5, 6]),
# "num_node_messages": samp([2]),
# "ffn_dist_hidden_dim": samp([64]),
# "ffn_dist_num_layers": samp([3]),
# mamba specific parameters
# "d_state": samp([16]),
# "d_conv": samp([4]),
# "expand": samp([2]),
# "num_heads": samp([2, 4, 6, 8, 10, 12]),
# attention specific parameters
"num_heads": samp([2, 4, 8, 16]),
"num_heads": samp([4, 8, 16, 32, 64]),
"head_dim": samp([4, 8, 16, 32, 64]),
# "attention_type": samp(["flash"]), # flash, efficient, math
}

Expand All @@ -56,31 +53,27 @@ def set_hps_from_search_space(search_space, config):
conv_type = search_space["conv_type"]
config["conv_type"] = conv_type

common_varaible_names = ["embedding_dim", "width", "num_convs", "activation"]
common_varaible_names = ["num_convs", "activation"]
if conv_type == "gnn_lsh" or conv_type == "gravnet" or conv_type == "attention":
for var in common_varaible_names:
if var in search_space.keys():
config["model"][conv_type][var] = search_space[var]

gravnet_variable_names = ["k", "propagate_dimensions", "space_dimensions"]
if conv_type == "gravnet":
for var in gravnet_variable_names:
if var in search_space.keys():
config["model"][conv_type][var] = search_space[var]

attention_variables = ["num_heads"]
attention_variables = ["head_dim", "num_heads"]
if conv_type == "attention":
for var in attention_variables:
if var in search_space.keys():
config["model"][conv_type][var] = search_space[var]

mamba_variables = ["num_heads", "d_state", "d_conv", "expand"]
mamba_variables = ["width", "embedding_dim", "num_heads", "d_state", "d_conv", "expand"]
if conv_type == "mamba":
for var in mamba_variables:
if var in search_space.keys():
config["model"][conv_type][var] = search_space[var]

gnn_lsh_varaible_names = [
"width",
"embedding_dim",
"bin_size",
"max_num_bins",
"distance_dim",
Expand Down
16 changes: 0 additions & 16 deletions mlpf/raytune/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.search.bohb import TuneBOHB
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.skopt import SkOptSearch

# from ray.tune.search.hebo import HEBOSearch # HEBO is not yet supported

Expand Down Expand Up @@ -50,21 +49,6 @@ def get_raytune_search_alg(raytune_cfg, seeds=False):
n_initial_points=raytune_cfg["hyperopt"]["n_random_steps"],
# points_to_evaluate=,
)
if raytune_cfg["search_alg"] == "scikit":
print("INFO: Using bayesian optimization from scikit-learn")
return SkOptSearch(
metric=raytune_cfg["default_metric"],
mode=raytune_cfg["default_mode"],
convert_to_python=True,
)
# HEBO is not yet supported
# if (raytune_cfg["search_alg"] == "hebo") or (raytune_cfg["search_alg"] == "HEBO"):
# print("Using HEBOSearch")
# return HEBOSearch(
# metric=raytune_cfg["default_metric"],
# mode=raytune_cfg["default_mode"],
# # max_concurrent=8,
# )
else:
print("INFO: Not using any Ray Tune search algorithm")
return None
Expand Down
6 changes: 3 additions & 3 deletions parameters/pytorch/pyg-clic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ num_epochs: 100
patience: 20
lr: 0.0001
lr_schedule: cosinedecay # constant, cosinedecay, onecycle
conv_type: gnn_lsh
conv_type: gnn_lsh # gnn_lsh, attention, mamba, flashattention
ntrain:
ntest:
nvalid:
Expand Down Expand Up @@ -80,8 +80,8 @@ lr_schedule_config:
pct_start: 0.3

raytune:
local_dir: # Note: please specify an absolute path
sched: asha # asha, hyperband
local_dir: # Note: please specify an absolute path
sched: # asha, hyperband
search_alg: # bayes, bohb, hyperopt, nevergrad
default_metric: "val_loss"
default_mode: "min"
Expand Down
40 changes: 40 additions & 0 deletions scripts/jureca/build_pip_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash

# 2023-12-14
# Author: E. Wulff
#
# Build the `ray_tune_env` Python virtual environment from scratch.
# The venv is created in the current working directory, so run this script
# from the directory where the job scripts expect `ray_tune_env/` to live.


module --force purge
ml Stages/2024 GCC/12.3.0 Python/3.11.3
ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024

jutil env activate -p jureap57

python3 -m venv ray_tune_env

source ray_tune_env/bin/activate

pip3 install --upgrade pip
# Requirement specifiers containing `<` or `[...]` must be quoted: unquoted,
# the shell treats `<1.25` as an input redirection from a file named "1.25"
# and `[...]` as a glob pattern, so pip never sees the intended constraint.
pip3 install "numpy<1.25"
pip3 install "pandas<1.6.0dev0"
pip3 install scikit-learn
pip3 install matplotlib
pip3 install tqdm
pip3 install autopep8
pip3 install mplhep
pip3 install awkward
pip3 install fastjet
pip3 install comet-ml
pip3 install tensorflow_datasets==4.9.3
pip3 install torch torchvision
pip3 install "hls4ml[profiling]"
pip3 install torch_geometric
pip3 install "ray[data,train,tune,serve]"
pip3 install async_timeout
pip3 install numba
pip3 install hyperopt
pip3 install causal-conv1d==1.0.2
pip3 install mamba-ssm

deactivate
109 changes: 109 additions & 0 deletions scripts/jureca/pt_raytrain.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/bin/bash

# Slurm job script: start a Ray cluster on the allocated nodes and launch the
# MLPF PyTorch training through Ray Train.
#
# Usage: sbatch pt_raytrain.sh <config.yaml> <prefix>
#
# NOTE: bash is required (not plain sh) because the script uses arrays and
# C-style for loops.

#SBATCH --account=jureap57
#SBATCH --partition=dc-gpu-devel
#SBATCH --time 2:00:00
#SBATCH --nodes 1
#SBATCH --tasks-per-node=1
#SBATCH --gres=gpu:4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=128

# Job name
#SBATCH -J raytrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

# Fail fast on missing arguments instead of starting a Ray cluster first and
# only then hitting an argument-parsing error in the training script.
if [ "$#" -lt 2 ]; then
    echo "Usage: sbatch $0 <config.yaml> <prefix>" >&2
    exit 1
fi


module --force purge
ml Stages/2024 GCC/12.3.0 Python/3.11.3
ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024

jutil env activate -p jureap57

# Relative path: the job must be submitted from the directory containing the venv.
source ray_tune_env/bin/activate

echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
echo "DEBUG: SLURM_NODEID: $SLURM_NODEID"
echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID"
echo "DEBUG: SLURM_PROCID: $SLURM_PROCID"
echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "DEBUG: SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES"
echo "DEBUG: SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
echo "DEBUG: SLURM_GPUS_PER_TASK: $SLURM_GPUS_PER_TASK"

export CUDA_VISIBLE_DEVICES=0,1,2,3
num_gpus=${SLURM_GPUS_PER_TASK} # gpus per compute node
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} # necessary on JURECA for Ray to work

## Limit number of max pending trials
export TUNE_MAX_PENDING_TRIALS_PG=$((SLURM_NNODES * 4))

## Disable Ray Usage Stats
export RAY_USAGE_STATS_DISABLE=1


################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
# if [ "$SLURM_JOB_NUM_NODES" -gt 1 ]; then
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
# Intentionally unquoted: split the hostname list into one array element per node.
nodes_array=($nodes)

head_node=${nodes_array[0]}

port=7639

# The trailing "i" is appended to the hostname as in the original script
# (presumably the node's alternate-network hostname on JURECA; confirm before changing).
export ip_head="$head_node"i:"$port"
export head_node_ip="$head_node"i

echo "Starting HEAD at $head_node"
# apptainer exec --nv -B /p/project/jureap57/cern \
#   apptainer/images/jureca_torch2307.sif \
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node"i --port="$port" \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "$num_gpus" --block &
# `ray start --block` runs in the background; wait before attaching workers.
sleep 20

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$head_node"i:"$port" --redis-password='5241580000000000' \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "$num_gpus" --block &
    sleep 10
done
echo All Ray workers started.
# fi
##############################################################################################

echo 'Starting training.'
# when training with Ray Train, --gpus should be equal to total number of GPUs across the Ray Cluster
# apptainer exec --nv -B /p/project/jureap57/cern/data/tensorflow_datasets,/p/project/jureap57/cern/particleflow \
#   apptainer/images/jureca_torch2307.sif \
python3 -u "$PWD/mlpf/pyg_pipeline.py" --train --ray-train \
    --config "$1" \
    --prefix "$2" \
    --ray-cpus $((SLURM_CPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
    --gpus $((SLURM_GPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
    --gpu-batch-multiplier 8 \
    --num-workers 8 \
    --prefetch-factor 8 \
    --experiments-dir /p/project/jureap57/cern/particleflow/experiments \
    --local \
    --ntrain 50000

echo 'Training done.'
Loading

0 comments on commit e5d8942

Please sign in to comment.