forked from jpata/particleflow
Fix use of deprecated Ray Tune environment variable (jpata#338)
* chore: update raytune search space, utils and startscript
* fix: raytune deprecated env var for storage_path; also add the number of samples to draw in HPO as a command-line argument (see the sketch below)
* chore: update clic config file for jureap57
* feat: script to build python env from scratch
* chore: update startscripts for raytrain and raytune
* fix CMS model path for ACAT2022
* MLPF datasets v2.0.0: track pythia-level genjets, genmet in datasets; add per-particle ispu flag (jpata#332)
* generate ttbar nopu events
* up
* update postprocessing
* small sample generation
* v3_1 run
* updates for CMSSW 14 generation
* [skip ci] cleanup postprocessing
* [skip ci] update pu gen
* update postprocessing with new truth definition based only on caloparticles
* remove pdb, switch genjet to energy
* [skip ci] prepare for v3_3
* [skip ci] fix flag
* added time and mem limits
* pu files from scratch
* 20240702_cptruthdef submission
* ttbar nopu v2
* up
* added genjet, genmet to clic postprocessing
* remove delphes
* update tests
* add postprocessing jobs
* update torch
* update dataset version
* propagate genjets, genmet
* shared memory error
* training on v2.0.0 for cms
* fix occasional root file load bug
* add jmenano
* fix qq
* clic training
* up
* CMS training instructions (jpata#336)
* CMS training instructions
* Update pyg-clic.yaml
* Update pyg-clic.yaml
* fix: black formatting
* Enable CI/CD test of HPO workflow
* fix: typo in test script

Co-authored-by: Joosep Pata <joosep.pata@gmail.com>
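For context on the headline fix: the exact storage_path change is not reproduced in this view, but the Ray migration it refers to generally replaces a results-directory environment variable (e.g. TUNE_RESULT_DIR in older Ray releases) with an explicit storage_path in the run configuration, and the new command-line argument for the number of HPO samples would feed into num_samples. A minimal sketch, assuming current Ray Tune APIs; the trainable, path, and sample count below are illustrative placeholders, not code from this repository:

from ray import train, tune


def trainable(config):
    # Placeholder objective; the real pipeline lives in mlpf/pyg_pipeline.py.
    train.report({"loss": config["lr"] ** 2})


# Deprecated style: results directory steered via an environment variable.
# Current style: pass storage_path explicitly through RunConfig, and expose
# the number of trials to draw (num_samples) as a configurable value.
tuner = tune.Tuner(
    trainable,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(num_samples=10),
    run_config=train.RunConfig(storage_path="/path/to/ray_results"),
)
results = tuner.fit()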
Showing 9 changed files with 290 additions and 75 deletions.
New file: the Python environment build script described in the commit message.

@@ -0,0 +1,40 @@
#!/bin/bash

# 2023-12-14
# Author: E. Wulff

module --force purge
ml Stages/2024 GCC/12.3.0 Python/3.11.3
ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024

jutil env activate -p jureap57

python3 -m venv ray_tune_env

source ray_tune_env/bin/activate

pip3 install --upgrade pip
# Version constraints and extras are quoted so the shell does not
# interpret "<" as a redirect or "[...]" as a glob pattern.
pip3 install "numpy<1.25"
pip3 install "pandas<1.6.0dev0"
pip3 install scikit-learn
pip3 install matplotlib
pip3 install tqdm
pip3 install autopep8
pip3 install mplhep
pip3 install awkward
pip3 install fastjet
pip3 install comet-ml
pip3 install tensorflow_datasets==4.9.3
pip3 install torch torchvision
pip3 install "hls4ml[profiling]"
pip3 install torch_geometric
pip3 install "ray[data,train,tune,serve]"
pip3 install async_timeout
pip3 install numba
pip3 install hyperopt
pip3 install causal-conv1d==1.0.2
pip3 install mamba-ssm

deactivate
New file: a SLURM startscript for Ray Train training runs.

@@ -0,0 +1,109 @@
#!/bin/bash

#SBATCH --account=jureap57
#SBATCH --partition=dc-gpu-devel
#SBATCH --time 2:00:00
#SBATCH --nodes 1
#SBATCH --tasks-per-node=1
#SBATCH --gres=gpu:4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=128

# Job name
#SBATCH -J raytrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module --force purge
ml Stages/2024 GCC/12.3.0 Python/3.11.3
ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024

jutil env activate -p jureap57

source ray_tune_env/bin/activate

echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
echo "DEBUG: SLURM_NODEID: $SLURM_NODEID"
echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID"
echo "DEBUG: SLURM_PROCID: $SLURM_PROCID"
echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "DEBUG: SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES"
echo "DEBUG: SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
echo "DEBUG: SLURM_GPUS_PER_TASK: $SLURM_GPUS_PER_TASK"

export CUDA_VISIBLE_DEVICES=0,1,2,3
num_gpus=${SLURM_GPUS_PER_TASK}  # GPUs per compute node
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}  # necessary on JURECA for Ray to work

## Limit the number of maximum pending trials
export TUNE_MAX_PENDING_TRIALS_PG=$((SLURM_NNODES * 4))

## Disable Ray usage stats
export RAY_USAGE_STATS_DISABLE=1

################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
# if [ "$SLURM_JOB_NUM_NODES" -gt 1 ]; then
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}

port=7639

# The trailing "i" selects the node's InfiniBand interface on JURECA.
export ip_head="$head_node"i:"$port"
export head_node_ip="$head_node"i

echo "Starting HEAD at $head_node"
# apptainer exec --nv -B /p/project/jureap57/cern \
#     apptainer/images/jureca_torch2307.sif \
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node"i --port=$port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block &
sleep 20

# Number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$head_node"i:"$port" --redis-password='5241580000000000' \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block &
    sleep 10
done
echo "All Ray workers started."
# fi
##############################################################################################

echo 'Starting training.'
# When training with Ray Train, --gpus should equal the total number of GPUs across the Ray cluster.
# apptainer exec --nv -B /p/project/jureap57/cern/data/tensorflow_datasets,/p/project/jureap57/cern/particleflow \
#     apptainer/images/jureca_torch2307.sif \
python3 -u $PWD/mlpf/pyg_pipeline.py --train --ray-train \
    --config $1 \
    --prefix $2 \
    --ray-cpus $((SLURM_CPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
    --gpus $((SLURM_GPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
    --gpu-batch-multiplier 8 \
    --num-workers 8 \
    --prefetch-factor 8 \
    --experiments-dir /p/project/jureap57/cern/particleflow/experiments \
    --local \
    --ntrain 50000

echo 'Training done.'
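The pipeline internals are not part of this diff, but the comment above hints at the mapping: Ray Train wants one worker per GPU across the whole cluster, so the script multiplies the per-node GPU count by the node count. A rough sketch of how such flags typically translate on the Ray Train side, assuming a PyTorch trainer; the names below are illustrative, not the actual pyg_pipeline.py code:

from ray.train import RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer


def train_loop_per_worker(config):
    # Placeholder per-worker training loop.
    ...


# --gpus $((SLURM_GPUS_PER_TASK * SLURM_JOB_NUM_NODES)) -> total GPUs in the
# cluster, i.e. 4 GPUs per node times the number of allocated nodes.
total_gpus = 4 * 1

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=total_gpus, use_gpu=True),
    run_config=RunConfig(storage_path="/path/to/experiments"),
)
result = trainer.fit()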