forked from jpata/particleflow
Fix use of deprecated Ray Tune environment variable (jpata#338)
* chore: update raytune search space, utils and startscript
* fix: raytune deprecated env var for storage_path; also add the number of samples to draw in HPO as a command-line argument (see the sketch below)
* chore: update clic config file for jureap57
* feat: script to build python env from scratch
* chore: update startscripts for raytrain and raytune
* fix CMS model path for ACAT2022
* MLPF datasets v2.0.0: track pythia-level genjets, genmet in datasets; add per-particle ispu flag (jpata#332)
* generate ttbar nopu events
* up
* update postprocessing
* small sample generation
* v3_1 run
* updates for CMSSW 14 generation
* [skip ci] cleanup postprocessing
* [skip ci] update pu gen
* update postprocessing with new truth definition based only on caloparticles
* remove pdb, switch genjet to energy
* [skip ci] prepare for v3_3
* [skip ci] fix flag
* added time and mem limits
* pu files from scratch
* 20240702_cptruthdef submission
* ttbar nopu v2
* up
* added genjet, genmet to clic postprocessing
* remove delphes
* update tests
* add postprocessing jobs
* update torch
* update dataset version
* propagate genjets, genmet
* shared memory error
* training on v2.0.0 for cms
* fix occasional root file load bug
* add jmenano
* fix qq
* clic training
* up
* CMS training instructions (jpata#336)
* CMS training instructions
* Update pyg-clic.yaml
* Update pyg-clic.yaml
* fix: black formatting
* Enable CI/CD test of HPO workflow
* fix: typo in test script

Co-authored-by: Joosep Pata <joosep.pata@gmail.com>
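For context on the headline fix: the exact storage_path change is not reproduced in this view, but the Ray migration it refers to generally replaces a results-directory environment variable (e.g. TUNE_RESULT_DIR in older Ray releases) with an explicit storage_path in the run configuration, and the new command-line argument for the number of HPO samples would feed into num_samples. A minimal sketch, assuming current Ray Tune APIs; the trainable, path, and sample count below are illustrative placeholders, not code from this repository:

from ray import train, tune


def trainable(config):
    # Placeholder objective; the real pipeline lives in mlpf/pyg_pipeline.py.
    train.report({"loss": config["lr"] ** 2})


# Deprecated style: results directory steered via an environment variable.
# Current style: pass storage_path explicitly through RunConfig, and expose
# the number of trials to draw (num_samples) as a configurable value.
tuner = tune.Tuner(
    trainable,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(num_samples=10),
    run_config=train.RunConfig(storage_path="/path/to/ray_results"),
)
results = tuner.fit()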
Showing 9 changed files with 290 additions and 75 deletions.
New file: the Python environment build script described in the commit message.

@@ -0,0 +1,40 @@
#!/bin/bash

# 2023-12-14
# Author: E. Wulff

module --force purge
ml Stages/2024 GCC/12.3.0 Python/3.11.3
ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024

jutil env activate -p jureap57

python3 -m venv ray_tune_env

source ray_tune_env/bin/activate

pip3 install --upgrade pip
# Version constraints and extras are quoted so the shell does not
# interpret "<" as a redirect or "[...]" as a glob pattern.
pip3 install "numpy<1.25"
pip3 install "pandas<1.6.0dev0"
pip3 install scikit-learn
pip3 install matplotlib
pip3 install tqdm
pip3 install autopep8
pip3 install mplhep
pip3 install awkward
pip3 install fastjet
pip3 install comet-ml
pip3 install tensorflow_datasets==4.9.3
pip3 install torch torchvision
pip3 install "hls4ml[profiling]"
pip3 install torch_geometric
pip3 install "ray[data,train,tune,serve]"
pip3 install async_timeout
pip3 install numba
pip3 install hyperopt
pip3 install causal-conv1d==1.0.2
pip3 install mamba-ssm

deactivate
New file: a SLURM startscript for Ray Train training runs.

@@ -0,0 +1,109 @@
#!/bin/bash

#SBATCH --account=jureap57
#SBATCH --partition=dc-gpu-devel
#SBATCH --time 2:00:00
#SBATCH --nodes 1
#SBATCH --tasks-per-node=1
#SBATCH --gres=gpu:4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=128

# Job name
#SBATCH -J raytrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module --force purge
ml Stages/2024 GCC/12.3.0 Python/3.11.3
ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024

jutil env activate -p jureap57

source ray_tune_env/bin/activate

echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
echo "DEBUG: SLURM_NODEID: $SLURM_NODEID"
echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID"
echo "DEBUG: SLURM_PROCID: $SLURM_PROCID"
echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "DEBUG: SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES"
echo "DEBUG: SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
echo "DEBUG: SLURM_GPUS_PER_TASK: $SLURM_GPUS_PER_TASK"

export CUDA_VISIBLE_DEVICES=0,1,2,3
num_gpus=${SLURM_GPUS_PER_TASK}  # GPUs per compute node
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}  # necessary on JURECA for Ray to work

## Limit the number of maximum pending trials
export TUNE_MAX_PENDING_TRIALS_PG=$((SLURM_NNODES * 4))

## Disable Ray usage stats
export RAY_USAGE_STATS_DISABLE=1

################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
# if [ "$SLURM_JOB_NUM_NODES" -gt 1 ]; then
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}

port=7639

# The trailing "i" selects the node's InfiniBand interface on JURECA.
export ip_head="$head_node"i:"$port"
export head_node_ip="$head_node"i

echo "Starting HEAD at $head_node"
# apptainer exec --nv -B /p/project/jureap57/cern \
#     apptainer/images/jureca_torch2307.sif \
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node"i --port=$port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block &
sleep 20

# Number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$head_node"i:"$port" --redis-password='5241580000000000' \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block &
    sleep 10
done
echo "All Ray workers started."
# fi
##############################################################################################

echo 'Starting training.'
# When training with Ray Train, --gpus should equal the total number of GPUs across the Ray cluster.
# apptainer exec --nv -B /p/project/jureap57/cern/data/tensorflow_datasets,/p/project/jureap57/cern/particleflow \
#     apptainer/images/jureca_torch2307.sif \
python3 -u $PWD/mlpf/pyg_pipeline.py --train --ray-train \
    --config $1 \
    --prefix $2 \
    --ray-cpus $((SLURM_CPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
    --gpus $((SLURM_GPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
    --gpu-batch-multiplier 8 \
    --num-workers 8 \
    --prefetch-factor 8 \
    --experiments-dir /p/project/jureap57/cern/particleflow/experiments \
    --local \
    --ntrain 50000

echo 'Training done.'
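The pipeline internals are not part of this diff, but the comment above hints at the mapping: Ray Train wants one worker per GPU across the whole cluster, so the script multiplies the per-node GPU count by the node count. A rough sketch of how such flags typically translate on the Ray Train side, assuming a PyTorch trainer; the names below are illustrative, not the actual pyg_pipeline.py code:

from ray.train import RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer


def train_loop_per_worker(config):
    # Placeholder per-worker training loop.
    ...


# --gpus $((SLURM_GPUS_PER_TASK * SLURM_JOB_NUM_NODES)) -> total GPUs in the
# cluster, i.e. 4 GPUs per node times the number of allocated nodes.
total_gpus = 4 * 1

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=total_gpus, use_gpu=True),
    run_config=RunConfig(storage_path="/path/to/experiments"),
)
result = trainer.fit()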