Skip to content

Commit

Permalink
Temp benchmark ci dir (#765)
Browse files Browse the repository at this point in the history
* Support fork in benchmark CI

* use temporary dir for benchmark CI

* debug

* revert back

* dependency fix

* refactor script
  • Loading branch information
vwxyzjn authored Sep 18, 2023
1 parent 9a8d52c commit 4697523
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 36 deletions.
17 changes: 14 additions & 3 deletions benchmark/benchmark_and_report.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
#### Step 1: create a work directory:
# this is necessary because another github action job will remove
# the entire directory, which slurm depends on.
# https://stackoverflow.com/questions/4632028/how-to-create-a-temporary-directory
MY_SLURM_TMP_DIR=/fsx/costa/slurm_tmpdir
mkdir -p "$MY_SLURM_TMP_DIR"
WORK_DIR=$(mktemp -d -p "$MY_SLURM_TMP_DIR")
# Copy the checkout into the temp dir and work from there, so slurm jobs
# survive the CI workspace being cleaned up.
cp -r "$PWD" "$WORK_DIR"
cd "$WORK_DIR/$(basename "$PWD")" || exit 1
echo "WORK_DIR: $WORK_DIR"

#### Step 2: actual work starts:
echo "PATH is $PATH"
echo "PYTHONPATH is $PYTHONPATH"
# fixed typo in the echoed message: "whcih" -> "which"
echo "which python is $(which python)"

export WANDB_ENTITY=huggingface

# Run the benchmark script selected by the CI environment. The old hardcoded
# `bash benchmark/benchmark_core.sh > output.txt` line was removed: that file
# is deleted in this same change, and running both would clobber output.txt.
bash "$BENCHMARK_SCRIPT" > output.txt

# Extract Job IDs into an array (intentional word-splitting of grep|awk output)
job_ids=($(grep "Job ID:" output.txt | awk '{print $3}'))
Expand All @@ -27,4 +38,4 @@ done
echo "TAGS_STRING: $TAGS_STRING"
echo "FOLDER_STRING: $FOLDER_STRING"

# Submit the comment-posting job once, gated on all benchmark jobs finishing.
# The stale duplicate sbatch line (missing BENCHMARK_PLOT_SCRIPT) was removed:
# keeping both would submit the job twice.
# NOTE(review): $job_ids is an array; unsubscripted it expands to only its
# first element, so the dependency may cover just one job — confirm whether
# the loop above (hidden hunk) collapses the IDs, otherwise join them with
# ':' before passing to --dependency.
TAGS_STRING="$TAGS_STRING" FOLDER_STRING="$FOLDER_STRING" BENCHMARK_PLOT_SCRIPT="$BENCHMARK_PLOT_SCRIPT" sbatch --dependency=afterany:$job_ids benchmark/post_github_comment.sbatch
24 changes: 0 additions & 24 deletions benchmark/benchmark_core.sh

This file was deleted.

11 changes: 11 additions & 0 deletions benchmark/benchmark_level1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# hello world experiment: smoke-test PPO sentiment tuning end to end.
# Launches 3 seeds (starting at 1) as slurm jobs: 1 node, 1 GPU, 1 task,
# 12 CPUs each, via the shared trl slurm template.
bench_args=(
  --command "python examples/scripts/sentiment_tuning.py --ppo_config.log_with wandb"
  --num-seeds 3
  --start-seed 1
  --workers 10
  --slurm-nodes 1
  --slurm-gpus-per-task 1
  --slurm-ntasks 1
  --slurm-total-cpus 12
  --slurm-template-path benchmark/trl.slurm_template
)
python benchmark/benchmark.py "${bench_args[@]}"
3 changes: 0 additions & 3 deletions benchmark/plot_core.sh → benchmark/benchmark_level1_plot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
# see https://github.com/openrlbenchmark/openrlbenchmark#get-started for documentation
echo "we deal with $TAGS_STRING"

# "sentiment_tuning_gpt2xl_grad_accu$TAGS_STRING" \

python -m openrlbenchmark.rlops_multi_metrics \
--filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
"sentiment_tuning$TAGS_STRING" \
Expand All @@ -14,7 +12,6 @@ python -m openrlbenchmark.rlops_multi_metrics \
--output-filename benchmark/trl/$FOLDER_STRING/different_models \
--scan-history


python benchmark/upload_benchmark.py \
--folder_path="benchmark/trl/$FOLDER_STRING" \
--path_in_repo="images/benchmark/$FOLDER_STRING" \
Expand Down
11 changes: 11 additions & 0 deletions benchmark/benchmark_level2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# compound experiment: gpt2-xl with gradient accumulation
# (mini_batch_size 16 x 8 accumulation steps), 3 seeds starting at 1,
# submitted as slurm jobs through the shared trl slurm template.
bench_args=(
  --command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_gpt2xl_grad_accu --ppo_config.model_name gpt2-xl --ppo_config.mini_batch_size 16 --ppo_config.gradient_accumulation_steps 8 --ppo_config.log_with wandb"
  --num-seeds 3
  --start-seed 1
  --workers 10
  --slurm-nodes 1
  --slurm-gpus-per-task 1
  --slurm-ntasks 1
  --slurm-total-cpus 12
  --slurm-template-path benchmark/trl.slurm_template
)
python benchmark/benchmark.py "${bench_args[@]}"
21 changes: 21 additions & 0 deletions benchmark/benchmark_level2_plot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# pip install openrlbenchmark==0.2.1a5
# see https://github.com/openrlbenchmark/openrlbenchmark#get-started for documentation
echo "we deal with $TAGS_STRING"

# wandb query shared by the plots below: group runs by exp_name / reward model
# and pull the reward-mean and KL metrics.
wandb_filter='?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl'

# Plot baseline vs gpt2-xl grad-accu runs for the current tag set.
python -m openrlbenchmark.rlops_multi_metrics \
  --filters "$wandb_filter" \
  "sentiment_tuning$TAGS_STRING" \
  "sentiment_tuning_gpt2xl_grad_accu$TAGS_STRING" \
  --env-ids sentiment-analysis:lvwerra/distilbert-imdb \
  --no-check-empty-runs \
  --pc.ncols 2 \
  --pc.ncols-legend 1 \
  --output-filename "benchmark/trl/$FOLDER_STRING/different_models" \
  --scan-history

# Upload the generated images to the shared example-images dataset repo.
python benchmark/upload_benchmark.py \
  --folder_path="benchmark/trl/$FOLDER_STRING" \
  --path_in_repo="images/benchmark/$FOLDER_STRING" \
  --repo_id="trl-internal-testing/example-images" \
  --repo_type="dataset"

8 changes: 4 additions & 4 deletions benchmark/benchmark_aux.sh → benchmark/benchmark_level3.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
## w/ and w/o gradient accumulation
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_step_grad_accu --ppo_config.mini_batch_size 1 --ppo_config.gradient_accumulation_steps 128 --ppo_config.log_with wandb" \
--num-seeds 5 \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
--slurm-nodes 1 \
Expand All @@ -13,7 +13,7 @@ python benchmark/benchmark.py \
## w/ different models (gpt2, gpt2-xl, falcon, llama2)
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_gpt2 --ppo_config.log_with wandb" \
--num-seeds 5 \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
--slurm-nodes 1 \
Expand All @@ -23,7 +23,7 @@ python benchmark/benchmark.py \
--slurm-template-path benchmark/trl.slurm_template
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_falcon_rw_1b --ppo_config.model_name tiiuae/falcon-rw-1b --ppo_config.log_with wandb" \
--num-seeds 5 \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
--slurm-nodes 1 \
Expand All @@ -36,7 +36,7 @@ python benchmark/benchmark.py \
## w/ and w/o PEFT
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_peft --use_peft --ppo_config.log_with wandb" \
--num-seeds 5 \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
--slurm-nodes 1 \
Expand Down
8 changes: 7 additions & 1 deletion benchmark/post_github_comment.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
#SBATCH --ntasks=1
#SBATCH --output=slurm/logs/%x_%j.out

# Remove the temp work dir created by benchmark_and_report.sh on any exit path.
cleanup () {
    # Guard against an unset/empty WORK_DIR so we never run `rm -rf ""`.
    if [ -n "${WORK_DIR:-}" ]; then
        rm -rf -- "$WORK_DIR"
        echo "Deleted temp working directory $WORK_DIR"
    fi
}
trap cleanup EXIT

# Give the benchmark runs time to finish syncing to wandb before plotting.
sleep 2m
# BENCHMARK_PLOT_SCRIPT is passed in via the sbatch environment by
# benchmark_and_report.sh. The stale hardcoded `bash benchmark/plot_core.sh`
# line was removed: that script is renamed in this same change, and running
# both would execute the plotting step twice.
bash "$BENCHMARK_PLOT_SCRIPT"
srun python benchmark/post_github_comment.py
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"diffusers": ["diffusers>=0.18.0"],
"deepspeed": ["deepspeed>=0.9.5"],
"dev": ["parameterized", "pytest", "pytest-xdist", "pre-commit", "peft>=0.4.0", "diffusers>=0.18.0"],
"benchmark": ["wandb", "ghapi", "openrlbenchmark==0.2.1a5"],
"benchmark": ["wandb", "ghapi", "openrlbenchmark==0.2.1a5", "requests"],
}

setup(
Expand Down

0 comments on commit 4697523

Please sign in to comment.