Standardise example scripts #842

Merged · 5 commits · Oct 11, 2023
Changes from 3 commits
2 changes: 1 addition & 1 deletion benchmark/benchmark_level1.sh
@@ -1,6 +1,6 @@
# hello world experiment
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.log_with wandb" \
--command "python examples/scripts/run_ppo.py --ppo_config.log_with wandb" \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
2 changes: 1 addition & 1 deletion benchmark/benchmark_level1_plot.sh
@@ -4,7 +4,7 @@ echo "we deal with $TAGS_STRING"

python -m openrlbenchmark.rlops_multi_metrics \
--filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
"sentiment_tuning$TAGS_STRING" \
"run_ppo$TAGS_STRING" \
--env-ids sentiment-analysis:lvwerra/distilbert-imdb \
--no-check-empty-runs \
--pc.ncols 2 \
4 changes: 2 additions & 2 deletions benchmark/benchmark_level2.sh
@@ -1,6 +1,6 @@
# compound experiments: gpt2xl + grad_accu
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_gpt2xl_grad_accu --ppo_config.model_name gpt2-xl --ppo_config.mini_batch_size 16 --ppo_config.gradient_accumulation_steps 8 --ppo_config.log_with wandb" \
--command "python examples/scripts/run_ppo.py --ppo_config.exp_name run_ppo_gpt2xl_grad_accu --ppo_config.model_name gpt2-xl --ppo_config.mini_batch_size 16 --ppo_config.gradient_accumulation_steps 8 --ppo_config.log_with wandb" \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
@@ -12,7 +12,7 @@ python benchmark/benchmark.py \

# compound experiments: Cerebras-GPT-6.7B + deepspeed zero2 + grad_accu
python benchmark/benchmark.py \
--command "accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_Cerebras-GPT-6.7B_grad_accu_deepspeed_stage2 --ppo_config.batch_size 32 --ppo_config.mini_batch_size 32 --ppo_config.log_with wandb --ppo_config.model_name cerebras/Cerebras-GPT-6.7B --ppo_config.reward_model sentiment-analysis:cerebras/Cerebras-GPT-6.7B" \
--command "accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml examples/scripts/run_ppo.py --ppo_config.exp_name run_ppo_Cerebras-GPT-6.7B_grad_accu_deepspeed_stage2 --ppo_config.batch_size 32 --ppo_config.mini_batch_size 32 --ppo_config.log_with wandb --ppo_config.model_name cerebras/Cerebras-GPT-6.7B --ppo_config.reward_model sentiment-analysis:cerebras/Cerebras-GPT-6.7B" \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
6 changes: 3 additions & 3 deletions benchmark/benchmark_level2_plot.sh
@@ -4,8 +4,8 @@ echo "we deal with $TAGS_STRING"

python -m openrlbenchmark.rlops_multi_metrics \
--filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
"sentiment_tuning$TAGS_STRING" \
"sentiment_tuning_gpt2xl_grad_accu$TAGS_STRING" \
"run_ppo$TAGS_STRING" \
"run_ppo_gpt2xl_grad_accu$TAGS_STRING" \
--env-ids sentiment-analysis:lvwerra/distilbert-imdb \
--no-check-empty-runs \
--pc.ncols 2 \
@@ -15,7 +15,7 @@ python -m openrlbenchmark.rlops_multi_metrics \

python -m openrlbenchmark.rlops_multi_metrics \
--filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
"sentiment_tuning_Cerebras-GPT-6.7B_grad_accu_deepspeed_stage2$TAGS_STRING" \
"run_ppo_Cerebras-GPT-6.7B_grad_accu_deepspeed_stage2$TAGS_STRING" \
--env-ids sentiment-analysis:cerebras/Cerebras-GPT-6.7B \
--no-check-empty-runs \
--pc.ncols 2 \
8 changes: 4 additions & 4 deletions benchmark/benchmark_level3.sh
@@ -1,6 +1,6 @@
## w/ and w/o gradient accumulation
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_step_grad_accu --ppo_config.mini_batch_size 1 --ppo_config.gradient_accumulation_steps 128 --ppo_config.log_with wandb" \
--command "python examples/scripts/run_ppo.py --ppo_config.exp_name run_ppo_step_grad_accu --ppo_config.mini_batch_size 1 --ppo_config.gradient_accumulation_steps 128 --ppo_config.log_with wandb" \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
@@ -12,7 +12,7 @@ python benchmark/benchmark.py \

## w/ different models (gpt2, gpt2-xl, falcon, llama2)
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_gpt2 --ppo_config.log_with wandb" \
--command "python examples/scripts/run_ppo.py --ppo_config.exp_name run_ppo_gpt2 --ppo_config.log_with wandb" \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
@@ -22,7 +22,7 @@ python benchmark/benchmark.py \
--slurm-total-cpus 12 \
--slurm-template-path benchmark/trl.slurm_template
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_falcon_rw_1b --ppo_config.model_name tiiuae/falcon-rw-1b --ppo_config.log_with wandb" \
--command "python examples/scripts/run_ppo.py --ppo_config.exp_name run_ppo_falcon_rw_1b --ppo_config.model_name tiiuae/falcon-rw-1b --ppo_config.log_with wandb" \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
@@ -35,7 +35,7 @@ python benchmark/benchmark.py \

## w/ and w/o PEFT
python benchmark/benchmark.py \
--command "python examples/scripts/sentiment_tuning.py --ppo_config.exp_name sentiment_tuning_peft --use_peft --ppo_config.log_with wandb" \
--command "python examples/scripts/run_ppo.py --ppo_config.exp_name run_ppo_peft --use_peft --ppo_config.log_with wandb" \
--num-seeds 3 \
--start-seed 1 \
--workers 10 \
6 changes: 3 additions & 3 deletions docs/source/customization.mdx
@@ -32,7 +32,7 @@ All of the trainers in TRL can be run on multiple GPUs together with DeepSpeed Z
accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero{1,2,3}.yaml --num_processes {NUM_GPUS} path_to_your_script.py --all_arguments_of_the_script
```

- Note that for ZeRO-3, a small tweak is needed to initialize your reward model on the correct device via the `zero3_init_context_manager()` context manager. In particular, this is needed to avoid DeepSpeed hanging after a fixed number of training steps. Here is a snippet of what is involved from the [`sentiment_tuning`](https://github.com/huggingface/trl/blob/main/examples/scripts/sentiment_tuning.py) example:
+ Note that for ZeRO-3, a small tweak is needed to initialize your reward model on the correct device via the `zero3_init_context_manager()` context manager. In particular, this is needed to avoid DeepSpeed hanging after a fixed number of training steps. Here is a snippet of what is involved from the [`sentiment_tuning`](https://github.com/huggingface/trl/blob/main/examples/scripts/run_ppo.py) example:

```python
ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
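# What follows is a sketch of how the collapsed snippet continues, not its verbatim text:
# the point is to disable ZeRO-3 weight partitioning while the reward model is instantiated
# so that it lands on the correct device. `is_zero3_init_enabled()` is assumed to be the
# DeepSpeedPlugin helper paired with the `zero3_init_context_manager()` mentioned above,
# and `pipeline` refers to `transformers.pipeline`.
if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
    with ds_plugin.zero3_init_context_manager(enable=False):
        sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb")
else:
    sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb")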
@@ -210,7 +210,7 @@ ppo_config = {
config = PPOConfig(**ppo_config)
```

To run `sentiment_tuning.py`, you can use the following command:
To run `run_ppo.py`, you can use the following command:
```
- python examples/scripts/sentiment_tuning.py --log_with wandb --use_score_scaling --use_score_norm --score_clip 0.5
+ python examples/scripts/run_ppo.py --log_with wandb --use_score_scaling --use_score_norm --score_clip 0.5
```
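For orientation, a minimal sketch of the config object the command above exercises, assuming the CLI flags map one-to-one onto `PPOConfig` fields (as the `ppo_config` dict quoted in this hunk suggests); the values are illustrative:

```python
from trl import PPOConfig

# Sketch only: the field names below mirror the command-line options shown above.
config = PPOConfig(
    log_with="wandb",        # report metrics to Weights & Biases
    use_score_scaling=True,  # rescale the reward-model scores
    use_score_norm=True,     # normalise scores (takes effect together with scaling)
    score_clip=0.5,          # clip scores to [-0.5, 0.5]
)
```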
6 changes: 3 additions & 3 deletions docs/source/ddpo_trainer.mdx
@@ -25,16 +25,16 @@ Note that the default implementation has a LoRA implementation path and a non-Lo

Also in addition, there is the expectation of providing a reward function and a prompt function. The reward function is used to evaluate the generated images and the prompt function is used to generate the prompts that are used to generate the images.

- ## Getting started with `examples/scripts/stable_diffusion_tuning.py`
+ ## Getting started with `examples/scripts/run_ddpo.py`

- The `stable_diffusion_tuning.py` script is a working example of using the `DDPO` trainer to finetune a Stable Diffusion model. This example explicitly configures a small subset of the overall parameters associated with the config object (`DDPOConfig`).
+ The `run_ddpo.py` script is a working example of using the `DDPO` trainer to finetune a Stable Diffusion model. This example explicitly configures a small subset of the overall parameters associated with the config object (`DDPOConfig`).

**Note:** one A100 GPU is recommended to get this running. Anything below a A100 will not be able to run this example script and even if it does via relatively smaller sized parameters, the results will most likely be poor.

Almost every configuration parameter has a default. There is only one commandline flag argument that is required of the user to get things up and running. The user is expected to have a [huggingface user access token](https://huggingface.co/docs/hub/security-tokens) that will be used to upload the model post finetuning to HuggingFace hub. The following bash command is to be entered to get things running

```batch
- python stable_diffusion_tuning.py --hf_user_access_token <token>
+ python run_ddpo.py --hf_user_access_token <token>
```

To obtain the documentation of `stable_diffusion_tuning.py`, please run `python stable_diffusion_tuning.py --help`
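As a rough illustration of the reward and prompt functions described earlier in this file, here is a heavily simplified sketch; the prompt vocabulary and the brightness-based reward are placeholders (not taken from `run_ddpo.py`), and the signatures follow the `(images, prompts, metadata)` convention as best understood:

```python
import random

ANIMALS = ["cat", "dog", "horse", "rabbit"]  # placeholder prompt vocabulary

def prompt_fn():
    # Return a (prompt, metadata) pair for each image to be generated.
    return f"a photo of a {random.choice(ANIMALS)}", {}

def reward_fn(images, prompts, metadata):
    # `images` is expected to be a batch of image tensors. Toy reward that favours
    # brighter images; a real run would plug in an aesthetic or CLIP-based scorer.
    # Returns (rewards, extra_logging_info).
    return images.float().mean(dim=(1, 2, 3)), {}
```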
4 changes: 2 additions & 2 deletions docs/source/dpo_trainer.mdx
@@ -1,6 +1,6 @@
# DPO Trainer

- TRL supports the DPO Trainer for training language models from preference data, as described in the paper [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290) by Rafailov et al., 2023. For a full example have a look at [`examples/dpo.py`](https://github.com/huggingface/trl/blob/main/examples/dpo.py).
+ TRL supports the DPO Trainer for training language models from preference data, as described in the paper [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290) by Rafailov et al., 2023. For a full example have a look at [`examples/run_dpo.py`](https://github.com/huggingface/trl/blob/main/examples/run_dpo.py).


The first step as always is to train your SFT model, to ensure the data we train on is in-distribution for the DPO algorithm.
@@ -60,7 +60,7 @@ The DPO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that

## Using the `DPOTrainer`

- For a detailed example have a look at the `examples/dpo.py` script. At a high level we need to initialize the `DPOTrainer` with a `model` we wish to train, a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response, the `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (ie decoder only or encoder-decoder).
+ For a detailed example have a look at the `examples/run_dpo.py` script. At a high level we need to initialize the `DPOTrainer` with a `model` we wish to train, a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response, the `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (ie decoder only or encoder-decoder).

```py
dpo_trainer = DPOTrainer(
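    # The remainder of this call is a sketch, not the collapsed part of the original
    # snippet; argument names follow the description above (a model to train, a frozen
    # reference model, the beta temperature, and a dataset whose rows hold the prompt,
    # chosen and rejected entries).
    model,
    ref_model,
    beta=0.1,                      # illustrative value for the implicit-reward temperature
    train_dataset=train_dataset,   # rows with prompt / chosen / rejected fields
    tokenizer=tokenizer,
)
dpo_trainer.train()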
8 changes: 4 additions & 4 deletions docs/source/example_overview.md
@@ -34,10 +34,10 @@ Then, it is encouraged to launch jobs with `accelerate launch`!

| File | Description |
|------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|
- | [`examples/scripts/sft_trainer.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_trainer.py) | This script shows how to use the `SFTTrainer` to fine tune a model or adapters into a target dataset. |
- | [`examples/scripts/reward_trainer.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/reward_trainer.py) | This script shows how to use the `RewardTrainer` to train a reward model on your own dataset. |
- | [`examples/scripts/sentiment_tuning.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sentiment_tuning.py) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using IMDB dataset |
- | [`examples/scripts/multi_adapter_rl.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/multi_adapter_rl.py) | This script shows how to use the `PPOTrainer` to train a single base model with multiple adapters. Requires you to run the example script with the reward model training beforehand. |
+ | [`examples/scripts/run_sft.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/run_sft.py) | This script shows how to use the `SFTTrainer` to fine tune a model or adapters into a target dataset. |
+ | [`examples/scripts/run_reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/run_reward_modeling.py) | This script shows how to use the `RewardTrainer` to train a reward model on your own dataset. |
+ | [`examples/scripts/run_ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/run_ppo.py) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using IMDB dataset |
+ | [`examples/scripts/run_ppo_multi_adapter.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/run_ppo_multi_adapter.py) | This script shows how to use the `PPOTrainer` to train a single base model with multiple adapters. Requires you to run the example script with the reward model training beforehand. |
| [`examples/scripts/stable_diffusion_tuning_example.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/stable_diffusion_tuning_example.py) | This script shows to use DDPOTrainer to fine-tune a stable diffusion model using reinforcement learning. |

Here are also some easier-to-run colab notebooks that you can use to get started with TRL:
2 changes: 1 addition & 1 deletion docs/source/lora_tuning_peft.mdx
@@ -140,5 +140,5 @@ python PATH_TO_SCRIPT
You can easily fine-tune Llama2 model using `SFTTrainer` and the official script! For example to fine-tune llama2-7b on the Guanaco dataset, run (tested on a single NVIDIA T4-16GB):

```bash
- python examples/scripts/sft_trainer.py --model_name meta-llama/Llama-2-7b-hf --dataset_name timdettmers/openassistant-guanaco --load_in_4bit --use_peft --batch_size 4 --gradient_accumulation_steps 2
+ python examples/scripts/run_sft.py --model_name meta-llama/Llama-2-7b-hf --dataset_name timdettmers/openassistant-guanaco --load_in_4bit --use_peft --batch_size 4 --gradient_accumulation_steps 2
```
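For readers who prefer the Python API over the CLI, a minimal sketch of roughly what that command does; the LoRA settings are illustrative and the 4-bit loading details are omitted, so this is not the script's actual configuration:

```python
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer

# The Guanaco dataset stores each conversation in a "text" column.
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
peft_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, task_type="CAUSAL_LM")

trainer = SFTTrainer(
    model="meta-llama/Llama-2-7b-hf",  # SFTTrainer also accepts an already-instantiated model
    train_dataset=dataset,
    dataset_text_field="text",
    peft_config=peft_config,
)
trainer.train()
```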
2 changes: 1 addition & 1 deletion docs/source/multi_adapter_rl.mdx
@@ -11,7 +11,7 @@ You just need to install `peft` and optionally install `bitsandbytes` as well if
You need to address this approach in three stages that we summarize as follows:

1- Train a base model on the target domain (e.g. `imdb` dataset) - this is the Supervised Fine Tuning stage - it can leverage the `SFTTrainer` from TRL.
- 2- Train a reward model using `peft`. This is required in order to re-use the adapter during the RL optimisation process (step 3 below). We show an example of leveraging the `RewardTrainer` from TRL in [this example](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_trainer.py)
+ 2- Train a reward model using `peft`. This is required in order to re-use the adapter during the RL optimisation process (step 3 below). We show an example of leveraging the `RewardTrainer` from TRL in [this example](https://github.com/huggingface/trl/tree/main/examples/scripts/run_reward_modeling.py)
3- Fine tune new adapters on the base model using PPO and the reward adapter. ("0 abstraction RL")

Make sure to use the same model (i.e. same architecture and same weights) for the stages 2 & 3.
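To make stage 3 of the recipe above concrete, a heavily hedged sketch: the model and adapter identifiers are placeholders, and the `reward_adapter` / `compute_reward_score` usage reflects the multi-adapter setup as best understood, so treat it as an outline rather than the documented interface.

```python
from peft import LoraConfig
from transformers import AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

base_model_id = "your-sft-base-model"        # stage 1 output (placeholder name)
rm_adapter_id = "your-reward-model-adapter"  # stage 2 output (placeholder name)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# One base model carrying two adapters: a trainable policy adapter plus the frozen reward adapter.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    base_model_id,
    peft_config=LoraConfig(task_type="CAUSAL_LM", r=16, lora_alpha=32),
    reward_adapter=rm_adapter_id,
)
ppo_trainer = PPOTrainer(PPOConfig(), model=model, tokenizer=tokenizer)

# Inside the PPO loop the reward adapter scores query+response pairs in place of a
# separate reward model:
inputs = tokenizer("a query followed by a sampled response", return_tensors="pt")
rewards = model.compute_reward_score(**inputs)
```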
2 changes: 1 addition & 1 deletion docs/source/reward_trainer.mdx
@@ -2,7 +2,7 @@

TRL supports custom reward modeling for anyone to perform reward modeling on their dataset and model.

- Check out a complete flexible example inside [`examples/scripts`](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_trainer.py) folder.
+ Check out a complete flexible example inside [`examples/scripts`](https://github.com/huggingface/trl/tree/main/examples/scripts/run_reward_modeling.py) folder.

## Expected dataset format
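The format section itself is collapsed in this diff, but for orientation: the `RewardTrainer` consumes pairwise preference data. A hedged sketch of tokenising a chosen/rejected pair into the column names the trainer is assumed to look for:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def to_pairwise_features(example):
    # Assumed columns: a preferred ("chosen") and a dispreferred ("rejected")
    # completion for the same prompt, tokenised separately.
    chosen = tokenizer(example["chosen"], truncation=True)
    rejected = tokenizer(example["rejected"], truncation=True)
    return {
        "input_ids_chosen": chosen["input_ids"],
        "attention_mask_chosen": chosen["attention_mask"],
        "input_ids_rejected": rejected["input_ids"],
        "attention_mask_rejected": rejected["attention_mask"],
    }
```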
