RLHF with PPO #1005

Merged
merged 44 commits on Aug 5, 2024
Changes from 31 commits

Commits (44)
11d88a2
Refactoring TransformerDecoder and adding value-head transformers
SalmanMohammadi May 9, 2024
2849ec5
adding ppo config and recipe to registry
SalmanMohammadi May 10, 2024
f0c1410
Merge branch 'pytorch:main' into ppo
SalmanMohammadi May 12, 2024
57c67bf
implemented ppo recipe structure, advantage and return estimation, tr…
SalmanMohammadi May 15, 2024
03cba4b
finished first pass implementation of ppo. added tests for ppo loss
SalmanMohammadi May 15, 2024
f50f047
reverting changes
SalmanMohammadi May 15, 2024
b034af7
adding lora to ppo recipe, adding lora value head component and model…
SalmanMohammadi May 16, 2024
466b683
added lora training, added value head checkpointing and recipe resumi…
SalmanMohammadi May 19, 2024
928037d
removing test model builders, adding batched generation to ppo recipe…
SalmanMohammadi May 21, 2024
68b6162
fixing bug in _checkpointer.py
SalmanMohammadi May 21, 2024
65ca12a
Adding support for user-provided masks in attention
SalmanMohammadi May 30, 2024
9d8c5a8
Merge branch 'pytorch:main' into ppo
SalmanMohammadi May 31, 2024
b99102c
merging transformer custom masking, adding support for generation wit…
SalmanMohammadi Jun 4, 2024
a1cde1c
adding functionality for truncation in generation, and further tests …
SalmanMohammadi Jun 4, 2024
b032778
updated lora recipe to use custom generation
SalmanMohammadi Jun 6, 2024
f126e9a
Merge branch 'pytorch:main' into ppo
SalmanMohammadi Jun 6, 2024
04d514a
added support for correct truncation and padding of responses, added …
SalmanMohammadi Jun 7, 2024
4854908
added correct mask and position id trajectory generation, score rejec…
SalmanMohammadi Jun 8, 2024
c885833
bugfixing in ppo recipe. refactoring ppo_utils and tests to individua…
SalmanMohammadi Jun 8, 2024
57d57fa
updating ppo_utils namespace
SalmanMohammadi Jun 8, 2024
cce5548
fixing bug in collation, updating loss tests
SalmanMohammadi Jun 10, 2024
c289566
bugfixes in masking and indexing logprobs and values, added fixed kl …
SalmanMohammadi Jun 12, 2024
a3fa1ea
added loss and value masking
SalmanMohammadi Jun 14, 2024
c3db142
some refactoring, lots of testing and docs
SalmanMohammadi Jun 16, 2024
589bf7d
improved early training stability by adding value head init. from rew…
SalmanMohammadi Jun 16, 2024
346c30b
updating metrics
SalmanMohammadi Jun 18, 2024
2e9d779
reworking causal masking
SalmanMohammadi Jun 18, 2024
46b75be
freeing up memory after steps to avoid mem leaks
SalmanMohammadi Jun 18, 2024
0fd885e
Merge branch 'main' into ppo
SalmanMohammadi Jul 16, 2024
1942b0f
cleaning up; verifying results; switching to full finetune
SalmanMohammadi Jul 16, 2024
58d92ab
tidying up
SalmanMohammadi Jul 16, 2024
1fbb6dc
detaching losses for metric logging
SalmanMohammadi Jul 18, 2024
65ef9dc
removing 1b, merging main
SalmanMohammadi Jul 25, 2024
c7bbff1
merging
SalmanMohammadi Jul 25, 2024
1129f9e
deleting logits in loss
SalmanMohammadi Jul 29, 2024
fe87dfb
Merge branch 'main' into ppo
SalmanMohammadi Aug 2, 2024
662ab2c
cleaning conf
SalmanMohammadi Aug 2, 2024
76b124f
pYdOcLiNt
SalmanMohammadi Aug 2, 2024
dc4887c
downloading weights
SalmanMohammadi Aug 3, 2024
ef85dba
addressing comments
SalmanMohammadi Aug 5, 2024
fd87fe6
updating test
SalmanMohammadi Aug 5, 2024
ba365a8
let's finish this the way we started... together
SalmanMohammadi Aug 5, 2024
e76304c
Merge branch 'main' into ppo
SalmanMohammadi Aug 5, 2024
4e6be43
lInTiNG
SalmanMohammadi Aug 5, 2024
15 changes: 15 additions & 0 deletions docs/source/api_ref_modules.rst
@@ -81,6 +81,7 @@ Loss
    :nosignatures:

    loss.DPOLoss
    loss.PPOLoss


Vision Transforms
@@ -96,3 +97,17 @@ Functions used for preprocessing images.
    transforms.tile_crop
    transforms.find_supported_resolutions
    transforms.VisionCrossAttentionMask

Reinforcement Learning From Human Feedback (RLHF)
--------------------------------------------------
Components for RLHF algorithms like PPO.

.. autosummary::
    :toctree: generated/
    :nosignatures:

    rlhf.estimate_advantages
    rlhf.get_rewards_ppo
    rlhf.truncate_sequence_at_first_stop_token
    rlhf.left_padded_collate
    rlhf.padded_collate_dpo
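For orientation, the helpers documented above are the building blocks the PPO recipe strings together each step. The sketch below shows one plausible way they compose; the signatures, return values, and the `torchtune.modules.rlhf` import path are assumptions based on this diff, not a documented API.

```python
import torch
from torchtune.modules import rlhf  # namespace introduced in this PR (assumed import path)

# Toy stand-ins for quantities the recipe collects while generating trajectories:
# 2 responses, 6 generated tokens each.
responses = torch.tensor([[264, 338, 29889, 0, 0, 0], [372, 471, 1781, 29889, 2, 0]])
logprobs = torch.randn(2, 6)       # log-probs under the current policy
ref_logprobs = torch.randn(2, 6)   # log-probs under the frozen reference policy
values = torch.randn(2, 6)         # value-head predictions
scores = torch.tensor([0.7, 1.3])  # reward-model score per response

# Truncate each response after its first stop token (eos=2, llama2 "."=29889).
# Assumed return convention: a padding mask plus the truncated sequences.
padding_mask, responses = rlhf.truncate_sequence_at_first_stop_token(
    responses, stop_tokens=torch.tensor([2, 29889]), fill_value=0
)

# Fold a per-token KL penalty against the reference policy into the rewards.
rewards, kl, kl_rewards = rlhf.get_rewards_ppo(scores, logprobs, ref_logprobs, kl_coeff=0.01)

# Estimate per-token advantages and value targets with GAE.
advantages, returns = rlhf.estimate_advantages(values, rewards, gamma=1.0, lmbda=0.95)
```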
1 change: 0 additions & 1 deletion docs/source/api_ref_utilities.rst
@@ -115,7 +115,6 @@ Utilities for working with data and datasets.
    :nosignatures:

    padded_collate
    padded_collate_dpo

.. _gen_label:

182 changes: 182 additions & 0 deletions recipes/configs/llama2/1B_full_ppo.yaml
@@ -0,0 +1,182 @@
# Config for single device RLHF full finetuning using PPO in ppo_full_finetune_single_device.py
# using a TinyLlama2 1B model.
#
# This config uses hyperparameters based on a small set of experiments and information
# available from existing implementations.
#
# This config assumes that you've run the following command before launching
# this run:
# tune download TinyLlama/TinyLlama_v1.1 --hf-token <HF_TOKEN> --output-dir /tmp/TinyLlama_v1.1
# tune download smohammadi/tinyllama_rm_sentiment_1b --hf-token <HF_TOKEN> --output-dir /tmp/tinyllama_rm_sentiment_1b --ignore-patterns ""
#
# You'll also need to ensure that {output_dir} exists beforehand, as checkpoints for policy and value models are saved in sub-folders.
# To launch on a single device, run the following command from root:
# tune run ppo_full_finetune_single_device --config llama2/1B_full_ppo
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run ppo_full_finetune_single_device --config llama2/1B_full_ppo checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama2.llama2_tokenizer
  path: /tmp/TinyLlama_v1.1/tokenizer.model

# Dataset
dataset:
  _component_: torchtune.datasets.text_completion_dataset
  source: trl-internal-testing/sentiment-trl-style
  max_seq_len: null
  split: train
  column: prompt
  add_eos: False


# manually constructing 1B models
policy_model:
  _component_: torchtune.models.llama2.llama2
  vocab_size: 32000
  num_layers: 22
  num_heads: 32
  num_kv_heads: 4
  embed_dim: 2048
  max_seq_len: 2048
  intermediate_dim: 5632
  attn_dropout: 0.0
  norm_eps: 1e-5

reward_and_value_model:
  _component_: torchtune.models.llama2.llama2_classifier
  num_classes: 1
  vocab_size: 32000
  num_layers: 22
  num_heads: 32
  num_kv_heads: 4
  embed_dim: 2048
  max_seq_len: 2048
  intermediate_dim: 5632
  attn_dropout: 0.0
  norm_eps: 1e-5

# checkpointer for the policy model - update this if resuming from checkpoint
checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/TinyLlama_v1.1
  checkpoint_files: [
    "pytorch_model.bin",
  ]
  # this is the only place where you should update `recipe_checkpoint` if resuming training
  recipe_checkpoint: null
  output_dir: ${output_dir}/policy
  model_type: LLAMA2

# this should be set up identically to the policy model checkpointer at the start of training
# ensure `checkpoint_files` always points to the original policy weights, even if resuming training
ref_policy_checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/TinyLlama_v1.1
  checkpoint_files: [
    "pytorch_model.bin",
  ]
  output_dir: ${output_dir}/policy
  model_type: LLAMA2

# checkpointer for the value model - update this if resuming from checkpoint
# since this model will be identical to the reward model, it's helpful to initialise it
# from the trained reward model weights
value_checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/tinyllama_rm_sentiment_1b
  # only `checkpoint_files` needs to be updated if resuming training
  checkpoint_files: [
    "model.safetensors"
  ]
  output_dir: ${output_dir}/value
  model_type: MISTRAL_REWARD

# checkpointer for the reward model, ensure `checkpoint_files`
# always points to the original reward model weights, even if resuming training
reward_checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/tinyllama_rm_sentiment_1b
  checkpoint_files: [
    "model.safetensors"
  ]
  output_dir: ${output_dir}/value
  model_type: MISTRAL_REWARD


# Training env
device: cuda

# Training arguments
batch_size: 256
num_steps: 100000
ppo_epochs: 2
ppo_batch_size: 128
gradient_accumulation_steps: 8

# Memory management and performance
compile: True
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-6

optimizer_in_bwd: False
log_peak_memory_stats: False
enable_activation_checkpointing: True

# Reduced precision
dtype: bf16

# Trajectory generation arguments

# batch size for forward pass during generation
forward_batch_size: 32
max_generated_tokens: 58
temperature: 0.7
top_k: null

# Reward args

# parameter for penalising generations shorter than `min_response_length`
min_response_length: 18
# parameter for penalising generations without a stop token
penalise_no_eos: True
# scalar penalty to apply when penalising
reward_penalty: -3

# tokens to consider as "end of sequence" tokens
stop_token_ids: [
  2, # eos_id
  29889 # llama2 "." token
]
whiten_rewards: False
# GAE hyperparameters
gamma: 1
lmbda: 0.95

# PPO hyperparameters
loss:
  _component_: torchtune.modules.loss.PPOLoss
  epsilon: 0.2
  value_coeff: 0.1
  value_clip_range: 0.2
  kl_coeff: 0.01


# Logging
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}

log_every_n_steps: 1

resume_from_checkpoint: False
output_dir: /tmp/llama2-1b-ppo-finetune
seed: null
shuffle: True
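A few of the hyperparameters in this config are easier to follow with a worked illustration. First, the reward arguments (`min_response_length`, `penalise_no_eos`, `reward_penalty`) describe a simple shaping rule: a response that is shorter than `min_response_length`, or that never emits one of `stop_token_ids`, receives `reward_penalty` in place of its reward-model score. A minimal sketch of that rule; the helper name and the choice to replace (rather than add to) the score are assumptions, not the recipe's exact behaviour:

```python
import torch

def apply_reward_penalties(
    scores: torch.Tensor,            # [batch] reward-model score per response
    response_lengths: torch.Tensor,  # [batch] number of generated tokens per response
    has_stop_token: torch.Tensor,    # [batch] bool, True if a stop token was generated
    min_response_length: int = 18,
    penalise_no_eos: bool = True,
    reward_penalty: float = -3.0,
) -> torch.Tensor:
    # penalise responses that are too short
    penalised = response_lengths < min_response_length
    # optionally penalise responses that never produced a stop token
    if penalise_no_eos:
        penalised |= ~has_stop_token
    # replace the score with the scalar penalty for penalised responses
    return torch.where(penalised, torch.full_like(scores, reward_penalty), scores)
```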
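Next, `gamma` and `lmbda` are the standard discount and trace-decay parameters of Generalized Advantage Estimation. A compact sketch of the estimator they control, assuming per-token `rewards` and `values` of shape `[batch, response_len]` and ignoring the padding masks the recipe also handles:

```python
import torch

def gae(values: torch.Tensor, rewards: torch.Tensor, gamma: float = 1.0, lmbda: float = 0.95):
    """Generalized Advantage Estimation.

    delta_t = r_t + gamma * V_{t+1} - V_t
    A_t     = delta_t + gamma * lmbda * A_{t+1}
    """
    batch_size, seq_len = values.shape
    advantages = torch.zeros_like(values)
    running = torch.zeros_like(values[:, 0])
    for t in reversed(range(seq_len)):
        next_values = values[:, t + 1] if t + 1 < seq_len else torch.zeros_like(values[:, 0])
        delta = rewards[:, t] + gamma * next_values - values[:, t]
        running = delta + gamma * lmbda * running
        advantages[:, t] = running
    # value targets ("returns") used to train the value head
    returns = advantages + values
    return advantages, returns
```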
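Finally, the `loss` section corresponds to the usual clipped PPO objective. The sketch below shows how `epsilon`, `value_coeff`, and `value_clip_range` typically enter that objective; it is a generic reference, not necessarily identical to `torchtune.modules.loss.PPOLoss`, and the `kl_coeff` penalty against the reference policy is commonly folded into the per-token rewards rather than this term:

```python
import torch

def clipped_ppo_loss(
    logprobs: torch.Tensor,       # log-probs of the current policy for the sampled tokens
    old_logprobs: torch.Tensor,   # log-probs recorded when the trajectory was generated
    advantages: torch.Tensor,     # GAE advantages
    values: torch.Tensor,         # current value-head predictions
    old_values: torch.Tensor,     # value predictions recorded at generation time
    returns: torch.Tensor,        # GAE value targets
    epsilon: float = 0.2,
    value_coeff: float = 0.1,
    value_clip_range: float = 0.2,
) -> torch.Tensor:
    # clipped policy objective
    ratio = torch.exp(logprobs - old_logprobs)
    policy_loss = -torch.min(
        ratio * advantages,
        torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantages,
    ).mean()

    # clipped value loss, regressing towards the GAE returns
    clipped_values = old_values + (values - old_values).clamp(-value_clip_range, value_clip_range)
    value_loss = torch.max((values - returns) ** 2, (clipped_values - returns) ** 2).mean()

    return policy_loss + value_coeff * value_loss
```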