From d493f873262b8d5ca7ad0c2436be25fd2076708e Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 09:44:13 +0000
Subject: [PATCH 01/14] v1

---
 docs/source/_toctree.yml |  6 ++++++
 docs/source/models.mdx   | 11 +++++++++++
 docs/source/trainer.mdx  |  8 ++++++++
 3 files changed, 25 insertions(+)
 create mode 100644 docs/source/models.mdx
 create mode 100644 docs/source/trainer.mdx

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 7bc9c528d5..08a4034530 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -6,6 +6,12 @@
   - local: installation
     title: Installation
   title: Get started
+- sections:
+  - local: models
+    title: Model Classes
+  - local: trainer
+    title: Trainer Classes
+  title: API
 - sections: 
   - local: sentiment_tuning
     title: Sentiment Tuning
diff --git a/docs/source/models.mdx b/docs/source/models.mdx
new file mode 100644
index 0000000000..cd2cb912b6
--- /dev/null
+++ b/docs/source/models.mdx
@@ -0,0 +1,11 @@
+# Models
+
+TRL supports various model architectures including most used text generative models. 
+
+## AutoModelForCausalLMWithValueHead
+
+[[autodoc]] AutoModelForCausalLMWithValueHead
+
+- forward
+- save_pretrained
+- push_to_hub
\ No newline at end of file
diff --git a/docs/source/trainer.mdx b/docs/source/trainer.mdx
new file mode 100644
index 0000000000..6d1555f84d
--- /dev/null
+++ b/docs/source/trainer.mdx
@@ -0,0 +1,8 @@
+# Trainer
+
+At TRL we plan to release several RLHF algorithms. We started our journey with PPO (Proximal Policy Optimisation), with an implementation that largely follows the structure introduced in the paper "Fine-Tuning Language Models from Human Preferences" by D. Ziegler et al. [[paper](https://arxiv.org/pdf/1909.08593.pdf), [code](https://github.com/openai/lm-human-preferences)].
+
+## PPOTrainer
+
+[[autodoc]] PPOTrainer
+

From e0ba887729478d9b95cdae33b9e638774fa01682 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 12:39:19 +0000
Subject: [PATCH 02/14] update doc

---
 docs/source/models.mdx            |  9 +++++---
 trl/models/modeling_base.py       | 38 +++++++++++++++----------------
 trl/models/modeling_value_head.py | 32 +++++++++++++++++++++-----
 3 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/docs/source/models.mdx b/docs/source/models.mdx
index cd2cb912b6..dee9e460db 100644
--- a/docs/source/models.mdx
+++ b/docs/source/models.mdx
@@ -2,10 +2,13 @@
 
 TRL supports various model architectures including most used text generative models. 
 
+## PreTrainedModelWrapper
+
+[[autodoc]] PreTrainedModelWrapper
+
 ## AutoModelForCausalLMWithValueHead
 
+
 [[autodoc]] AutoModelForCausalLMWithValueHead
 
-- forward
-- save_pretrained
-- push_to_hub
\ No newline at end of file
+- _init_weights
\ No newline at end of file
diff --git a/trl/models/modeling_base.py b/trl/models/modeling_base.py
index 6eb0d76f5c..7d4a7042cc 100644
--- a/trl/models/modeling_base.py
+++ b/trl/models/modeling_base.py
@@ -26,14 +26,13 @@ class PreTrainedModelWrapper(nn.Module):
     (`~transformers.PreTrained`) class in order to keep some attributes and methods of the
     (`~transformers.PreTrainedModel`) class.
 
-    Attributes
-    ----------
-    pretrained_model: (`transformers.PreTrainedModel`)
-        The model to be wrapped.
-    parent_class: (`transformers.PreTrainedModel`)
-        The parent class of the model to be wrapped.
-    supported_args: (`list`)
-        The list of arguments that are supported by the wrapper class.
+    Attributes:
+        pretrained_model: (`transformers.PreTrainedModel`)
+            The model to be wrapped.
+        parent_class: (`transformers.PreTrainedModel`)
+            The parent class of the model to be wrapped.
+        supported_args: (`list`)
+            The list of arguments that are supported by the wrapper class.
     """
     transformers_parent_class = None
     supported_args = None
@@ -47,18 +46,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         r"""
         Instantiates a new model from a pretrained model.
 
-        Parameters
-        ----------
-        pretrained_model_name_or_path: (`str` or `transformers.PreTrainedModel`)
-            The path to the pretrained model or its name.
-        *model_args:
-            Additional positional arguments passed along to the underlying model's
-            `from_pretrained` method.
-        **kwargs:
-            Additional keyword arguments passed along to the underlying model's
-            `from_pretrained` method. We also pre-process the kwargs to extract
-            the arguments that are specific to the `transformers.PreTrainedModel`
-            class and the arguments that are specific to trl models.
+        Args:
+            pretrained_model_name_or_path: (`str` or `transformers.PreTrainedModel`)
+                The path to the pretrained model or its name.
+            *model_args (`list`, *optional*)):
+                Additional positional arguments passed along to the underlying model's
+                `from_pretrained` method.
+            **kwargs (`dict`, *optional*):
+                Additional keyword arguments passed along to the underlying model's
+                `from_pretrained` method. We also pre-process the kwargs to extract
+                the arguments that are specific to the `transformers.PreTrainedModel`
+                class and the arguments that are specific to trl models.
         """
         if kwargs is not None:
             trl_model_args, pretrained_kwargs = cls._split_kwargs(kwargs)
diff --git a/trl/models/modeling_value_head.py b/trl/models/modeling_value_head.py
index f90f863025..ca9936ef7d 100644
--- a/trl/models/modeling_value_head.py
+++ b/trl/models/modeling_value_head.py
@@ -59,15 +59,15 @@ class AutoModelForCausalLMWithValueHead(PreTrainedModelWrapper):
     An autoregressive model with a value head in addition to the language model head.
     This class inherits from `~trl.PreTrainedModelWrapper` and wraps a
     `transformers.PreTrainedModel` class. The wrapper class supports classic functions
-    such as `from_pretrained` and `push_to_hub` and also provides some additional
-    functionalities such as `generate`.
+    such as `from_pretrained`, `push_to_hub` and `generate`. To call a method of the wrapped
+    model, simply manipulate the `pretrained_model` attribute of this class.
 
     Args:
         pretrained_model (`transformers.PreTrainedModel`):
             The model to wrap. It should be a causal language model such as GPT2.
             or any model mapped inside the `AutoModelForCausalLM` class.
-        kwargs:
-            Additional keyword arguments passed along to the `ValueHead` class.
+        kwargs (`dict`, `optional`):
+            Additional keyword arguments, that are passed to the `ValueHead` class.
     """
     transformers_parent_class = AutoModelForCausalLM
     lm_head_namings = ["lm_head", "embed_out"]
@@ -90,7 +90,10 @@ def __init__(self, pretrained_model, **kwargs):
 
     def _init_weights(self, **kwargs):
         r"""
-        We initialize the weights of the value head.
+        Initializes the weights of the value head. The default initialization strategy is random.
+        Users can pass a different initialization strategy by passing the `v_head_init_strategy` argument
+        when calling `.from_pretrained`. Supported strategies are:
+        - `normal`: initializes the weights with a normal distribution.
         """
         initializer_range = kwargs.pop("v_head_initializer_range", 0.2)
         # random init by default
@@ -109,6 +112,21 @@ def forward(
         attention_mask=None,
         **kwargs,
     ):
+        r"""
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, `optional`):
+                Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+                (see `past_key_values` input) to speed up sequential decoding.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            kwargs (`dict`, `optional`):
+                Additional keyword arguments, that are passed to the wrapped model.
+        """
         base_model_output = self.pretrained_model(
             input_ids=input_ids,
             past_key_values=past_key_values,
@@ -127,6 +145,8 @@ def forward(
 
     def generate(self, *args, **kwargs):
         r"""
-        We call `generate` on the wrapped model.
+        A simple wrapper around the `generate` method of the wrapped model.
+        Please refer to the [`generate`](https://huggingface.co/docs/transformers/internal/generation_utils)
+        method of the wrapped model for more information about the supported arguments.
         """
         return self.pretrained_model.generate(*args, **kwargs)

From fe314569dfdef70068438109d613558194f24a29 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 12:40:05 +0000
Subject: [PATCH 03/14] update autodoc

---
 docs/source/models.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models.mdx b/docs/source/models.mdx
index dee9e460db..dbf7137809 100644
--- a/docs/source/models.mdx
+++ b/docs/source/models.mdx
@@ -11,4 +11,4 @@ TRL supports various model architectures including most used text generative mod
 
 [[autodoc]] AutoModelForCausalLMWithValueHead
 
-- _init_weights
\ No newline at end of file
+    - _init_weights
\ No newline at end of file

From 8e68e654ed8ef9280c87ea49aa1d1bdc9d5abe0f Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 12:42:31 +0000
Subject: [PATCH 04/14] update doc

---
 trl/models/modeling_base.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/trl/models/modeling_base.py b/trl/models/modeling_base.py
index 7d4a7042cc..23bc48aa8d 100644
--- a/trl/models/modeling_base.py
+++ b/trl/models/modeling_base.py
@@ -102,13 +102,33 @@ def _split_kwargs(cls, kwargs):
 
     def push_to_hub(self, *args, **kwargs):
         r"""
-        Push the pretrained model to the hub.
+        Push the pretrained model to the hub. This method is a wrapper around
+        `transformers.PreTrainedModel.push_to_hub`. Please refer to the documentation
+        of `transformers.PreTrainedModel.push_to_hub` for more information.
+
+        Args:
+            *args: (`list`, *optional*)
+                Positional arguments passed along to the underlying model's
+                `push_to_hub` method.
+            **kwargs: (`dict`, *optional*)
+                Keyword arguments passed along to the underlying model's
+                `push_to_hub` method.
         """
         return self.pretrained_model.push_to_hub(*args, **kwargs)
 
     def save_pretrained(self, *args, **kwargs):
         r"""
-        Save the pretrained model to a directory.
+        Save the pretrained model to a directory. This method is a wrapper around
+        `transformers.PreTrainedModel.save_pretrained`. Please refer to the documentation
+        of `transformers.PreTrainedModel.save_pretrained` for more information.
+
+        Args:
+            *args: (`list`, *optional*)
+                Positional arguments passed along to the underlying model's
+                `save_pretrained` method.
+            **kwargs: (`dict`, *optional*)
+                Keyword arguments passed along to the underlying model's
+                `save_pretrained` method.
         """
         return self.pretrained_model.save_pretrained(*args, **kwargs)
 

From 64ec38e3d526391f00abd93a2489dd02674bf282 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 12:48:43 +0000
Subject: [PATCH 05/14] more docs and `PreTrainedModelWrapper` in public `init`

---
 trl/__init__.py                   | 2 +-
 trl/models/modeling_value_head.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/trl/__init__.py b/trl/__init__.py
index 7dece2c2e0..2e022e715d 100644
--- a/trl/__init__.py
+++ b/trl/__init__.py
@@ -2,5 +2,5 @@
 
 __version__ = "0.1.1"
 
-from .models import AutoModelForCausalLMWithValueHead, create_reference_model
+from .models import AutoModelForCausalLMWithValueHead, PreTrainedModelWrapper, create_reference_model
 from .trainer import PPOConfig, PPOTrainer
diff --git a/trl/models/modeling_value_head.py b/trl/models/modeling_value_head.py
index ca9936ef7d..2e8b850910 100644
--- a/trl/models/modeling_value_head.py
+++ b/trl/models/modeling_value_head.py
@@ -113,6 +113,7 @@ def forward(
         **kwargs,
     ):
         r"""
+        Applies a forward pass to the wrapped model and returns the logits of the value head.
 
         Args:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

From 6ce201b4608607947e7c5aa03ca7ed80c8f2408f Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 12:55:50 +0000
Subject: [PATCH 06/14] update

---
 docs/source/models.mdx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/models.mdx b/docs/source/models.mdx
index dbf7137809..fd2aa0bede 100644
--- a/docs/source/models.mdx
+++ b/docs/source/models.mdx
@@ -10,5 +10,6 @@ TRL supports various model architectures including most used text generative mod
 
 
 [[autodoc]] AutoModelForCausalLMWithValueHead
-
+    - forward
+    - generate
     - _init_weights
\ No newline at end of file

From 90a4acfb3e672857aa0d86434a80b8422fc3e6bd Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 13:09:13 +0000
Subject: [PATCH 07/14] update

---
 docs/source/models.mdx            |  1 +
 trl/models/modeling_base.py       | 10 +++----
 trl/models/modeling_value_head.py | 46 +++++++++++++++++++++++++++----
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/docs/source/models.mdx b/docs/source/models.mdx
index fd2aa0bede..7b0fe54ad8 100644
--- a/docs/source/models.mdx
+++ b/docs/source/models.mdx
@@ -10,6 +10,7 @@ TRL supports various model architectures including most used text generative mod
 
 
 [[autodoc]] AutoModelForCausalLMWithValueHead
+    - __init__
     - forward
     - generate
     - _init_weights
\ No newline at end of file
diff --git a/trl/models/modeling_base.py b/trl/models/modeling_base.py
index 23bc48aa8d..f1f84cd0a4 100644
--- a/trl/models/modeling_base.py
+++ b/trl/models/modeling_base.py
@@ -47,7 +47,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         Instantiates a new model from a pretrained model.
 
         Args:
-            pretrained_model_name_or_path: (`str` or `transformers.PreTrainedModel`)
+            pretrained_model_name_or_path (`str` or `transformers.PreTrainedModel`):
                 The path to the pretrained model or its name.
             *model_args (`list`, *optional*)):
                 Additional positional arguments passed along to the underlying model's
@@ -107,10 +107,10 @@ def push_to_hub(self, *args, **kwargs):
         of `transformers.PreTrainedModel.push_to_hub` for more information.
 
         Args:
-            *args: (`list`, *optional*)
+            *args (`list`, *optional*):
                 Positional arguments passed along to the underlying model's
                 `push_to_hub` method.
-            **kwargs: (`dict`, *optional*)
+            **kwargs (`dict`, *optional*):
                 Keyword arguments passed along to the underlying model's
                 `push_to_hub` method.
         """
@@ -123,10 +123,10 @@ def save_pretrained(self, *args, **kwargs):
         of `transformers.PreTrainedModel.save_pretrained` for more information.
 
         Args:
-            *args: (`list`, *optional*)
+            *args (`list`, *optional*):
                 Positional arguments passed along to the underlying model's
                 `save_pretrained` method.
-            **kwargs: (`dict`, *optional*)
+            **kwargs (`dict`, *optional*):
                 Keyword arguments passed along to the underlying model's
                 `save_pretrained` method.
         """
diff --git a/trl/models/modeling_value_head.py b/trl/models/modeling_value_head.py
index 2e8b850910..0d9be496d5 100644
--- a/trl/models/modeling_value_head.py
+++ b/trl/models/modeling_value_head.py
@@ -62,12 +62,24 @@ class AutoModelForCausalLMWithValueHead(PreTrainedModelWrapper):
     such as `from_pretrained`, `push_to_hub` and `generate`. To call a method of the wrapped
     model, simply manipulate the `pretrained_model` attribute of this class.
 
-    Args:
-        pretrained_model (`transformers.PreTrainedModel`):
-            The model to wrap. It should be a causal language model such as GPT2.
-            or any model mapped inside the `AutoModelForCausalLM` class.
-        kwargs (`dict`, `optional`):
-            Additional keyword arguments, that are passed to the `ValueHead` class.
+    Class attributes:
+        - **transformers_parent_class** (`transformers.PreTrainedModel`) -- The parent class of the wrapped model. This
+            should be set to `transformers.AutoModelForCausalLM` for this class.
+        - **lm_head_namings** (`tuple`) -- A tuple of strings that are used to identify the language model head of the
+            wrapped model. This is set to `("lm_head", "embed_out")` for this class but can be changed for other models
+            in the future.
+        - **supported_args** (`tuple`) -- A tuple of strings that are used to identify the arguments that are supported
+            by the `ValueHead` class. Currently the supported args are:
+            - **summary_dropout_prob** (`float`, `optional`, defaults to `None`) -- The dropout probability for the
+                `ValueHead` class.
+            - **v_head_initializer_range** (`float`, `optional`, defaults to `None`) -- The initializer range for the
+                `ValueHead` if a specific initialization strategy is selected.
+            - **v_head_init_strategy** (`str`, `optional`, defaults to `None`) -- The initialization strategy for the
+                `ValueHead`. Currently supported strategies are:
+                - **"random"** -- Initializes the weights of the `ValueHead` with a random distribution. This is the default
+                    strategy.
+                - **"normal"** -- Initializes the weights of the `ValueHead` with a normal distribution.
+        
     """
     transformers_parent_class = AutoModelForCausalLM
     lm_head_namings = ["lm_head", "embed_out"]
@@ -78,6 +90,16 @@ class AutoModelForCausalLMWithValueHead(PreTrainedModelWrapper):
     )
 
     def __init__(self, pretrained_model, **kwargs):
+        r"""
+        Initializes the model.
+
+        Args:
+            pretrained_model (`transformers.PreTrainedModel`):
+                The model to wrap. It should be a causal language model such as GPT2
+                or any model mapped inside the `AutoModelForCausalLM` class.
+            kwargs (`dict`, `optional`):
+                Additional keyword arguments, that are passed to the `ValueHead` class.
+        """
         super().__init__(pretrained_model)
         v_head_kwargs, _ = self._split_kwargs(kwargs)
 
@@ -94,6 +116,12 @@ def _init_weights(self, **kwargs):
         Users can pass a different initialization strategy by passing the `v_head_init_strategy` argument
         when calling `.from_pretrained`. Supported strategies are:
         - `normal`: initializes the weights with a normal distribution.
+
+        Args:
+            **kwargs (`dict`, `optional`):
+                Additional keyword arguments, that are passed to the `ValueHead` class. These arguments
+                can contain the `v_head_init_strategy` argument as well as the `v_head_initializer_range`
+                argument.
         """
         initializer_range = kwargs.pop("v_head_initializer_range", 0.2)
         # random init by default
@@ -149,5 +177,11 @@ def generate(self, *args, **kwargs):
         A simple wrapper around the `generate` method of the wrapped model.
         Please refer to the [`generate`](https://huggingface.co/docs/transformers/internal/generation_utils)
         method of the wrapped model for more information about the supported arguments.
+
+        Args:
+            *args (`list`, *optional*):
+                Positional arguments passed to the `generate` method of the wrapped model.
+            **kwargs (`dict`, *optional*):
+                Keyword arguments passed to the `generate` method of the wrapped model.
         """
         return self.pretrained_model.generate(*args, **kwargs)

From 1b2f59ff9bd576741bce19118d0354e5bd5da637 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 13:54:41 +0000
Subject: [PATCH 08/14] update

---
 trl/models/modeling_value_head.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/trl/models/modeling_value_head.py b/trl/models/modeling_value_head.py
index 0d9be496d5..804ecbe305 100644
--- a/trl/models/modeling_value_head.py
+++ b/trl/models/modeling_value_head.py
@@ -72,14 +72,14 @@ class AutoModelForCausalLMWithValueHead(PreTrainedModelWrapper):
             by the `ValueHead` class. Currently the supported args are:
             - **summary_dropout_prob** (`float`, `optional`, defaults to `None`) -- The dropout probability for the
                 `ValueHead` class.
-            - **v_head_initializer_range** (`float`, `optional`, defaults to `None`) -- The initializer range for the
+            - **v_head_initializer_range** (`float`, `optional`, defaults to `0.2`) -- The initializer range for the
                 `ValueHead` if a specific initialization strategy is selected.
             - **v_head_init_strategy** (`str`, `optional`, defaults to `None`) -- The initialization strategy for the
                 `ValueHead`. Currently supported strategies are:
-                - **"random"** -- Initializes the weights of the `ValueHead` with a random distribution. This is the default
+                - **`None`** -- Initializes the weights of the `ValueHead` with a random distribution. This is the default
                     strategy.
                 - **"normal"** -- Initializes the weights of the `ValueHead` with a normal distribution.
-        
+
     """
     transformers_parent_class = AutoModelForCausalLM
     lm_head_namings = ["lm_head", "embed_out"]

From 2304bfb18ee3ce17552fa6d1c2f78a29c3e1c179 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 14:21:08 +0000
Subject: [PATCH 09/14] update docs

---
 trl/trainer/ppo_trainer.py | 62 +++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 05a2837e74..00a53d5e0c 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -37,17 +37,43 @@
     whiten,
 )
 from ..models import SUPPORTED_ARCHITECTURES, PreTrainedModelWrapper, create_reference_model
-from . import AdaptiveKLController, BaseTrainer, FixedKLController
+from . import AdaptiveKLController, BaseTrainer, FixedKLController, PPOConfig
 
 
 class PPOTrainer(BaseTrainer):
     """
     The PPOTrainer uses Proximal Policy Optimization to optimise language models.
+
+    Class attributes:
+        config (`PPOConfig`):
+            Configuration object for PPOTrainer. Check the documentation of `PPOConfig` for more details.
+        model (`PreTrainedModelWrapper`):
+            Model to be optimized, Hugging Face transformer model with a value head. Check the documentation
+            of `PreTrainedModelWrapper` for more details.
+        ref_model (`PreTrainedModelWrapper`, *optional*):
+            Reference model to be used for KL penalty, Hugging Face transformer model with a casual language modelling head.
+            Check the documentation of `PreTrainedModelWrapper` for more details. If no reference model is provided, the
+            trainer will create a reference model with the same architecture as the model to be optimized with shared layers.
+        tokenizer (`Union[PreTrainedTokenizer, PreTrainedTokenizerFast]`):
+            Tokenizer to be used for encoding the data. Check the documentation of `transformers.PreTrainedTokenizer` and
+            `transformers.PreTrainedTokenizerFast` for more details.
+        dataset (Union[`torch.utils.data.Dataset`, `datasets.Dataset`], *optional*):
+            PyTorch dataset or Hugging Face dataset. This is used to create a PyTorch dataloader. If no dataset is provided,
+            the dataloader must be created outside the trainer users needs to design their own dataloader and make sure the batch
+            size that is used is the same as the one specified in the configuration object.
+        optimizer (`torch.optim.Optimizer`, *optional*):
+            Optimizer to be used for training. If no optimizer is provided, the trainer will create an Adam optimizer with
+            the learning rate specified in the configuration object.
+        data_collator (DataCollatorForLanguageModeling, *optional*):
+            Data collator to be used for training and passed along the dataloader
+        num_shared_layers (int, *optional*):
+            Number of layers to be shared between the model and the reference model, if no reference model is passed. If no number is provided, all the layers
+            will be shared.
     """
 
     def __init__(
         self,
-        config,
+        config: PPOConfig,
         model: PreTrainedModelWrapper,
         ref_model: PreTrainedModelWrapper,
         tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
@@ -167,8 +193,7 @@ def prepare_dataloader(self, dataset: Union[torch.utils.data.Dataset, Dataset],
                 Data collator function.
 
         Returns:
-            `torch.utils.data.DataLoader`:
-                PyTorch dataloader
+            `torch.utils.data.DataLoader`: PyTorch dataloader
         """
         if isinstance(dataset, Dataset):
             dataset = self._remove_unused_columns(dataset)
@@ -210,7 +235,8 @@ def _remove_unused_columns(self, dataset: "Dataset"):
 
     def generate(self, query_tensor: torch.Tensor, **generation_kwargs):
         """
-        Generate response given query.
+        Generate response given the query tensor. First unwrap the model from the accelerator and then
+        call the `generate` method of the model.
 
         Args:
             query_tensor (`torch.LongTensor`):
@@ -219,8 +245,7 @@ def generate(self, query_tensor: torch.Tensor, **generation_kwargs):
                 Keyword arguments for generation.
 
         Returns:
-            response (`torch.LongTensor`):
-                A tensor of shape (`batch_size`, `gen_len`) containing response tokens.
+            `torch.LongTensor`: A tensor of shape (`batch_size`, `gen_len`) containing response tokens.
         """
         response = self.accelerator.unwrap_model(self.model).generate(
             query_tensor.unsqueeze(dim=0), **generation_kwargs
@@ -248,8 +273,7 @@ def _step_safety_checker(
             scores (List[`torch.FloatTensor`]):
                 List of tensors containing the scores.
         Returns:
-            queries, responses, scores (List[`torch.LongTensor`], List[`torch.LongTensor`], List[`torch.FloatTensor`]):
-                The input processed data.
+            `tuple`: The processed input data (`queries`, `responses`, `scores`).
         """
         for name, tensor_list in zip(["queries", "responses", "scores"], [queries, responses, scores]):
             if not isinstance(tensor_list, list):
@@ -282,7 +306,8 @@ def step(
         scores: List[torch.FloatTensor],
     ):
         """
-        Run a PPO optimisation step.
+        Run a PPO optimisation step given the input data. The input data is first checked for validity
+        and then the forward pass is run.
 
         Args:
             queries (List[`torch.LongTensor`]):
@@ -293,8 +318,7 @@ def step(
                 List of tensors containing the scores.
 
         Returns:
-            train_stats (dict[str, Any]):
-                a summary of the training statistics
+            `dict[str, Any]`: A summary of the training statistics
         """
 
         bs = self.config.batch_size
@@ -370,8 +394,7 @@ def gather_stats(self, stats):
             a dictionary of stats to be gathered. The stats should contain torch tensors.
 
         Returns:
-            stats (dict[str, Any]):
-                a dictionary of stats with the tensors gathered.
+            `dict[str, Any]`: A dictionary of stats with the tensors gathered.
         """
         import torch.distributed as dist
 
@@ -396,13 +419,10 @@ def batched_forward_pass(self, queries: torch.Tensor, responses: torch.Tensor):
                 List of tensors containing the encoded responses, shape (`batch_size`, `response_length`)
 
         Returns:
-            all_logprobs (`torch.FloatTensor`):
-                List of tensors containing the logprobs, shape (`batch_size`, `response_length`)
-            all_ref_logprobs (`torch.FloatTensor`):
-                List of tensors containing the logprobs from the reference model, shape (`batch_size`, `response_length`)
-            all_values (`torch.FloatTensor`):
-                List of tensors containing the output from the value head, shape (`batch_size`, `response_length`)
-
+            (tuple):
+                - all_logprobs (`torch.FloatTensor`): Log probabilities of the responses, shape (`batch_size`, `response_length`)
+                - all_ref_logprobs (`torch.FloatTensor`): Log probabilities of the responses from the reference model, shape (`batch_size`, `response_length`)
+                - all_values (`torch.FloatTensor`): Values of the responses, shape (`batch_size`, `response_length`)
         """
         bs = self.config.batch_size
         fbs = self.config.forward_batch_size

From 8dd05c9a9888a53223d3ae58d5cd27119e336fbe Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 14:30:19 +0000
Subject: [PATCH 10/14] few fixes

---
 trl/models/modeling_base.py |  7 ++++++-
 trl/trainer/ppo_trainer.py  | 28 ++++++++++------------------
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/trl/models/modeling_base.py b/trl/models/modeling_base.py
index f1f84cd0a4..617e4dd06f 100644
--- a/trl/models/modeling_base.py
+++ b/trl/models/modeling_base.py
@@ -44,7 +44,12 @@ def __init__(self, pretrained_model=None, **kwargs):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         r"""
-        Instantiates a new model from a pretrained model.
+        Instantiates a new model from a pretrained model from `transformers`. The
+        pretrained model is loaded using the `from_pretrained` method of the
+        `transformers.PreTrainedModel` class. The arguments that are specific to the
+        `transformers.PreTrainedModel` class are passed along to this method and filtered
+        out from the `kwargs` argument.
+
 
         Args:
             pretrained_model_name_or_path (`str` or `transformers.PreTrainedModel`):
diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 00a53d5e0c..40beb871cc 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -44,30 +44,22 @@ class PPOTrainer(BaseTrainer):
     """
     The PPOTrainer uses Proximal Policy Optimization to optimise language models.
 
-    Class attributes:
-        config (`PPOConfig`):
-            Configuration object for PPOTrainer. Check the documentation of `PPOConfig` for more details.
-        model (`PreTrainedModelWrapper`):
-            Model to be optimized, Hugging Face transformer model with a value head. Check the documentation
-            of `PreTrainedModelWrapper` for more details.
-        ref_model (`PreTrainedModelWrapper`, *optional*):
-            Reference model to be used for KL penalty, Hugging Face transformer model with a casual language modelling head.
+    Attributes:
+        - **config** (`PPOConfig`) -- Configuration object for PPOTrainer. Check the documentation of `PPOConfig` for more details.
+        - **model** (`PreTrainedModelWrapper`) -- Model to be optimized, Hugging Face transformer model with a value head.
+            Check the documentation of `PreTrainedModelWrapper` for more details.
+        - **ref_model** (`PreTrainedModelWrapper`, *optional*) -- Reference model to be used for KL penalty, Hugging Face transformer model with a casual language modelling head.
             Check the documentation of `PreTrainedModelWrapper` for more details. If no reference model is provided, the
             trainer will create a reference model with the same architecture as the model to be optimized with shared layers.
-        tokenizer (`Union[PreTrainedTokenizer, PreTrainedTokenizerFast]`):
-            Tokenizer to be used for encoding the data. Check the documentation of `transformers.PreTrainedTokenizer` and
+        - **tokenizer** (`Union[PreTrainedTokenizer, PreTrainedTokenizerFast]`) -- Tokenizer to be used for encoding the data. Check the documentation of `transformers.PreTrainedTokenizer` and
             `transformers.PreTrainedTokenizerFast` for more details.
-        dataset (Union[`torch.utils.data.Dataset`, `datasets.Dataset`], *optional*):
-            PyTorch dataset or Hugging Face dataset. This is used to create a PyTorch dataloader. If no dataset is provided,
+        - **dataset** (Union[`torch.utils.data.Dataset`, `datasets.Dataset`], *optional*) -- PyTorch dataset or Hugging Face dataset. This is used to create a PyTorch dataloader. If no dataset is provided,
             the dataloader must be created outside the trainer users needs to design their own dataloader and make sure the batch
             size that is used is the same as the one specified in the configuration object.
-        optimizer (`torch.optim.Optimizer`, *optional*):
-            Optimizer to be used for training. If no optimizer is provided, the trainer will create an Adam optimizer with
+        - **optimizer** (`torch.optim.Optimizer`, *optional*) -- Optimizer to be used for training. If no optimizer is provided, the trainer will create an Adam optimizer with
             the learning rate specified in the configuration object.
-        data_collator (DataCollatorForLanguageModeling, *optional*):
-            Data collator to be used for training and passed along the dataloader
-        num_shared_layers (int, *optional*):
-            Number of layers to be shared between the model and the reference model, if no reference model is passed. If no number is provided, all the layers
+        - **data_collator** (DataCollatorForLanguageModeling, *optional*) -- Data collator to be used for training and passed along the dataloader
+        - **num_shared_layers** (int, *optional*) -- Number of layers to be shared between the model and the reference model, if no reference model is passed. If no number is provided, all the layers
             will be shared.
     """
 

From 85d0e55d601390e593344b9299fc0fd264fd4fa8 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Tue, 17 Jan 2023 16:31:22 +0000
Subject: [PATCH 11/14] add `PPOConfig` to the docs

---
 docs/source/trainer.mdx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/trainer.mdx b/docs/source/trainer.mdx
index 6d1555f84d..0d3ba16668 100644
--- a/docs/source/trainer.mdx
+++ b/docs/source/trainer.mdx
@@ -2,6 +2,10 @@
 
 At TRL we plan to release several RLHF algorithms, we started our journey with PPO (Proximal Policy Optimisation) with an implementation that largely follows  the structure introduced in the paper "Fine-Tuning Language Models from Human Preferences" by D. Ziegler et al. [[paper](https://arxiv.org/pdf/1909.08593.pdf), [code](https://github.com/openai/lm-human-preferences)].
 
+## PPOConfig
+
+[[autodoc]] PPOConfig
+
 ## PPOTrainer
 
 [[autodoc]] PPOTrainer

From ebfb163125db357781fd70e3df2e428c0be6b166 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Wed, 18 Jan 2023 11:32:29 +0100
Subject: [PATCH 12/14] Apply suggestions from code review

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
---
 docs/source/models.mdx     | 2 +-
 trl/trainer/ppo_trainer.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/docs/source/models.mdx b/docs/source/models.mdx
index 7b0fe54ad8..f6ee13c6d2 100644
--- a/docs/source/models.mdx
+++ b/docs/source/models.mdx
@@ -1,6 +1,6 @@
 # Models
 
-TRL supports various model architectures including most used text generative models. 
+With the `AutoModelForCausalLMWithValueHead` class, TRL supports all decoder model architectures in transformers such as GPT-2, OPT, and GPT-Neo.
 
 ## PreTrainedModelWrapper
 
diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 40beb871cc..38d15adbcd 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -227,7 +227,7 @@ def _remove_unused_columns(self, dataset: "Dataset"):
 
     def generate(self, query_tensor: torch.Tensor, **generation_kwargs):
         """
-        Generate response given the query tensor. First unwrap the model from the accelerator and then
+        Generate a response with the model given the query tensor. This will
         call the `generate` method of the model.
 
         Args:
@@ -298,8 +298,7 @@ def step(
         scores: List[torch.FloatTensor],
     ):
         """
-        Run a PPO optimisation step given the input data. The input data is first checked for validity
-        and then the forward pass is run.
+        Run a PPO optimisation step given a list of queries, model responses, and rewards.
 
         Args:
             queries (List[`torch.LongTensor`]):

From da456cf8dd9de27efb329bfe22763ef259ad7f00 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Wed, 18 Jan 2023 10:37:46 +0000
Subject: [PATCH 13/14] clearer description

---
 docs/source/trainer.mdx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/trainer.mdx b/docs/source/trainer.mdx
index 0d3ba16668..c52c098cdf 100644
--- a/docs/source/trainer.mdx
+++ b/docs/source/trainer.mdx
@@ -1,6 +1,7 @@
 # Trainer
 
-At TRL we plan to release several RLHF algorithms, we started our journey with PPO (Proximal Policy Optimisation) with an implementation that largely follows  the structure introduced in the paper "Fine-Tuning Language Models from Human Preferences" by D. Ziegler et al. [[paper](https://arxiv.org/pdf/1909.08593.pdf), [code](https://github.com/openai/lm-human-preferences)].
+At TRL we support PPO (Proximal Policy Optimisation) with an implementation that largely follows the structure introduced in the paper "Fine-Tuning Language Models from Human Preferences" by D. Ziegler et al. [[paper](https://arxiv.org/pdf/1909.08593.pdf), [code](https://github.com/openai/lm-human-preferences)].
+The Trainer and model classes are largely inspired by the `transformers.Trainer` and `transformers.AutoModel` classes and adapted for RL.
 
 ## PPOConfig
 

From ebf090e8d95feff62d3eaef937475388579b7730 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Wed, 18 Jan 2023 10:39:24 +0000
Subject: [PATCH 14/14] remove dashes

---
 trl/trainer/ppo_trainer.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 38d15adbcd..fa48465bc0 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -45,21 +45,21 @@ class PPOTrainer(BaseTrainer):
     The PPOTrainer uses Proximal Policy Optimization to optimise language models.
 
     Attributes:
-        - **config** (`PPOConfig`) -- Configuration object for PPOTrainer. Check the documentation of `PPOConfig` for more details.
-        - **model** (`PreTrainedModelWrapper`) -- Model to be optimized, Hugging Face transformer model with a value head.
+        **config** (`PPOConfig`) -- Configuration object for PPOTrainer. Check the documentation of `PPOConfig` for more details.
+        **model** (`PreTrainedModelWrapper`) -- Model to be optimized, Hugging Face transformer model with a value head.
             Check the documentation of `PreTrainedModelWrapper` for more details.
-        - **ref_model** (`PreTrainedModelWrapper`, *optional*) -- Reference model to be used for KL penalty, Hugging Face transformer model with a casual language modelling head.
+        **ref_model** (`PreTrainedModelWrapper`, *optional*) -- Reference model to be used for KL penalty, Hugging Face transformer model with a causal language modelling head.
             Check the documentation of `PreTrainedModelWrapper` for more details. If no reference model is provided, the
             trainer will create a reference model with the same architecture as the model to be optimized with shared layers.
-        - **tokenizer** (`Union[PreTrainedTokenizer, PreTrainedTokenizerFast]`) -- Tokenizer to be used for encoding the data. Check the documentation of `transformers.PreTrainedTokenizer` and
+        **tokenizer** (`Union[PreTrainedTokenizer, PreTrainedTokenizerFast]`) -- Tokenizer to be used for encoding the data. Check the documentation of `transformers.PreTrainedTokenizer` and
             `transformers.PreTrainedTokenizerFast` for more details.
-        - **dataset** (Union[`torch.utils.data.Dataset`, `datasets.Dataset`], *optional*) -- PyTorch dataset or Hugging Face dataset. This is used to create a PyTorch dataloader. If no dataset is provided,
+        **dataset** (Union[`torch.utils.data.Dataset`, `datasets.Dataset`], *optional*) -- PyTorch dataset or Hugging Face dataset. This is used to create a PyTorch dataloader. If no dataset is provided,
             the dataloader must be created outside the trainer users needs to design their own dataloader and make sure the batch
             size that is used is the same as the one specified in the configuration object.
-        - **optimizer** (`torch.optim.Optimizer`, *optional*) -- Optimizer to be used for training. If no optimizer is provided, the trainer will create an Adam optimizer with
+        **optimizer** (`torch.optim.Optimizer`, *optional*) -- Optimizer to be used for training. If no optimizer is provided, the trainer will create an Adam optimizer with
             the learning rate specified in the configuration object.
-        - **data_collator** (DataCollatorForLanguageModeling, *optional*) -- Data collator to be used for training and passed along the dataloader
-        - **num_shared_layers** (int, *optional*) -- Number of layers to be shared between the model and the reference model, if no reference model is passed. If no number is provided, all the layers
+        **data_collator** (DataCollatorForLanguageModeling, *optional*) -- Data collator to be used for training and passed along the dataloader
+        **num_shared_layers** (int, *optional*) -- Number of layers to be shared between the model and the reference model, if no reference model is passed. If no number is provided, all the layers
             will be shared.
     """