@@ -3,6 +3,7 @@
 from datetime import timedelta
 from typing import cast
 
+import pandas as pd
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -80,7 +81,7 @@ def worker(rank: int, world_size: int, cfg: IndexConfig, ds: Dataset | IterableDataset
         cfg.model,
         device_map=device_map,
         quantization_config=quantization_config,
-        torch_dtype=dtype,
+        dtype=dtype,
         revision=cfg.revision,
     )
     target_modules = None
@@ -91,7 +92,7 @@ def worker(rank: int, world_size: int, cfg: IndexConfig, ds: Dataset | IterableDataset
         peft_config.base_model_name_or_path,  # type: ignore
         device_map=device_map,
         quantization_config=quantization_config,
-        torch_dtype=dtype,
+        dtype=dtype,
         revision=cfg.revision,
     )
 
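
These two hunks track the rename of the `torch_dtype` keyword to `dtype` in `from_pretrained`, matching the deprecation of `torch_dtype` in recent transformers releases (4.56+). A minimal sketch of the new spelling; the model class and checkpoint here are illustrative, not taken from this repo:

    import torch
    from transformers import AutoModelForCausalLM

    # `dtype` replaces the deprecated `torch_dtype` keyword.
    model = AutoModelForCausalLM.from_pretrained(
        "gpt2",
        device_map="auto",
        dtype=torch.bfloat16,
    )
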
@@ -190,6 +191,20 @@ def dist_worker(rank: int, world_size: int, cfg: IndexConfig, ds: Dataset):
     dist.destroy_process_group()
 
 
+def estimate_advantage(ds: Dataset, cfg: IndexConfig):
+    """Group rollouts by prompt and estimate advantages."""
+    assert isinstance(ds, Dataset), "Dataset required for advantage estimation"
+
+    df = ds.select_columns([cfg.data.prompt_column, cfg.data.reward_column]).to_pandas()
+    df = assert_type(pd.DataFrame, df)
+
+    advantages = df[cfg.data.reward_column] - df.groupby(cfg.data.prompt_column)[
+        cfg.data.reward_column
+    ].transform("mean")
+
+    return advantages.tolist()
+
+
 def build_gradient_dataset(cfg: IndexConfig):
     # In many cases the token_batch_size may be smaller than the max length allowed by
     # the model. If cfg.data.truncation is True, we use the tokenizer to truncate
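
The new `estimate_advantage` helper centers each rollout's reward on the mean reward of all rollouts that share the same prompt, i.e. a group-mean baseline in the spirit of GRPO (without the standard-deviation normalization). A toy check of the same pandas computation, with hypothetical column names standing in for `cfg.data.prompt_column` and `cfg.data.reward_column`:

    import pandas as pd

    df = pd.DataFrame({
        "prompt": ["p1", "p1", "p2", "p2"],
        "reward": [1.0, 0.0, 2.0, 4.0],
    })
    # Subtract the per-prompt mean reward from each rollout's reward.
    adv = df["reward"] - df.groupby("prompt")["reward"].transform("mean")
    print(adv.tolist())  # [0.5, -0.5, -1.0, 1.0]
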
@@ -206,6 +221,13 @@ def build_gradient_dataset(cfg: IndexConfig):
         fn_kwargs=dict(args=cfg.data, tokenizer=tokenizer),
         remove_columns=remove_columns,
     )
+    if cfg.data.reward_column:
+        ds = ds.add_column(
+            "advantage",
+            estimate_advantage(ds, cfg),
+            new_fingerprint="advantage",  # type: ignore
+        )
+
     world_size = torch.cuda.device_count()
     if world_size <= 1:
         # Run the worker directly if no distributed training is needed. This is great
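
With `cfg.data.reward_column` set, the mapped dataset gains an `advantage` column aligned with row order. A self-contained sketch of that `add_column` step using the datasets library, with toy rows in place of the real prompts and rewards:

    from datasets import Dataset

    ds = Dataset.from_dict({"prompt": ["p1", "p1"], "reward": [1.0, 0.0]})
    advantages = [0.5, -0.5]  # what estimate_advantage would return here
    ds = ds.add_column("advantage", advantages, new_fingerprint="advantage")
    print(ds[0])  # {'prompt': 'p1', 'reward': 1.0, 'advantage': 0.5}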