[SFT] Support context parallelism for SFT #132

Merged: 68 commits, Jan 27, 2025

Commits (68)
88878e4
support skip template apply; directly read prompt as str for sft dataset
xingyaoww Jan 15, 2025
d5f0fad
add initial lora sft support
Jiayi-Pan Jan 15, 2025
66ff737
add ring_attn utils
Jiayi-Pan Jan 15, 2025
dabfdab
fix config serialization
Jiayi-Pan Jan 15, 2025
902ddbe
improve hyper-params
Jiayi-Pan Jan 15, 2025
6a43cb8
Merge commit '902ddbe6c26623f9e9e511e55abe9f8676707ff2' into dev
xingyaoww Jan 15, 2025
b478b04
minor ux tweaks
Jiayi-Pan Jan 20, 2025
c7aaed6
support label smoothing
xingyaoww Jan 22, 2025
ed7377c
init
Jiayi-Pan Jan 22, 2025
5b4a800
add sft example
Jiayi-Pan Jan 22, 2025
d123e3b
enable prompt+response filter
xingyaoww Jan 23, 2025
a486ed4
Merge commit '5b4a80000effa8b19b6e2b1e8ea8a63e6fa0274b' into dev
xingyaoww Jan 23, 2025
94ef011
add label smoothing to sp
xingyaoww Jan 23, 2025
843a4e1
add wip
xingyaoww Jan 23, 2025
596494e
Merge commit '884a7273123c3996c0970fce90b2ace55e251203' into dev
xingyaoww Jan 23, 2025
775b546
fix backward shape mismatch
xingyaoww Jan 23, 2025
37e672a
get seq parallel working!
xingyaoww Jan 23, 2025
e2cc036
clean up debug print and fix debug using the latest signature
xingyaoww Jan 23, 2025
6194d1e
rm filter prompt and response now we supported 128k SFT
xingyaoww Jan 23, 2025
eb59cc1
remove extra debug
xingyaoww Jan 23, 2025
ef38f9a
add tqdm to show training progress
xingyaoww Jan 23, 2025
17d521f
Merge upstream/main into feature/lora
openhands-agent Jan 24, 2025
4b75406
chore: remove unused ring_attn_utils.py and add peft dependency
openhands-agent Jan 24, 2025
537eb7e
fix: address PR comments - add default LoRA config, fix script name, …
openhands-agent Jan 24, 2025
8866aba
refactor: use is_lora parameter instead of checking peft_config directly
openhands-agent Jan 24, 2025
77baf5c
refactor: simplify fsdp wrap policy for LoRA using is_lora flag
openhands-agent Jan 24, 2025
31f44ad
revert: remove unnecessary LoRA changes from fsdp_workers.py
openhands-agent Jan 24, 2025
decfe61
merge: upstream/main into feature/lora
openhands-agent Jan 24, 2025
e4e7af3
Merge commit 'decfe61082a9c96116cad4aef9534aea4a8e3b00' into dev
xingyaoww Jan 24, 2025
92cd230
ci: add workflow for LoRA testing
openhands-agent Jan 24, 2025
4772756
Add peft to pyproject.toml dependencies
openhands-agent Jan 25, 2025
3f27c0d
Run formatter
openhands-agent Jan 25, 2025
0130411
fix peft script
xingyaoww Jan 25, 2025
e272ce6
Merge commit '22e93114e3122c622399554a0605a4926a77f27e' into dev
xingyaoww Jan 25, 2025
127e9ef
Merge commit '01304116170ea4e42cd0996e5eae647b81923b94' into dev
xingyaoww Jan 25, 2025
d40cd66
revert some unnecessary changes
xingyaoww Jan 25, 2025
bbfa712
revert uncessary changes
xingyaoww Jan 25, 2025
41590a7
tweak debug into assert
xingyaoww Jan 25, 2025
f9f1c37
add a bunch of timers
xingyaoww Jan 25, 2025
b3ae176
address comments
xingyaoww Jan 25, 2025
596fcb7
remove total epochs
xingyaoww Jan 25, 2025
84dfcf4
Revert "add a bunch of timers"
xingyaoww Jan 25, 2025
9bd522d
refactor: move debug functionality to tests/sft/test_trainer.py
openhands-agent Jan 25, 2025
04e6d05
Merge commit '596fcb7e42bd53309e482f717095983d4af2866b' into xw/cp-sft
xingyaoww Jan 25, 2025
d2e76d0
feat: add test script for sequence parallelism with size 2
openhands-agent Jan 25, 2025
dd293e4
chore: remove outdated comment from run_qwen_05_sp2.sh
openhands-agent Jan 25, 2025
5086eba
chore: remove unnecessary debug flag from run_qwen_05_sp2.sh
openhands-agent Jan 25, 2025
ba27b63
chore: remove sft.sh
openhands-agent Jan 25, 2025
24ad93f
Merge branch 'main' into xw/cp-sft
xingyaoww Jan 25, 2025
93e6f2d
remove unused
xingyaoww Jan 25, 2025
d5d8da6
Add license header to test_trainer.py
openhands-agent Jan 25, 2025
6934a70
Fix code formatting
openhands-agent Jan 25, 2025
68cfd0a
update scripts and default config
xingyaoww Jan 25, 2025
720ee5c
cleanup debug print
xingyaoww Jan 25, 2025
e0919e6
fix microbsz
xingyaoww Jan 25, 2025
f8c1de0
remove label smoothing
xingyaoww Jan 25, 2025
5a02740
add sequence parallism to CI
xingyaoww Jan 25, 2025
4adb707
add ci to check the loss diff between sp and default implementation
xingyaoww Jan 25, 2025
4e38c46
remove uncessary rope scaling
xingyaoww Jan 25, 2025
b08abcd
Address review comments:
openhands-agent Jan 26, 2025
79d4a3f
Merge loss computation implementations to reduce code duplication
openhands-agent Jan 26, 2025
818e5aa
Apply code formatting
openhands-agent Jan 26, 2025
d665b9c
Add missing nullcontext import
openhands-agent Jan 26, 2025
e05d273
cleanup
xingyaoww Jan 26, 2025
b38db7b
adjust order for cleaner git diff
xingyaoww Jan 26, 2025
d7d2683
make sure the test works
xingyaoww Jan 26, 2025
c5e0f1b
format script
xingyaoww Jan 27, 2025
6f90fb7
Merge branch 'main' into xw/cp-sft
xingyaoww Jan 27, 2025
10 changes: 9 additions & 1 deletion .github/workflows/e2e_sft.yml
@@ -43,4 +43,12 @@ jobs:
      - name: Running gsm8k e2e training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          bash tests/sft/run_sft.sh
          bash tests/sft/run_sft.sh
      - name: Running gsm8k e2e training tests on 8 L20 GPUs with sequence parallism
        run: |
          ray stop --force
          bash examples/sft/gsm8k/run_qwen_05_sp2.sh 8 $HOME/ckpts/
      - name: Check loss difference between sequence parallel vs. default implementation
        run: |
          ray stop --force
          bash tests/sft/run_sft_sp_loss_match.sh
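
The two new steps above exercise the sequence-parallel path end to end in CI. For reference, a sketch of reproducing them locally, assuming 8 GPUs, Ray installed, and the gsm8k parquet files prepared under $HOME/data as in the scripts below:

# Sketch: run the new CI checks by hand (GPU count and checkpoint path are assumptions).
ray stop --force
bash examples/sft/gsm8k/run_qwen_05_sp2.sh 8 $HOME/ckpts/   # SFT with ulysses_sequence_parallel_size=2

ray stop --force
bash tests/sft/run_sft_sp_loss_match.sh                     # compare SP+rmpad loss against the default forward pass
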
32 changes: 32 additions & 0 deletions examples/sft/gsm8k/run_qwen_05_sp2.sh
@@ -0,0 +1,32 @@
set -x

if [ "$#" -lt 2 ]; then
echo "Usage: run_qwen_05_sp2.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest
shift 2

torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=extra_info \
data.response_key=extra_info \
optim.lr=1e-4 \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size=4 \
model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct-sp2 \
trainer.logger=['console'] \
trainer.total_training_steps=1 \
trainer.default_hdfs_dir=null $@ \
ulysses_sequence_parallel_size=2 \
use_remove_padding=true
24 changes: 24 additions & 0 deletions tests/sft/run_sft_sp_loss_match.sh
@@ -0,0 +1,24 @@
# Tested with 2 & 4 GPUs

set -x

torchrun --standalone --nnodes=1 --nproc_per_node=8 \
    tests/sft/test_sp_loss_match.py \
    data.train_files=$HOME/data/gsm8k/train.parquet \
    data.val_files=$HOME/data/gsm8k/test.parquet \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    +data.prompt_dict_keys=['question'] \
    +data.response_dict_keys=['answer'] \
    data.micro_batch_size=32 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    ulysses_sequence_parallel_size=2 \
    use_remove_padding=True \
    trainer.default_local_dir=$HOME/ckpts/ \
    trainer.project_name=qwen2.5-sft \
    trainer.experiment_name=gsm8k-sft-gemma-2b-it \
    trainer.total_training_steps=1 \
    trainer.logger=['console'] \
    trainer.default_hdfs_dir=null $@

rm -rf $HOME/ckpts/
128 changes: 128 additions & 0 deletions tests/sft/test_sp_loss_match.py
@@ -0,0 +1,128 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed
from tensordict import TensorDict
from verl.trainer.fsdp_sft_trainer import FSDPSFTTrainer
from torch.distributed.device_mesh import init_device_mesh
from verl.utils.distributed import initialize_global_process_group


def test_trainer_forward_consistency(trainer: FSDPSFTTrainer, total_steps: int = 4):
    """Test consistency between original forward pass and SP+rmpad forward passes.

    Args:
        trainer: The FSDPSFTTrainer instance to test
        total_steps: Number of steps to test (default: 4)
    """
    if trainer.device_mesh.get_rank() == 0:
        print("\nStarting debug comparison between original and SP+rmpad forward passes...")
        print(f"Sequence parallel size: {trainer.config.ulysses_sequence_parallel_size}")
        print(f"Remove padding: {trainer.use_remove_padding}\n")

    steps_remaining = total_steps

    for epoch in range(1):  # Just one epoch for testing
        trainer.train_sampler.set_epoch(epoch=epoch)
        for data in trainer.train_dataloader:
            data = TensorDict(data, batch_size=trainer.config.data.train_batch_size).cuda()
            trainer.fsdp_model.train()
            micro_batches = data.split(trainer.config.data.micro_batch_size)

            for idx, micro_batch in enumerate(micro_batches):
                if trainer.device_mesh.get_rank() == 0:
                    print(f"\nProcessing micro batch {idx + 1}/{len(micro_batches)}")

                # Compute losses using both methods
                # Disable SP and rmpad
                trainer.use_remove_padding = False
                old_sp = trainer.config.ulysses_sequence_parallel_size
                trainer.config.ulysses_sequence_parallel_size = 1
                loss_ref = trainer._compute_loss_and_backward(micro_batch.copy(), do_backward=False)

                # Do SP and rmpad
                trainer.config.ulysses_sequence_parallel_size = old_sp
                trainer.use_remove_padding = True
                loss_sp = trainer._compute_loss_and_backward(micro_batch.copy(), do_backward=False)

                # Collect losses across all ranks
                loss_ref_all = loss_ref.clone()
                loss_sp_all = loss_sp.clone()
                torch.distributed.all_reduce(loss_ref_all, op=torch.distributed.ReduceOp.AVG)
                torch.distributed.all_reduce(loss_sp_all, op=torch.distributed.ReduceOp.AVG)

                # Calculate relative difference of averaged losses
                rel_diff = torch.abs(loss_ref_all - loss_sp_all) / (torch.abs(loss_ref_all) + 1e-8)

                if trainer.device_mesh.get_rank() == 0:
                    print("\nComparison Results (Averaged across ranks):")
                    print(f"Reference Loss: {loss_ref_all.item():.6f}")
                    print(f"SP+rmpad Loss: {loss_sp_all.item():.6f}")
                    print(f"Relative Difference: {rel_diff.item():.6f}")

                    assert rel_diff.item() < 1e-2, "Significant difference detected between averaged losses!"
                    print("Loss difference is within the acceptable range.")

                steps_remaining -= 1
                if steps_remaining == 0:
                    break
            if steps_remaining == 0:
                break
        break

    if trainer.device_mesh.get_rank() == 0:
        print("\nDebug comparison completed successfully.")


def create_trainer(config):
    """Create and initialize a trainer instance with the given config.

    Args:
        config: Configuration object with training parameters

    Returns:
        FSDPSFTTrainer: Initialized trainer instance
    """
    local_rank, rank, world_size = initialize_global_process_group()

    device_mesh = init_device_mesh(device_type='cuda', mesh_shape=(world_size,), mesh_dim_names=('fsdp',))

    dp_size = world_size // config.ulysses_sequence_parallel_size
    ulysses_device_mesh = init_device_mesh(device_type='cuda',
                                           mesh_shape=(dp_size, config.ulysses_sequence_parallel_size),
                                           mesh_dim_names=('dp', 'sp'))

    return FSDPSFTTrainer(config=config, device_mesh=device_mesh, ulysses_device_mesh=ulysses_device_mesh)


def main(config):
    """Main function to run trainer tests.

    Args:
        config: Configuration object with training parameters
    """
    trainer = create_trainer(config)
    test_trainer_forward_consistency(trainer)


if __name__ == '__main__':
    import hydra
    from omegaconf import DictConfig

    @hydra.main(config_path="../../verl/trainer/config", config_name="sft_trainer")
    def hydra_entry(cfg: DictConfig) -> None:
        main(cfg)

    hydra_entry()
5 changes: 3 additions & 2 deletions verl/trainer/config/sft_trainer.yaml
@@ -22,14 +22,15 @@ model:
  trust_remote_code: False
  lora_rank: 0 # Set to positive value to enable LoRA (e.g., 32)
  lora_alpha: 16 # LoRA scaling factor
  target_modules: [q_proj, v_proj] # Target modules for LoRA adaptation
  target_modules: all-linear # Target modules for LoRA adaptation
optim:
  lr: 1e-5
  betas: [0.9, 0.95]
  weight_decay: 0.01
  warmup_steps_ratio: 0.1
  clip_grad: 1.0

ulysses_sequence_parallel_size: 1
use_remove_padding: False
trainer:
  default_local_dir: /tmp/sft_model
  default_hdfs_dir: hdfs://tmp/experiments/gsm8k/gemma-1.1-7b-it/ # change the hdfs path here
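
Both new config keys default to the non-parallel code path. Below is a minimal command-line sketch of enabling them through Hydra overrides, modeled on the gsm8k example script earlier in this PR; the data paths, model, and GPU count are assumptions carried over from that script, and nproc_per_node must be divisible by ulysses_sequence_parallel_size since create_trainer above splits the world size into a (dp, sp) device mesh.

# Sketch only: enable sequence parallelism and remove-padding via Hydra overrides.
# Paths and model choice mirror the example script and are assumptions here.
torchrun --standalone --nnodes=1 --nproc_per_node=8 \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files=$HOME/data/gsm8k/train.parquet \
    data.val_files=$HOME/data/gsm8k/test.parquet \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    +data.prompt_dict_keys=['question'] \
    +data.response_dict_keys=['answer'] \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    trainer.default_local_dir=$HOME/ckpts/ \
    trainer.logger=['console'] \
    trainer.default_hdfs_dir=null \
    ulysses_sequence_parallel_size=2 \
    use_remove_padding=True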