From 288134085f6a788f3551b15474719e1f9c7a8ebc Mon Sep 17 00:00:00 2001 From: Ishita Bhattacharyya Date: Fri, 31 Oct 2025 01:51:22 +0530 Subject: [PATCH 1/3] Consolidate slow tests into main test files Moved slow test cases from tests/slow/ directory into their respective main test files and added missing imports (gc, BitsAndBytesConfig). --- tests/slow/test_dpo_slow.py | 213 -------------- tests/slow/test_grpo_slow.py | 554 ----------------------------------- tests/slow/test_sft_slow.py | 467 ----------------------------- tests/test_dpo_trainer.py | 200 ++++++++++++- tests/test_grpo_trainer.py | 530 ++++++++++++++++++++++++++++++++- tests/test_sft_trainer.py | 437 ++++++++++++++++++++++++++- 6 files changed, 1162 insertions(+), 1239 deletions(-) delete mode 100644 tests/slow/test_dpo_slow.py delete mode 100644 tests/slow/test_grpo_slow.py delete mode 100755 tests/slow/test_sft_slow.py diff --git a/tests/slow/test_dpo_slow.py b/tests/slow/test_dpo_slow.py deleted file mode 100644 index 03c2c60abc8..00000000000 --- a/tests/slow/test_dpo_slow.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import torch -from accelerate.utils.memory import release_memory -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig -from transformers.testing_utils import backend_empty_cache, torch_device -from transformers.utils import is_peft_available - -from trl import DPOConfig, DPOTrainer - -from ..testing_utils import TrlTestCase, require_bitsandbytes, require_peft, require_torch_accelerator -from .testing_constants import DPO_LOSS_TYPES, DPO_PRECOMPUTE_LOGITS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST - - -if is_peft_available(): - from peft import LoraConfig, PeftModel - - -@pytest.mark.slow -@require_torch_accelerator -@require_peft -class TestDPOTrainerSlow(TrlTestCase): - def setup_method(self): - self.dataset = load_dataset("trl-internal-testing/zen", "standard_preference") - self.peft_config = LoraConfig( - lora_alpha=16, - lora_dropout=0.1, - r=8, - bias="none", - task_type="CAUSAL_LM", - ) - self.max_length = 128 - - def teardown_method(self): - gc.collect() - backend_empty_cache(torch_device) - gc.collect() - - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) - def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits): - """ - A test that tests the simple usage of `DPOTrainer` using a bare model in full precision. 
- """ - model = AutoModelForCausalLM.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - training_args = DPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=2, - max_steps=2, - remove_unused_columns=False, - gradient_accumulation_steps=2, - learning_rate=9e-1, - eval_strategy="steps", - fp16=True, - logging_strategy="no", - report_to="none", - beta=0.1, - loss_type=loss_type, - precompute_ref_log_probs=pre_compute_logits, - max_length=self.max_length, - ) - - # dpo train lora model - trainer = DPOTrainer( - model=model, - ref_model=None, - args=training_args, - train_dataset=self.dataset["train"], - eval_dataset=self.dataset["test"], - processing_class=tokenizer, - ) - - # train the model - trainer.train() - - # save trained model or adapter - trainer.save_model() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) - @require_peft - def test_dpo_peft_model(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): - """ - A test that tests the simple usage of `DPOTrainer` using a peft model in full precision + different scenarios - of gradient checkpointing. - """ - model = AutoModelForCausalLM.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - training_args = DPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=2, - max_steps=2, - remove_unused_columns=False, - gradient_accumulation_steps=2, - learning_rate=9e-1, - eval_strategy="steps", - fp16=True, - logging_strategy="no", - report_to="none", - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - generate_during_eval=False, - loss_type=loss_type, - precompute_ref_log_probs=pre_compute_logits, - beta=0.1, - max_length=self.max_length, - ) - - # dpo train lora model - trainer = DPOTrainer( - model=model, - ref_model=None, - args=training_args, - train_dataset=self.dataset["train"], - eval_dataset=self.dataset["test"], - processing_class=tokenizer, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - assert trainer.ref_model is None - - # train the model - trainer.train() - - # save trained model or adapter - trainer.save_model() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) - @require_bitsandbytes - @require_peft - def test_dpo_peft_model_qlora(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): - """ - A test that tests the simple usage of `DPOTrainer` using QLoRA + different scenarios of gradient checkpointing. 
- """ - quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) - - model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - training_args = DPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=2, - max_steps=2, - remove_unused_columns=False, - gradient_accumulation_steps=2, - learning_rate=9e-1, - eval_strategy="steps", - fp16=True, - logging_strategy="no", - report_to="none", - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - beta=0.1, - generate_during_eval=False, - loss_type=loss_type, - precompute_ref_log_probs=pre_compute_logits, - max_length=self.max_length, - ) - - # dpo train lora model - trainer = DPOTrainer( - model=model, - ref_model=None, - args=training_args, - train_dataset=self.dataset["train"], - eval_dataset=self.dataset["test"], - processing_class=tokenizer, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - assert trainer.ref_model is None - - # train the model - trainer.train() - - # save trained model or adapter - trainer.save_model() - - release_memory(model, trainer) diff --git a/tests/slow/test_grpo_slow.py b/tests/slow/test_grpo_slow.py deleted file mode 100644 index bf63984d645..00000000000 --- a/tests/slow/test_grpo_slow.py +++ /dev/null @@ -1,554 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import os -import warnings - -import numpy as np -import pytest -import torch -import transformers -from accelerate.utils.memory import release_memory -from datasets import Dataset, Features, Image, Value, load_dataset -from packaging.version import Version -from transformers import ( - AutoModelForCausalLM, - AutoModelForImageTextToText, - AutoProcessor, - AutoTokenizer, - BitsAndBytesConfig, -) -from transformers.testing_utils import backend_empty_cache, torch_device -from transformers.utils import is_peft_available - -from trl import GRPOConfig, GRPOTrainer -from trl.trainer.utils import get_kbit_device_map - -from ..testing_utils import ( - TrlTestCase, - require_bitsandbytes, - require_flash_attn, - require_liger_kernel, - require_peft, - require_torch_accelerator, - require_vllm, -) -from .testing_constants import MODELS_TO_TEST - - -if is_peft_available(): - from peft import LoraConfig, PeftModel - - -@pytest.mark.slow -@require_torch_accelerator -class TestGRPOTrainerSlow(TrlTestCase): - def setup_method(self): - self.train_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") - self.eval_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="test") - self.max_length = 128 - - def teardown_method(self): - gc.collect() - backend_empty_cache(torch_device) - gc.collect() - - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_liger_kernel - def test_training_with_liger_grpo_loss(self, model_name): - training_args = GRPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=3, - num_generations=3, - use_liger_loss=True, - max_completion_length=self.max_length, - report_to="none", - logging_strategy="no", - loss_type="bnpo", # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620 - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - trainer = GRPOTrainer( - model=model, - reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - processing_class=tokenizer, - ) - from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss - - assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss) - - previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()} - - trainer.train() - - for n, param in previous_trainable_params.items(): - new_param = model.get_parameter(n) - assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
- - release_memory(model, trainer) - - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_liger_kernel - @require_peft - def test_training_with_liger_grpo_loss_and_peft(self, model_name): - from peft import LoraConfig, TaskType - - training_args = GRPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=3, - num_generations=3, - use_liger_loss=True, - max_completion_length=self.max_length, - report_to="none", - logging_strategy="no", - loss_type="bnpo", # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620 - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - # Configure PEFT with LoRA - peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - inference_mode=False, - r=8, - lora_alpha=32, - lora_dropout=0.1, - target_modules=["q_proj", "v_proj"], - ) - - trainer = GRPOTrainer( - model=model, - reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - processing_class=tokenizer, - peft_config=peft_config, - ) - from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss - - assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss) - - # Verify PEFT adapter is properly initialized - from peft import PeftModel - - assert isinstance(trainer.model, PeftModel), "Model should be wrapped with PEFT" - - # Store adapter weights before training - previous_trainable_params = { - n: param.clone() for n, param in trainer.model.named_parameters() if param.requires_grad - } - assert len(previous_trainable_params) > 0, "No trainable parameters found in PEFT model" - - trainer.train() - - # Verify adapter weights have changed after training - for n, param in previous_trainable_params.items(): - new_param = trainer.model.get_parameter(n) - assert not torch.equal(param, new_param), f"Parameter {n} has not changed." - - release_memory(model, trainer) - - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_training_with_transformers_paged(self, model_name): - """Test that training works with transformers paged implementation (requires GPU).""" - if Version(transformers.__version__) < Version("4.57.0"): - pytest.xfail("Upstream bug in transformers (GH#40692). 
Fix merged; awaiting release >= 4.57.0") - training_args = GRPOConfig( - output_dir=self.tmp_dir, - learning_rate=0.1, # increase the learning rate to speed up the test - per_device_train_batch_size=3, # reduce the batch size to reduce memory usage - num_generations=3, # reduce the number of generations to reduce memory usage - max_completion_length=8, # reduce the completion length to reduce memory usage - use_transformers_paged=True, # Enable transformers paged implementation - report_to="none", - logging_strategy="no", - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - - trainer = GRPOTrainer( - model=model, - reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", - args=training_args, - train_dataset=self.train_dataset, - ) - - previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()} - - trainer.train() - - assert trainer.state.log_history[-1]["train_loss"] is not None - - # Check that the params have changed - for n, param in previous_trainable_params.items(): - new_param = model.get_parameter(n) - assert not torch.equal(param, new_param), f"Parameter {n} has not changed." - - release_memory(model, trainer) - - @pytest.mark.parametrize( - "model_name", - [ - "HuggingFaceTB/SmolVLM-Instruct", # Only test the smaller model to avoid OOM - ], - ) - @require_flash_attn - @require_bitsandbytes - @require_peft - def test_vlm_training(self, model_name): - """ - Test VLM training with aggressive memory optimization. - - This test uses multiple memory reduction techniques: - - 4-bit quantization with double quantization - - LoRA with very low rank (r=4) - - Minimal batch size (1) with gradient accumulation - - Small images (64x64 instead of 224x224) - - Short sequences (max_completion_length=8) - - Only 4 training samples - - Only 1 training step - - Gradient checkpointing and bfloat16 - """ - - # Create processor once outside the data generator - processor = AutoProcessor.from_pretrained(model_name, use_fast=True, padding_side="left") - conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is in the image?"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - - def data_gen(num_samples): - for _ in range(num_samples): - yield { - "prompt": prompt, - "image": np.random.uniform(low=0.0, high=255.0, size=(64, 64, 3)).astype( - np.uint8 - ), # Much smaller images - } - - dataset = Dataset.from_generator( - data_gen, gen_kwargs={"num_samples": 4}, features=Features(image=Image(), prompt=Value(dtype="string")) - ) - # reduce memory requirements as much as possible - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype="bfloat16", - bnb_4bit_quant_type="nf4", - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_storage="bfloat16", - ) - model = AutoModelForImageTextToText.from_pretrained( - model_name, - attn_implementation="flash_attention_2", - dtype="bfloat16", - device_map=get_kbit_device_map(), - quantization_config=quantization_config, - ) - - def reward_func(prompts, completions, **kwargs): - # simple nonsensical reward - return [-((len(c) - 25) ** 2) + 100 for c in completions] - - training_args = GRPOConfig( - output_dir=self.tmp_dir, - learning_rate=0.1, - per_device_train_batch_size=1, # Minimal batch size - gradient_accumulation_steps=2, # Maintain effective batch size - num_generations=2, - max_completion_length=8, # Much shorter completions - max_prompt_length=None, # Don't limit prompt 
length for VLM - bf16=True, # Use bfloat16 precision - max_steps=1, # Only do 1 training step to save time and memory - report_to="none", - logging_strategy="no", - ) - lora_config = LoraConfig( - task_type="CAUSAL_LM", - r=4, # Much lower rank for minimal memory - lora_alpha=8, # Reduced alpha proportionally - lora_dropout=0.1, - target_modules=["q_proj", "v_proj"], # Minimal target modules - # For VLM models, we typically want to freeze the vision encoder - # and only adapt the language model parameters - modules_to_save=None, - ) - - try: - trainer = GRPOTrainer( - model=model, - processing_class=processor, - reward_funcs=[reward_func], - args=training_args, - train_dataset=dataset, - peft_config=lora_config, - ) - - assert isinstance(trainer.model, PeftModel) - - previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} - - trainer.train() - - assert trainer.state.log_history[-1]["train_loss"] is not None - - # Check that LoRA parameters have changed - # For VLM models, we're more permissive about which parameters can change - lora_params_changed = False - for n, param in previous_trainable_params.items(): - new_param = trainer.model.get_parameter(n) - if "lora" in n.lower(): # LoRA parameters should change - if not torch.equal(param, new_param): - lora_params_changed = True - - # At least some LoRA parameters should have changed during training - assert lora_params_changed, "No LoRA parameters were updated during training." - - except torch.OutOfMemoryError as e: - pytest.skip(f"Skipping VLM training test due to insufficient GPU memory: {e}") - except Exception as e: - # Check for other memory-related errors - if any(keyword in str(e).lower() for keyword in ["memory", "cuda", "out of memory", "insufficient"]): - pytest.skip(f"Skipping VLM training test due to hardware constraints: {e}") - else: - raise - - release_memory(model, trainer) - - @require_vllm - @require_bitsandbytes - @require_peft - def test_vlm_processor_vllm_colocate_mode(self): - """ - Test that VLM processors work with vLLM in colocate mode. 
- - This test uses multiple memory optimization techniques to ensure it runs on limited hardware: - - LoRA (Low-Rank Adaptation) with minimal rank (r=4) - - 4-bit quantization with BitsAndBytesConfig - - Gradient checkpointing - - bfloat16 precision - - Minimal batch sizes and sequence lengths - - Very low GPU memory utilization (5%) - """ - dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") - - config = GRPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=1, # Minimal batch size - gradient_accumulation_steps=2, # Make effective batch size 2, divisible by num_generations - num_generations=2, - max_completion_length=4, # Very short completions to reduce memory - max_prompt_length=32, # Very short prompts to reduce memory - use_vllm=True, # Enable vLLM - vllm_mode="colocate", # Use colocate mode to avoid server dependency - vllm_gpu_memory_utilization=0.05, # Use minimal GPU memory (5%) - gradient_checkpointing=True, # Enable gradient checkpointing to save memory - bf16=True, # Use bfloat16 to reduce memory - report_to="none", - logging_strategy="no", - ) - - # Create a VLM processor - processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct", use_fast=True, padding_side="left") - - # Verify processor has both required attributes for VLM detection - assert hasattr(processor, "tokenizer") - assert hasattr(processor, "image_processor") - - def dummy_reward_func(completions, **kwargs): - return [1.0] * len(completions) - - # Use LoRA configuration for memory efficiency - lora_config = LoraConfig( - r=4, # Very low rank for minimal memory - lora_alpha=8, - target_modules=["q_proj", "v_proj"], # Minimal target modules - lora_dropout=0.1, - bias="none", - task_type="CAUSAL_LM", - ) - - # Use 4-bit quantization for further memory reduction - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_quant_type="nf4", - bnb_4bit_use_double_quant=True, - ) - - original_env = {} - required_env_vars = { - "RANK": "0", - "LOCAL_RANK": "0", - "WORLD_SIZE": "1", - "LOCAL_WORLD_SIZE": "1", - "MASTER_ADDR": "localhost", - "MASTER_PORT": "12355", - } - - for key, value in required_env_vars.items(): - original_env[key] = os.environ.get(key) - os.environ[key] = value - - try: - # Test VLM processor with vLLM colocate mode - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - try: - # Load model with quantization for memory efficiency - model = AutoModelForCausalLM.from_pretrained( - "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", - quantization_config=quantization_config, - dtype=torch.bfloat16, - ) - - trainer = GRPOTrainer( - model=model, - reward_funcs=dummy_reward_func, - args=config, - train_dataset=dataset, - processing_class=processor, # VLM processor - peft_config=lora_config, # Use LoRA for memory efficiency - ) - - # Should detect VLM processor correctly and allow vLLM - assert trainer.use_vllm, "vLLM should be enabled for VLM processors in colocate mode" - assert trainer.vllm_mode == "colocate", "Should use colocate mode" - - # Check if signature columns were set properly - if trainer._signature_columns is not None: - # Should include 'image' in signature columns for VLM processors - assert "image" in trainer._signature_columns, ( - "Should include 'image' in signature columns for VLM" - ) - - # Should not emit any warnings about VLM incompatibility - incompatibility_warnings = [ - str(w_item.message) - for w_item in w - if "does not 
support VLMs" in str(w_item.message) - or "not compatible" in str(w_item.message).lower() - ] - assert len(incompatibility_warnings) == 0, ( - f"Should not emit VLM incompatibility warnings, but got: {incompatibility_warnings}" - ) - - # Test passes if we get this far without exceptions - - except Exception as e: - # If vLLM fails to initialize due to hardware constraints or other issues, that's expected - if any( - keyword in str(e).lower() - for keyword in [ - "outofmemoryerror", - "cuda", - "memory", - "insufficient", - "no such device", - "free memory", - "gpu memory utilization", - "decrease gpu memory", - ] - ): - pytest.skip(f"Skipping vLLM colocate test due to hardware constraints: {e}") - elif "KeyError" in str(e) and "RANK" in str(e): - pytest.skip(f"Skipping vLLM colocate test due to environment setup issues: {e}") - elif "ValueError" in str(e) and "memory" in str(e).lower(): - pytest.skip(f"Skipping vLLM colocate test due to memory constraints: {e}") - else: - raise - finally: - # Restore original environment variables - for key, original_value in original_env.items(): - if original_value is None: - os.environ.pop(key, None) - else: - os.environ[key] = original_value - - release_memory(model, trainer) - - @require_vllm - def test_training_vllm(self): - """Test that training works with vLLM for generation.""" - dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") - - training_args = GRPOConfig( - output_dir=self.tmp_dir, - learning_rate=0.1, # increase the learning rate to speed up the test - per_device_train_batch_size=3, # reduce the batch size to reduce memory usage - num_generations=3, # reduce the number of generations to reduce memory usage - max_completion_length=8, # reduce the completion length to reduce memory usage - report_to="none", - logging_strategy="no", - use_vllm=True, - ) - - try: - trainer = GRPOTrainer( - model="Qwen/Qwen2.5-0.5B-Instruct", # tiny models are too small for vLLM - reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", - args=training_args, - train_dataset=dataset, - ) - - previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} - - trainer.train() - - assert trainer.state.log_history[-1]["train_loss"] is not None - - # Check that the params have changed - for n, param in previous_trainable_params.items(): - new_param = trainer.model.get_parameter(n) - assert not torch.equal(param, new_param), f"Parameter {n} has not changed." - - except Exception as e: - # If vLLM fails to initialize due to hardware constraints or other issues, that's expected - if any( - keyword in str(e).lower() - for keyword in [ - "outofmemoryerror", - "cuda", - "memory", - "insufficient", - "no such device", - "free memory", - "gpu memory utilization", - "decrease gpu memory", - ] - ): - pytest.skip(f"Skipping vLLM training test due to hardware constraints: {e}") - elif "KeyError" in str(e) and "RANK" in str(e): - pytest.skip(f"Skipping vLLM training test due to environment setup issues: {e}") - elif "ValueError" in str(e) and "memory" in str(e).lower(): - pytest.skip(f"Skipping vLLM training test due to memory constraints: {e}") - else: - raise - - release_memory(trainer.model, trainer) diff --git a/tests/slow/test_sft_slow.py b/tests/slow/test_sft_slow.py deleted file mode 100755 index dddf124e947..00000000000 --- a/tests/slow/test_sft_slow.py +++ /dev/null @@ -1,467 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import torch -from accelerate.utils.memory import release_memory -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig -from transformers.testing_utils import backend_empty_cache, torch_device -from transformers.utils import is_peft_available - -from trl import SFTConfig, SFTTrainer - -from ..testing_utils import ( - TrlTestCase, - require_bitsandbytes, - require_liger_kernel, - require_peft, - require_torch_accelerator, - require_torch_multi_accelerator, -) -from .testing_constants import DEVICE_MAP_OPTIONS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST, PACKING_OPTIONS - - -if is_peft_available(): - from peft import LoraConfig, PeftModel - - -@pytest.mark.slow -@require_torch_accelerator -@require_peft -class TestSFTTrainerSlow(TrlTestCase): - def setup_method(self): - self.train_dataset = load_dataset("stanfordnlp/imdb", split="train[:10%]") - self.eval_dataset = load_dataset("stanfordnlp/imdb", split="test[:10%]") - self.max_length = 128 - self.peft_config = LoraConfig( - lora_alpha=16, - lora_dropout=0.1, - r=8, - bias="none", - task_type="CAUSAL_LM", - ) - - def teardown_method(self): - gc.collect() - backend_empty_cache(torch_device) - gc.collect() - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_sft_trainer_str(self, model_name, packing): - """ - Simply tests if passing a simple str to `SFTTrainer` loads and runs the trainer as expected. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - ) - - trainer = SFTTrainer( - model_name, - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_sft_trainer_transformers(self, model_name, packing): - """ - Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected. 
- """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_peft - def test_sft_trainer_peft(self, model_name, packing): - """ - Simply tests if passing a transformers model + peft config to `SFTTrainer` loads and runs the trainer as - expected. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - fp16=True, - packing=packing, - max_length=self.max_length, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_sft_trainer_transformers_mp(self, model_name, packing): - """ - Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed - precision. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - fp16=True, # this is sufficient to enable amp - packing=packing, - max_length=self.max_length, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_checkpointing_kwargs): - """ - Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed - precision + different scenarios of gradient_checkpointing. 
- """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - fp16=True, # this is sufficient to enable amp - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_peft - def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient_checkpointing_kwargs): - """ - Simply tests if passing a transformers model + PEFT to `SFTTrainer` loads and runs the trainer as expected in - mixed precision + different scenarios of gradient_checkpointing. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - fp16=True, # this is sufficient to enable amp - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("device_map", DEVICE_MAP_OPTIONS) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_torch_multi_accelerator - def test_sft_trainer_transformers_mp_gc_device_map( - self, model_name, packing, gradient_checkpointing_kwargs, device_map - ): - """ - Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed - precision + different scenarios of gradient_checkpointing (single, multi-gpu, etc). 
- """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - fp16=True, # this is sufficient to enable amp - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_peft - @require_bitsandbytes - def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gradient_checkpointing_kwargs): - """ - Simply tests if passing a transformers model + PEFT + bnb to `SFTTrainer` loads and runs the trainer as - expected in mixed precision + different scenarios of gradient_checkpointing. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - fp16=True, # this is sufficient to enable amp - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - ) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) - - model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_peft - @require_bitsandbytes - def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): - """ - Simply tests if using setup_chat_format with a transformers model + peft + bnb config to `SFTTrainer` loads and - runs the trainer as expected. 
- """ - train_dataset = load_dataset("trl-internal-testing/dolly-chatml-sft", split="train") - - training_args = SFTConfig( - packing=packing, - max_length=self.max_length, - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - fp16=True, - ) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) - - model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=train_dataset, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_liger_kernel - def test_sft_trainer_with_liger(self, model_name, packing): - """ - Tests if passing use_liger=True to SFTConfig loads and runs the trainer with AutoLigerKernelForCausalLM as - expected. - """ - import importlib - - def cleanup_liger_patches(trainer): - """Clean up liger_kernel patches by reloading the model's specific module""" - try: - # Get the specific module that was used by the trainer's model - module_path = trainer.model.__module__ - reload_module = importlib.import_module(module_path) - importlib.reload(reload_module) - except Exception: - pass # Continue if reload fails - - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=2, - packing=packing, - max_length=self.max_length, - use_liger_kernel=True, - ) - - trainer = SFTTrainer( - model_name, - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - # Ensure cleanup of liger patches after the test - try: - trainer.train() - release_memory(trainer.model, trainer) - finally: - cleanup_liger_patches(trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_torch_accelerator - def test_train_offloading(self, model_name, packing): - """Test that activation offloading works with SFTTrainer.""" - # Initialize the trainer - training_args = SFTConfig( - output_dir=self.tmp_dir, - activation_offloading=True, - report_to="none", - per_device_train_batch_size=2, - max_steps=2, - packing=packing, - max_length=self.max_length, - ) - trainer = SFTTrainer( - model=model_name, args=training_args, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset - ) - - # Save the initial parameters to compare them later - previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} - - # Train the model - trainer.train() - - # Check that the training loss is not None - assert trainer.state.log_history[-1]["train_loss"] is not None - - # Check the params have changed - for n, param in previous_trainable_params.items(): - new_param = trainer.model.get_parameter(n) - assert not torch.allclose(param, new_param), f"Parameter {n} has not changed" - - release_memory(trainer.model, trainer) diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py index fa7038167d4..212ba644591 100644 --- a/tests/test_dpo_trainer.py +++ b/tests/test_dpo_trainer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under 
the License. +import gc import re import sys from unittest.mock import MagicMock @@ -19,6 +20,7 @@ import numpy as np import pytest import torch +from accelerate.utils.memory import release_memory from datasets import Dataset, features, load_dataset from transformers import ( AutoModelForCausalLM, @@ -26,19 +28,28 @@ AutoModelForSeq2SeqLM, AutoProcessor, AutoTokenizer, + BitsAndBytesConfig, PreTrainedTokenizerBase, is_vision_available, ) -from transformers.testing_utils import get_device_properties +from transformers.testing_utils import backend_empty_cache, get_device_properties, torch_device +from transformers.utils import is_peft_available from trl import DPOConfig, DPOTrainer, FDivergenceType +from .slow.testing_constants import ( + DPO_LOSS_TYPES, + DPO_PRECOMPUTE_LOGITS, + GRADIENT_CHECKPOINTING_KWARGS, + MODELS_TO_TEST, +) from .testing_utils import ( TrlTestCase, require_bitsandbytes, require_liger_kernel, require_no_wandb, require_peft, + require_torch_accelerator, require_torch_gpu_if_bnb_not_multi_backend_enabled, require_vision, ) @@ -47,6 +58,9 @@ if is_vision_available(): from PIL import Image +if is_peft_available(): + from peft import LoraConfig, PeftModel + class TestTokenizeRow(TrlTestCase): def setup_method(self): @@ -1526,3 +1540,187 @@ def test_f_divergence_type(self, f_divergence_type, as_string: bool): # Serialization: TrainingArguments.to_dict should yield the enum's string value configparser_dict = training_args.to_dict() assert configparser_dict["f_divergence_type"] == f_divergence_type.value + + +# Slow tests moved from tests/slow/test_dpo_slow.py + + +@pytest.mark.slow +@require_torch_accelerator +@require_peft +class TestDPOTrainerSlow(TrlTestCase): + def setup_method(self): + self.dataset = load_dataset("trl-internal-testing/zen", "standard_preference") + self.peft_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=8, + bias="none", + task_type="CAUSAL_LM", + ) + self.max_length = 128 + + def teardown_method(self): + gc.collect() + backend_empty_cache(torch_device) + gc.collect() + + @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) + @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits): + """ + A test that tests the simple usage of `DPOTrainer` using a bare model in full precision. 
+ """ + model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + training_args = DPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=2, + max_steps=2, + remove_unused_columns=False, + gradient_accumulation_steps=2, + learning_rate=9e-1, + eval_strategy="steps", + fp16=True, + logging_strategy="no", + report_to="none", + beta=0.1, + loss_type=loss_type, + precompute_ref_log_probs=pre_compute_logits, + max_length=self.max_length, + ) + + # dpo train lora model + trainer = DPOTrainer( + model=model, + ref_model=None, + args=training_args, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["test"], + processing_class=tokenizer, + ) + + # train the model + trainer.train() + + # save trained model or adapter + trainer.save_model() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) + @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @require_peft + def test_dpo_peft_model(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): + """ + A test that tests the simple usage of `DPOTrainer` using a peft model in full precision + different scenarios + of gradient checkpointing. + """ + model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + training_args = DPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=2, + max_steps=2, + remove_unused_columns=False, + gradient_accumulation_steps=2, + learning_rate=9e-1, + eval_strategy="steps", + fp16=True, + logging_strategy="no", + report_to="none", + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + generate_during_eval=False, + loss_type=loss_type, + precompute_ref_log_probs=pre_compute_logits, + beta=0.1, + max_length=self.max_length, + ) + + # dpo train lora model + trainer = DPOTrainer( + model=model, + ref_model=None, + args=training_args, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["test"], + processing_class=tokenizer, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + assert trainer.ref_model is None + + # train the model + trainer.train() + + # save trained model or adapter + trainer.save_model() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) + @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @require_bitsandbytes + @require_peft + def test_dpo_peft_model_qlora(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): + """ + A test that tests the simple usage of `DPOTrainer` using QLoRA + different scenarios of gradient checkpointing. 
+ """ + quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) + + model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + training_args = DPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=2, + max_steps=2, + remove_unused_columns=False, + gradient_accumulation_steps=2, + learning_rate=9e-1, + eval_strategy="steps", + fp16=True, + logging_strategy="no", + report_to="none", + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + beta=0.1, + generate_during_eval=False, + loss_type=loss_type, + precompute_ref_log_probs=pre_compute_logits, + max_length=self.max_length, + ) + + # dpo train lora model + trainer = DPOTrainer( + model=model, + ref_model=None, + args=training_args, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["test"], + processing_class=tokenizer, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + assert trainer.ref_model is None + + # train the model + trainer.train() + + # save trained model or adapter + trainer.save_model() + + release_memory(model, trainer) diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index caf637b69b3..0cb24d60a50 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -12,23 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc +import os +import warnings from unittest.mock import patch +import numpy as np import pytest import torch -from datasets import load_dataset +import transformers +from accelerate.utils.memory import release_memory +from datasets import Dataset, Features, Image, Value, load_dataset +from packaging.version import Version from transformers import ( AutoModelForCausalLM, AutoModelForImageTextToText, AutoModelForSequenceClassification, + AutoProcessor, AutoTokenizer, + BitsAndBytesConfig, ) +from transformers.testing_utils import backend_empty_cache, torch_device from transformers.utils import is_peft_available from trl import GRPOConfig, GRPOTrainer from trl.experimental.gspo_token import GRPOTrainer as GSPOTokenTrainer - -from .testing_utils import TrlTestCase, require_liger_kernel, require_peft, require_vision, require_vllm +from trl.trainer.utils import get_kbit_device_map + +from .slow.testing_constants import MODELS_TO_TEST +from .testing_utils import ( + TrlTestCase, + require_bitsandbytes, + require_flash_attn, + require_liger_kernel, + require_peft, + require_torch_accelerator, + require_vision, + require_vllm, +) if is_peft_available(): @@ -1750,3 +1771,506 @@ def test_training(self): for n, param in previous_trainable_params.items(): new_param = trainer.model.get_parameter(n) assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
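The slow-test classes consolidated below all rely on the same accelerator memory-hygiene pattern, which is why this patch adds the gc, release_memory, backend_empty_cache, and torch_device imports to the main test files. For reference, a minimal sketch of that pattern, distilled from the moved tests, follows; the class and test names are illustrative only and are not part of the patch:

import gc

import pytest
from accelerate.utils.memory import release_memory
from transformers import AutoModelForCausalLM
from transformers.testing_utils import backend_empty_cache, torch_device


@pytest.mark.slow
class TestAcceleratorMemoryHygiene:
    def teardown_method(self):
        # Reclaim accelerator memory between parametrized slow-test runs so they
        # can share a single device without accumulating allocations.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_pattern(self):
        model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        # ... build a trainer around the model and call trainer.train() here ...
        # Drop strong references and empty the device cache once the test finishes.
        release_memory(model)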
+ + +# Slow tests moved from tests/slow/test_grpo_slow.py +@pytest.mark.slow +@require_torch_accelerator +class TestGRPOTrainerSlow(TrlTestCase): + def setup_method(self): + self.train_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + self.eval_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="test") + self.max_length = 128 + + def teardown_method(self): + gc.collect() + backend_empty_cache(torch_device) + gc.collect() + + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_liger_kernel + def test_training_with_liger_grpo_loss(self, model_name): + training_args = GRPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=3, + num_generations=3, + use_liger_loss=True, + max_completion_length=self.max_length, + report_to="none", + logging_strategy="no", + loss_type="bnpo", # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620 + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + trainer = GRPOTrainer( + model=model, + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + processing_class=tokenizer, + ) + from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss + + assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss) + + previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()} + + trainer.train() + + for n, param in previous_trainable_params.items(): + new_param = model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
+ + release_memory(model, trainer) + + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_liger_kernel + @require_peft + def test_training_with_liger_grpo_loss_and_peft(self, model_name): + from peft import LoraConfig, TaskType + + training_args = GRPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=3, + num_generations=3, + use_liger_loss=True, + max_completion_length=self.max_length, + report_to="none", + logging_strategy="no", + loss_type="bnpo", # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620 + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + # Configure PEFT with LoRA + peft_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=False, + r=8, + lora_alpha=32, + lora_dropout=0.1, + target_modules=["q_proj", "v_proj"], + ) + + trainer = GRPOTrainer( + model=model, + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + processing_class=tokenizer, + peft_config=peft_config, + ) + from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss + + assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss) + + # Verify PEFT adapter is properly initialized + from peft import PeftModel + + assert isinstance(trainer.model, PeftModel), "Model should be wrapped with PEFT" + + # Store adapter weights before training + previous_trainable_params = { + n: param.clone() for n, param in trainer.model.named_parameters() if param.requires_grad + } + assert len(previous_trainable_params) > 0, "No trainable parameters found in PEFT model" + + trainer.train() + + # Verify adapter weights have changed after training + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + + release_memory(model, trainer) + + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_training_with_transformers_paged(self, model_name): + """Test that training works with transformers paged implementation (requires GPU).""" + if Version(transformers.__version__) < Version("4.57.0"): + pytest.xfail("Upstream bug in transformers (GH#40692). 
Fix merged; awaiting release >= 4.57.0") + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, # increase the learning rate to speed up the test + per_device_train_batch_size=3, # reduce the batch size to reduce memory usage + num_generations=3, # reduce the number of generations to reduce memory usage + max_completion_length=8, # reduce the completion length to reduce memory usage + use_transformers_paged=True, # Enable transformers paged implementation + report_to="none", + logging_strategy="no", + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + + trainer = GRPOTrainer( + model=model, + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=self.train_dataset, + ) + + previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + + # Check that the params have changed + for n, param in previous_trainable_params.items(): + new_param = model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + + release_memory(model, trainer) + + @pytest.mark.parametrize( + "model_name", + [ + "HuggingFaceTB/SmolVLM-Instruct", # Only test the smaller model to avoid OOM + ], + ) + @require_flash_attn + @require_bitsandbytes + @require_peft + def test_vlm_training(self, model_name): + """ + Test VLM training with aggressive memory optimization. + + This test uses multiple memory reduction techniques: + - 4-bit quantization with double quantization + - LoRA with very low rank (r=4) + - Minimal batch size (1) with gradient accumulation + - Small images (64x64 instead of 224x224) + - Short sequences (max_completion_length=8) + - Only 4 training samples + - Only 1 training step + - Gradient checkpointing and bfloat16 + """ + + # Create processor once outside the data generator + processor = AutoProcessor.from_pretrained(model_name, use_fast=True, padding_side="left") + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is in the image?"}, + ], + }, + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + def data_gen(num_samples): + for _ in range(num_samples): + yield { + "prompt": prompt, + "image": np.random.uniform(low=0.0, high=255.0, size=(64, 64, 3)).astype( + np.uint8 + ), # Much smaller images + } + + dataset = Dataset.from_generator( + data_gen, gen_kwargs={"num_samples": 4}, features=Features(image=Image(), prompt=Value(dtype="string")) + ) + # reduce memory requirements as much as possible + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype="bfloat16", + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_storage="bfloat16", + ) + model = AutoModelForImageTextToText.from_pretrained( + model_name, + attn_implementation="flash_attention_2", + dtype="bfloat16", + device_map=get_kbit_device_map(), + quantization_config=quantization_config, + ) + + def reward_func(prompts, completions, **kwargs): + # simple nonsensical reward + return [-((len(c) - 25) ** 2) + 100 for c in completions] + + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, + per_device_train_batch_size=1, # Minimal batch size + gradient_accumulation_steps=2, # Maintain effective batch size + num_generations=2, + max_completion_length=8, # Much shorter completions + max_prompt_length=None, # Don't limit prompt 
length for VLM + bf16=True, # Use bfloat16 precision + max_steps=1, # Only do 1 training step to save time and memory + report_to="none", + logging_strategy="no", + ) + lora_config = LoraConfig( + task_type="CAUSAL_LM", + r=4, # Much lower rank for minimal memory + lora_alpha=8, # Reduced alpha proportionally + lora_dropout=0.1, + target_modules=["q_proj", "v_proj"], # Minimal target modules + # For VLM models, we typically want to freeze the vision encoder + # and only adapt the language model parameters + modules_to_save=None, + ) + + try: + trainer = GRPOTrainer( + model=model, + processing_class=processor, + reward_funcs=[reward_func], + args=training_args, + train_dataset=dataset, + peft_config=lora_config, + ) + + assert isinstance(trainer.model, PeftModel) + + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + + # Check that LoRA parameters have changed + # For VLM models, we're more permissive about which parameters can change + lora_params_changed = False + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + if "lora" in n.lower(): # LoRA parameters should change + if not torch.equal(param, new_param): + lora_params_changed = True + + # At least some LoRA parameters should have changed during training + assert lora_params_changed, "No LoRA parameters were updated during training." + + except torch.OutOfMemoryError as e: + pytest.skip(f"Skipping VLM training test due to insufficient GPU memory: {e}") + except Exception as e: + # Check for other memory-related errors + if any(keyword in str(e).lower() for keyword in ["memory", "cuda", "out of memory", "insufficient"]): + pytest.skip(f"Skipping VLM training test due to hardware constraints: {e}") + else: + raise + + release_memory(model, trainer) + + @require_vllm + @require_bitsandbytes + @require_peft + def test_vlm_processor_vllm_colocate_mode(self): + """ + Test that VLM processors work with vLLM in colocate mode. 
+ + This test uses multiple memory optimization techniques to ensure it runs on limited hardware: + - LoRA (Low-Rank Adaptation) with minimal rank (r=4) + - 4-bit quantization with BitsAndBytesConfig + - Gradient checkpointing + - bfloat16 precision + - Minimal batch sizes and sequence lengths + - Very low GPU memory utilization (5%) + """ + dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + + config = GRPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=1, # Minimal batch size + gradient_accumulation_steps=2, # Make effective batch size 2, divisible by num_generations + num_generations=2, + max_completion_length=4, # Very short completions to reduce memory + max_prompt_length=32, # Very short prompts to reduce memory + use_vllm=True, # Enable vLLM + vllm_mode="colocate", # Use colocate mode to avoid server dependency + vllm_gpu_memory_utilization=0.05, # Use minimal GPU memory (5%) + gradient_checkpointing=True, # Enable gradient checkpointing to save memory + bf16=True, # Use bfloat16 to reduce memory + report_to="none", + logging_strategy="no", + ) + + # Create a VLM processor + processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct", use_fast=True, padding_side="left") + + # Verify processor has both required attributes for VLM detection + assert hasattr(processor, "tokenizer") + assert hasattr(processor, "image_processor") + + def dummy_reward_func(completions, **kwargs): + return [1.0] * len(completions) + + # Use LoRA configuration for memory efficiency + lora_config = LoraConfig( + r=4, # Very low rank for minimal memory + lora_alpha=8, + target_modules=["q_proj", "v_proj"], # Minimal target modules + lora_dropout=0.1, + bias="none", + task_type="CAUSAL_LM", + ) + + # Use 4-bit quantization for further memory reduction + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + ) + + original_env = {} + required_env_vars = { + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + "LOCAL_WORLD_SIZE": "1", + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12355", + } + + for key, value in required_env_vars.items(): + original_env[key] = os.environ.get(key) + os.environ[key] = value + + try: + # Test VLM processor with vLLM colocate mode + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + try: + # Load model with quantization for memory efficiency + model = AutoModelForCausalLM.from_pretrained( + "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", + quantization_config=quantization_config, + dtype=torch.bfloat16, + ) + + trainer = GRPOTrainer( + model=model, + reward_funcs=dummy_reward_func, + args=config, + train_dataset=dataset, + processing_class=processor, # VLM processor + peft_config=lora_config, # Use LoRA for memory efficiency + ) + + # Should detect VLM processor correctly and allow vLLM + assert trainer.use_vllm, "vLLM should be enabled for VLM processors in colocate mode" + assert trainer.vllm_mode == "colocate", "Should use colocate mode" + + # Check if signature columns were set properly + if trainer._signature_columns is not None: + # Should include 'image' in signature columns for VLM processors + assert "image" in trainer._signature_columns, ( + "Should include 'image' in signature columns for VLM" + ) + + # Should not emit any warnings about VLM incompatibility + incompatibility_warnings = [ + str(w_item.message) + for w_item in w + if "does not 
support VLMs" in str(w_item.message) + or "not compatible" in str(w_item.message).lower() + ] + assert len(incompatibility_warnings) == 0, ( + f"Should not emit VLM incompatibility warnings, but got: {incompatibility_warnings}" + ) + + # Test passes if we get this far without exceptions + + except Exception as e: + # If vLLM fails to initialize due to hardware constraints or other issues, that's expected + if any( + keyword in str(e).lower() + for keyword in [ + "outofmemoryerror", + "cuda", + "memory", + "insufficient", + "no such device", + "free memory", + "gpu memory utilization", + "decrease gpu memory", + ] + ): + pytest.skip(f"Skipping vLLM colocate test due to hardware constraints: {e}") + elif "KeyError" in str(e) and "RANK" in str(e): + pytest.skip(f"Skipping vLLM colocate test due to environment setup issues: {e}") + elif "ValueError" in str(e) and "memory" in str(e).lower(): + pytest.skip(f"Skipping vLLM colocate test due to memory constraints: {e}") + else: + raise + finally: + # Restore original environment variables + for key, original_value in original_env.items(): + if original_value is None: + os.environ.pop(key, None) + else: + os.environ[key] = original_value + + release_memory(model, trainer) + + @require_vllm + def test_training_vllm(self): + """Test that training works with vLLM for generation.""" + dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, # increase the learning rate to speed up the test + per_device_train_batch_size=3, # reduce the batch size to reduce memory usage + num_generations=3, # reduce the number of generations to reduce memory usage + max_completion_length=8, # reduce the completion length to reduce memory usage + report_to="none", + logging_strategy="no", + use_vllm=True, + ) + + try: + trainer = GRPOTrainer( + model="Qwen/Qwen2.5-0.5B-Instruct", # tiny models are too small for vLLM + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=dataset, + ) + + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + + # Check that the params have changed + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + + except Exception as e: + # If vLLM fails to initialize due to hardware constraints or other issues, that's expected + if any( + keyword in str(e).lower() + for keyword in [ + "outofmemoryerror", + "cuda", + "memory", + "insufficient", + "no such device", + "free memory", + "gpu memory utilization", + "decrease gpu memory", + ] + ): + pytest.skip(f"Skipping vLLM training test due to hardware constraints: {e}") + elif "KeyError" in str(e) and "RANK" in str(e): + pytest.skip(f"Skipping vLLM training test due to environment setup issues: {e}") + elif "ValueError" in str(e) and "memory" in str(e).lower(): + pytest.skip(f"Skipping vLLM training test due to memory constraints: {e}") + else: + raise + + release_memory(trainer.model, trainer) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index c8c22b93987..3ac03d15e63 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -12,20 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import pathlib from unittest.mock import MagicMock import pytest import torch import transformers +from accelerate.utils.memory import release_memory from datasets import load_dataset from packaging.version import parse as parse_version -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from transformers.testing_utils import backend_empty_cache, torch_device from transformers.utils import is_peft_available from trl import SFTConfig, SFTTrainer from trl.trainer.sft_trainer import DataCollatorForLanguageModeling, dft_loss +from .slow.testing_constants import DEVICE_MAP_OPTIONS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST, PACKING_OPTIONS from .testing_utils import ( TrlTestCase, ignore_warnings, @@ -33,6 +37,8 @@ require_flash_attn, require_liger_kernel, require_peft, + require_torch_accelerator, + require_torch_multi_accelerator, require_vision, ) @@ -1723,3 +1729,432 @@ def test_prompt_tuning_peft_model(self): assert not torch.allclose(param, new_param), f"Parameter {n} has not changed" else: raise ValueError(f"Unexpected parameter {n} in model: {trainer.model}") + + +# Slow tests moved from tests/slow/test_sft_slow.py +@pytest.mark.slow +@require_torch_accelerator +@require_peft +class TestSFTTrainerSlow(TrlTestCase): + def setup_method(self): + self.train_dataset = load_dataset("stanfordnlp/imdb", split="train[:10%]") + self.eval_dataset = load_dataset("stanfordnlp/imdb", split="test[:10%]") + self.max_length = 128 + self.peft_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=8, + bias="none", + task_type="CAUSAL_LM", + ) + + def teardown_method(self): + gc.collect() + backend_empty_cache(torch_device) + gc.collect() + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_sft_trainer_str(self, model_name, packing): + """ + Simply tests if passing a simple str to `SFTTrainer` loads and runs the trainer as expected. + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + ) + + trainer = SFTTrainer( + model_name, + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_sft_trainer_transformers(self, model_name, packing): + """ + Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected. + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_peft + def test_sft_trainer_peft(self, model_name, packing): + """ + Simply tests if passing a transformers model + peft config to `SFTTrainer` loads and runs the trainer as + expected. 
+ """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + fp16=True, + packing=packing, + max_length=self.max_length, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_sft_trainer_transformers_mp(self, model_name, packing): + """ + Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed + precision. + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + fp16=True, # this is sufficient to enable amp + packing=packing, + max_length=self.max_length, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_checkpointing_kwargs): + """ + Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed + precision + different scenarios of gradient_checkpointing. + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + fp16=True, # this is sufficient to enable amp + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_peft + def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient_checkpointing_kwargs): + """ + Simply tests if passing a transformers model + PEFT to `SFTTrainer` loads and runs the trainer as expected in + mixed precision + different scenarios of gradient_checkpointing. 
+ """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + fp16=True, # this is sufficient to enable amp + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("device_map", DEVICE_MAP_OPTIONS) + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_torch_multi_accelerator + def test_sft_trainer_transformers_mp_gc_device_map( + self, model_name, packing, gradient_checkpointing_kwargs, device_map + ): + """ + Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed + precision + different scenarios of gradient_checkpointing (single, multi-gpu, etc). + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + fp16=True, # this is sufficient to enable amp + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_peft + @require_bitsandbytes + def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gradient_checkpointing_kwargs): + """ + Simply tests if passing a transformers model + PEFT + bnb to `SFTTrainer` loads and runs the trainer as + expected in mixed precision + different scenarios of gradient_checkpointing. 
+ """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + fp16=True, # this is sufficient to enable amp + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + + quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) + + model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_peft + @require_bitsandbytes + def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): + """ + Simply tests if using setup_chat_format with a transformers model + peft + bnb config to `SFTTrainer` loads and + runs the trainer as expected. + """ + train_dataset = load_dataset("trl-internal-testing/dolly-chatml-sft", split="train") + + training_args = SFTConfig( + packing=packing, + max_length=self.max_length, + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + fp16=True, + ) + + quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) + + model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=train_dataset, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_liger_kernel + def test_sft_trainer_with_liger(self, model_name, packing): + """ + Tests if passing use_liger=True to SFTConfig loads and runs the trainer with AutoLigerKernelForCausalLM as + expected. 
+ """ + import importlib + + def cleanup_liger_patches(trainer): + """Clean up liger_kernel patches by reloading the model's specific module""" + try: + # Get the specific module that was used by the trainer's model + module_path = trainer.model.__module__ + reload_module = importlib.import_module(module_path) + importlib.reload(reload_module) + except Exception: + pass # Continue if reload fails + + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=2, + packing=packing, + max_length=self.max_length, + use_liger_kernel=True, + ) + + trainer = SFTTrainer( + model_name, + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + # Ensure cleanup of liger patches after the test + try: + trainer.train() + release_memory(trainer.model, trainer) + finally: + cleanup_liger_patches(trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_torch_accelerator + def test_train_offloading(self, model_name, packing): + """Test that activation offloading works with SFTTrainer.""" + # Initialize the trainer + training_args = SFTConfig( + output_dir=self.tmp_dir, + activation_offloading=True, + report_to="none", + per_device_train_batch_size=2, + max_steps=2, + packing=packing, + max_length=self.max_length, + ) + trainer = SFTTrainer( + model=model_name, args=training_args, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset + ) + + # Save the initial parameters to compare them later + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + # Train the model + trainer.train() + + # Check that the training loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + # Check the params have changed + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + assert not torch.allclose(param, new_param), f"Parameter {n} has not changed" + + release_memory(trainer.model, trainer) From 73411d28b8a44fb5981fd73e9d048bdd363c8b4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 31 Oct 2025 01:23:53 +0000 Subject: [PATCH 2/3] drop testing contants --- .../test_grpo_with_replay_buffer_trainer.py | 14 ++ tests/slow/__init__.py | 13 -- tests/slow/testing_constants.py | 26 ---- tests/test_dpo_trainer.py | 53 ++++--- tests/test_grpo_trainer.py | 25 +++- tests/test_sft_trainer.py | 129 ++++++++++++++---- 6 files changed, 169 insertions(+), 91 deletions(-) delete mode 100644 tests/slow/__init__.py delete mode 100644 tests/slow/testing_constants.py diff --git a/tests/experimental/test_grpo_with_replay_buffer_trainer.py b/tests/experimental/test_grpo_with_replay_buffer_trainer.py index cad66f8034c..26d8ab7e7ba 100644 --- a/tests/experimental/test_grpo_with_replay_buffer_trainer.py +++ b/tests/experimental/test_grpo_with_replay_buffer_trainer.py @@ -1,3 +1,17 @@ +# Copyright 2020-2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import torch from datasets import load_dataset diff --git a/tests/slow/__init__.py b/tests/slow/__init__.py deleted file mode 100644 index a3170185781..00000000000 --- a/tests/slow/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/slow/testing_constants.py b/tests/slow/testing_constants.py deleted file mode 100644 index 1dc30320c7f..00000000000 --- a/tests/slow/testing_constants.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -MODELS_TO_TEST = [ - "trl-internal-testing/tiny-LlamaForCausalLM-3.2", - "trl-internal-testing/tiny-MistralForCausalLM-0.2", -] - -# We could have also not declared these variables but let's be verbose -PACKING_OPTIONS = [True, False] -GRADIENT_CHECKPOINTING_KWARGS = [None, {"use_reentrant": False}, {"use_reentrant": True}] -DEVICE_MAP_OPTIONS = [{"": 0}, "auto"] - -DPO_LOSS_TYPES = ["sigmoid", "ipo"] -DPO_PRECOMPUTE_LOGITS = [True, False] diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py index 212ba644591..d729ffc09cd 100644 --- a/tests/test_dpo_trainer.py +++ b/tests/test_dpo_trainer.py @@ -37,12 +37,6 @@ from trl import DPOConfig, DPOTrainer, FDivergenceType -from .slow.testing_constants import ( - DPO_LOSS_TYPES, - DPO_PRECOMPUTE_LOGITS, - GRADIENT_CHECKPOINTING_KWARGS, - MODELS_TO_TEST, -) from .testing_utils import ( TrlTestCase, require_bitsandbytes, @@ -1542,9 +1536,6 @@ def test_f_divergence_type(self, f_divergence_type, as_string: bool): assert configparser_dict["f_divergence_type"] == f_divergence_type.value -# Slow tests moved from tests/slow/test_dpo_slow.py - - @pytest.mark.slow @require_torch_accelerator @require_peft @@ -1565,9 +1556,15 @@ def teardown_method(self): backend_empty_cache(torch_device) gc.collect() - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @pytest.mark.parametrize("pre_compute_logits", [True, False]) + @pytest.mark.parametrize("loss_type", ["sigmoid", "ipo"]) + @pytest.mark.parametrize( + "model_id", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits): """ A test that tests the simple usage of `DPOTrainer` using a bare model in full precision. 
@@ -1611,10 +1608,18 @@ def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits): release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("pre_compute_logits", [True, False]) + @pytest.mark.parametrize("loss_type", ["sigmoid", "ipo"]) + @pytest.mark.parametrize( + "model_id", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft def test_dpo_peft_model(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): """ @@ -1667,10 +1672,18 @@ def test_dpo_peft_model(self, model_id, loss_type, pre_compute_logits, gradient_ release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("pre_compute_logits", [True, False]) + @pytest.mark.parametrize("loss_type", ["sigmoid", "ipo"]) + @pytest.mark.parametrize( + "model_id", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_bitsandbytes @require_peft def test_dpo_peft_model_qlora(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index 0cb24d60a50..a5a39d86642 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -39,7 +39,6 @@ from trl.experimental.gspo_token import GRPOTrainer as GSPOTokenTrainer from trl.trainer.utils import get_kbit_device_map -from .slow.testing_constants import MODELS_TO_TEST from .testing_utils import ( TrlTestCase, require_bitsandbytes, @@ -1787,7 +1786,13 @@ def teardown_method(self): backend_empty_cache(torch_device) gc.collect() - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_liger_kernel def test_training_with_liger_grpo_loss(self, model_name): training_args = GRPOConfig( @@ -1827,7 +1832,13 @@ def test_training_with_liger_grpo_loss(self, model_name): release_memory(model, trainer) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_liger_kernel @require_peft def test_training_with_liger_grpo_loss_and_peft(self, model_name): @@ -1891,7 +1902,13 @@ def test_training_with_liger_grpo_loss_and_peft(self, model_name): release_memory(model, trainer) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def 
test_training_with_transformers_paged(self, model_name): """Test that training works with transformers paged implementation (requires GPU).""" if Version(transformers.__version__) < Version("4.57.0"): diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index dcced1b4f99..d1b03bd06d1 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -29,7 +29,6 @@ from trl import SFTConfig, SFTTrainer from trl.trainer.sft_trainer import DataCollatorForLanguageModeling, dft_loss -from .slow.testing_constants import DEVICE_MAP_OPTIONS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST, PACKING_OPTIONS from .testing_utils import ( TrlTestCase, ignore_warnings, @@ -1755,8 +1754,14 @@ def teardown_method(self): backend_empty_cache(torch_device) gc.collect() - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_sft_trainer_str(self, model_name, packing): """ Simply tests if passing a simple str to `SFTTrainer` loads and runs the trainer as expected. @@ -1780,8 +1785,14 @@ def test_sft_trainer_str(self, model_name, packing): trainer.train() - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_sft_trainer_transformers(self, model_name, packing): """ Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected. 
@@ -1811,8 +1822,14 @@ def test_sft_trainer_transformers(self, model_name, packing): release_memory(model, trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft def test_sft_trainer_peft(self, model_name, packing): """ @@ -1848,8 +1865,14 @@ def test_sft_trainer_peft(self, model_name, packing): release_memory(model, trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_sft_trainer_transformers_mp(self, model_name, packing): """ Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed @@ -1881,9 +1904,17 @@ def test_sft_trainer_transformers_mp(self, model_name, packing): release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_checkpointing_kwargs): """ Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed @@ -1917,9 +1948,17 @@ def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_chec release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient_checkpointing_kwargs): """ @@ -1957,10 +1996,18 @@ def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient release_memory(model, trainer) - @pytest.mark.parametrize("device_map", DEVICE_MAP_OPTIONS) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("device_map", [{"": 0}, "auto"]) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + 
"trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_torch_multi_accelerator def test_sft_trainer_transformers_mp_gc_device_map( self, model_name, packing, gradient_checkpointing_kwargs, device_map @@ -1997,9 +2044,17 @@ def test_sft_trainer_transformers_mp_gc_device_map( release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft @require_bitsandbytes def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gradient_checkpointing_kwargs): @@ -2040,8 +2095,14 @@ def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gr release_memory(model, trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft @require_bitsandbytes def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): @@ -2081,8 +2142,14 @@ def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): release_memory(model, trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_liger_kernel def test_sft_trainer_with_liger(self, model_name, packing): """ @@ -2126,8 +2193,14 @@ def cleanup_liger_patches(trainer): finally: cleanup_liger_patches(trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_torch_accelerator def test_train_offloading(self, model_name, packing): """Test that activation offloading works with SFTTrainer.""" From 8705710814df3be2437c9f1012ce0349f8d1c636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 31 Oct 2025 01:24:44 +0000 Subject: [PATCH 3/3] remove comments --- tests/test_grpo_trainer.py | 1 - tests/test_sft_trainer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index a5a39d86642..7e42dbb7a6e 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -1772,7 +1772,6 @@ def test_training(self): assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
-# Slow tests moved from tests/slow/test_grpo_slow.py @pytest.mark.slow @require_torch_accelerator class TestGRPOTrainerSlow(TrlTestCase): diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index d1b03bd06d1..7fe7da9dd07 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1732,7 +1732,6 @@ def test_prompt_tuning_peft_model(self): raise ValueError(f"Unexpected parameter {n} in model: {trainer.model}") -# Slow tests moved from tests/slow/test_sft_slow.py @pytest.mark.slow @require_torch_accelerator @require_peft
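With the tests/slow/ package deleted, the slow cases are distinguished from the fast ones only by the @pytest.mark.slow marker on the Test*Slow classes added above. How that marker is registered and selected is outside this patch series; purely as an illustrative sketch (the conftest.py hooks and the --runslow flag below are hypothetical, not part of TRL), a root conftest.py could keep slow-marked tests deselected by default and expose an opt-in flag:

# conftest.py -- hypothetical sketch, not part of this patch series.
# Skips any test carrying the "slow" marker unless --runslow is passed.
import pytest


def pytest_addoption(parser):
    # Opt-in flag: slow tests stay skipped in a default run.
    parser.addoption("--runslow", action="store_true", default=False, help="run tests marked as slow")


def pytest_configure(config):
    # Register the marker so pytest does not warn about an unknown mark.
    config.addinivalue_line("markers", "slow: slow integration tests, skipped unless --runslow is given")


def pytest_collection_modifyitems(config, items):
    if config.getoption("--runslow"):
        return  # --runslow given: leave slow tests selected
    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
    for item in items:
        if "slow" in item.keywords:
            item.add_marker(skip_slow)

Under this sketch, a run such as `pytest --runslow -m slow tests/test_sft_trainer.py` would exercise only the relocated slow tests, while a plain `pytest tests/` collects them but reports them as skipped.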