From 288134085f6a788f3551b15474719e1f9c7a8ebc Mon Sep 17 00:00:00 2001 From: Ishita Bhattacharyya Date: Fri, 31 Oct 2025 01:51:22 +0530 Subject: [PATCH 1/3] Consolidate slow tests into main test files Moved slow test cases from tests/slow/ directory into their respective main test files and added missing imports (gc, BitsAndBytesConfig). --- tests/slow/test_dpo_slow.py | 213 -------------- tests/slow/test_grpo_slow.py | 554 ----------------------------------- tests/slow/test_sft_slow.py | 467 ----------------------------- tests/test_dpo_trainer.py | 200 ++++++++++++- tests/test_grpo_trainer.py | 530 ++++++++++++++++++++++++++++++++- tests/test_sft_trainer.py | 437 ++++++++++++++++++++++++++- 6 files changed, 1162 insertions(+), 1239 deletions(-) delete mode 100644 tests/slow/test_dpo_slow.py delete mode 100644 tests/slow/test_grpo_slow.py delete mode 100755 tests/slow/test_sft_slow.py diff --git a/tests/slow/test_dpo_slow.py b/tests/slow/test_dpo_slow.py deleted file mode 100644 index 03c2c60abc8..00000000000 --- a/tests/slow/test_dpo_slow.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import torch -from accelerate.utils.memory import release_memory -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig -from transformers.testing_utils import backend_empty_cache, torch_device -from transformers.utils import is_peft_available - -from trl import DPOConfig, DPOTrainer - -from ..testing_utils import TrlTestCase, require_bitsandbytes, require_peft, require_torch_accelerator -from .testing_constants import DPO_LOSS_TYPES, DPO_PRECOMPUTE_LOGITS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST - - -if is_peft_available(): - from peft import LoraConfig, PeftModel - - -@pytest.mark.slow -@require_torch_accelerator -@require_peft -class TestDPOTrainerSlow(TrlTestCase): - def setup_method(self): - self.dataset = load_dataset("trl-internal-testing/zen", "standard_preference") - self.peft_config = LoraConfig( - lora_alpha=16, - lora_dropout=0.1, - r=8, - bias="none", - task_type="CAUSAL_LM", - ) - self.max_length = 128 - - def teardown_method(self): - gc.collect() - backend_empty_cache(torch_device) - gc.collect() - - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) - def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits): - """ - A test that tests the simple usage of `DPOTrainer` using a bare model in full precision. 
- """ - model = AutoModelForCausalLM.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - training_args = DPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=2, - max_steps=2, - remove_unused_columns=False, - gradient_accumulation_steps=2, - learning_rate=9e-1, - eval_strategy="steps", - fp16=True, - logging_strategy="no", - report_to="none", - beta=0.1, - loss_type=loss_type, - precompute_ref_log_probs=pre_compute_logits, - max_length=self.max_length, - ) - - # dpo train lora model - trainer = DPOTrainer( - model=model, - ref_model=None, - args=training_args, - train_dataset=self.dataset["train"], - eval_dataset=self.dataset["test"], - processing_class=tokenizer, - ) - - # train the model - trainer.train() - - # save trained model or adapter - trainer.save_model() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) - @require_peft - def test_dpo_peft_model(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): - """ - A test that tests the simple usage of `DPOTrainer` using a peft model in full precision + different scenarios - of gradient checkpointing. - """ - model = AutoModelForCausalLM.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - training_args = DPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=2, - max_steps=2, - remove_unused_columns=False, - gradient_accumulation_steps=2, - learning_rate=9e-1, - eval_strategy="steps", - fp16=True, - logging_strategy="no", - report_to="none", - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - generate_during_eval=False, - loss_type=loss_type, - precompute_ref_log_probs=pre_compute_logits, - beta=0.1, - max_length=self.max_length, - ) - - # dpo train lora model - trainer = DPOTrainer( - model=model, - ref_model=None, - args=training_args, - train_dataset=self.dataset["train"], - eval_dataset=self.dataset["test"], - processing_class=tokenizer, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - assert trainer.ref_model is None - - # train the model - trainer.train() - - # save trained model or adapter - trainer.save_model() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) - @require_bitsandbytes - @require_peft - def test_dpo_peft_model_qlora(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): - """ - A test that tests the simple usage of `DPOTrainer` using QLoRA + different scenarios of gradient checkpointing. 
- """ - quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) - - model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - training_args = DPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=2, - max_steps=2, - remove_unused_columns=False, - gradient_accumulation_steps=2, - learning_rate=9e-1, - eval_strategy="steps", - fp16=True, - logging_strategy="no", - report_to="none", - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - beta=0.1, - generate_during_eval=False, - loss_type=loss_type, - precompute_ref_log_probs=pre_compute_logits, - max_length=self.max_length, - ) - - # dpo train lora model - trainer = DPOTrainer( - model=model, - ref_model=None, - args=training_args, - train_dataset=self.dataset["train"], - eval_dataset=self.dataset["test"], - processing_class=tokenizer, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - assert trainer.ref_model is None - - # train the model - trainer.train() - - # save trained model or adapter - trainer.save_model() - - release_memory(model, trainer) diff --git a/tests/slow/test_grpo_slow.py b/tests/slow/test_grpo_slow.py deleted file mode 100644 index bf63984d645..00000000000 --- a/tests/slow/test_grpo_slow.py +++ /dev/null @@ -1,554 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import os -import warnings - -import numpy as np -import pytest -import torch -import transformers -from accelerate.utils.memory import release_memory -from datasets import Dataset, Features, Image, Value, load_dataset -from packaging.version import Version -from transformers import ( - AutoModelForCausalLM, - AutoModelForImageTextToText, - AutoProcessor, - AutoTokenizer, - BitsAndBytesConfig, -) -from transformers.testing_utils import backend_empty_cache, torch_device -from transformers.utils import is_peft_available - -from trl import GRPOConfig, GRPOTrainer -from trl.trainer.utils import get_kbit_device_map - -from ..testing_utils import ( - TrlTestCase, - require_bitsandbytes, - require_flash_attn, - require_liger_kernel, - require_peft, - require_torch_accelerator, - require_vllm, -) -from .testing_constants import MODELS_TO_TEST - - -if is_peft_available(): - from peft import LoraConfig, PeftModel - - -@pytest.mark.slow -@require_torch_accelerator -class TestGRPOTrainerSlow(TrlTestCase): - def setup_method(self): - self.train_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") - self.eval_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="test") - self.max_length = 128 - - def teardown_method(self): - gc.collect() - backend_empty_cache(torch_device) - gc.collect() - - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_liger_kernel - def test_training_with_liger_grpo_loss(self, model_name): - training_args = GRPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=3, - num_generations=3, - use_liger_loss=True, - max_completion_length=self.max_length, - report_to="none", - logging_strategy="no", - loss_type="bnpo", # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620 - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - trainer = GRPOTrainer( - model=model, - reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - processing_class=tokenizer, - ) - from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss - - assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss) - - previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()} - - trainer.train() - - for n, param in previous_trainable_params.items(): - new_param = model.get_parameter(n) - assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
- - release_memory(model, trainer) - - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_liger_kernel - @require_peft - def test_training_with_liger_grpo_loss_and_peft(self, model_name): - from peft import LoraConfig, TaskType - - training_args = GRPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=3, - num_generations=3, - use_liger_loss=True, - max_completion_length=self.max_length, - report_to="none", - logging_strategy="no", - loss_type="bnpo", # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620 - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token - - # Configure PEFT with LoRA - peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - inference_mode=False, - r=8, - lora_alpha=32, - lora_dropout=0.1, - target_modules=["q_proj", "v_proj"], - ) - - trainer = GRPOTrainer( - model=model, - reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - processing_class=tokenizer, - peft_config=peft_config, - ) - from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss - - assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss) - - # Verify PEFT adapter is properly initialized - from peft import PeftModel - - assert isinstance(trainer.model, PeftModel), "Model should be wrapped with PEFT" - - # Store adapter weights before training - previous_trainable_params = { - n: param.clone() for n, param in trainer.model.named_parameters() if param.requires_grad - } - assert len(previous_trainable_params) > 0, "No trainable parameters found in PEFT model" - - trainer.train() - - # Verify adapter weights have changed after training - for n, param in previous_trainable_params.items(): - new_param = trainer.model.get_parameter(n) - assert not torch.equal(param, new_param), f"Parameter {n} has not changed." - - release_memory(model, trainer) - - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_training_with_transformers_paged(self, model_name): - """Test that training works with transformers paged implementation (requires GPU).""" - if Version(transformers.__version__) < Version("4.57.0"): - pytest.xfail("Upstream bug in transformers (GH#40692). 
Fix merged; awaiting release >= 4.57.0") - training_args = GRPOConfig( - output_dir=self.tmp_dir, - learning_rate=0.1, # increase the learning rate to speed up the test - per_device_train_batch_size=3, # reduce the batch size to reduce memory usage - num_generations=3, # reduce the number of generations to reduce memory usage - max_completion_length=8, # reduce the completion length to reduce memory usage - use_transformers_paged=True, # Enable transformers paged implementation - report_to="none", - logging_strategy="no", - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - - trainer = GRPOTrainer( - model=model, - reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", - args=training_args, - train_dataset=self.train_dataset, - ) - - previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()} - - trainer.train() - - assert trainer.state.log_history[-1]["train_loss"] is not None - - # Check that the params have changed - for n, param in previous_trainable_params.items(): - new_param = model.get_parameter(n) - assert not torch.equal(param, new_param), f"Parameter {n} has not changed." - - release_memory(model, trainer) - - @pytest.mark.parametrize( - "model_name", - [ - "HuggingFaceTB/SmolVLM-Instruct", # Only test the smaller model to avoid OOM - ], - ) - @require_flash_attn - @require_bitsandbytes - @require_peft - def test_vlm_training(self, model_name): - """ - Test VLM training with aggressive memory optimization. - - This test uses multiple memory reduction techniques: - - 4-bit quantization with double quantization - - LoRA with very low rank (r=4) - - Minimal batch size (1) with gradient accumulation - - Small images (64x64 instead of 224x224) - - Short sequences (max_completion_length=8) - - Only 4 training samples - - Only 1 training step - - Gradient checkpointing and bfloat16 - """ - - # Create processor once outside the data generator - processor = AutoProcessor.from_pretrained(model_name, use_fast=True, padding_side="left") - conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is in the image?"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - - def data_gen(num_samples): - for _ in range(num_samples): - yield { - "prompt": prompt, - "image": np.random.uniform(low=0.0, high=255.0, size=(64, 64, 3)).astype( - np.uint8 - ), # Much smaller images - } - - dataset = Dataset.from_generator( - data_gen, gen_kwargs={"num_samples": 4}, features=Features(image=Image(), prompt=Value(dtype="string")) - ) - # reduce memory requirements as much as possible - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype="bfloat16", - bnb_4bit_quant_type="nf4", - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_storage="bfloat16", - ) - model = AutoModelForImageTextToText.from_pretrained( - model_name, - attn_implementation="flash_attention_2", - dtype="bfloat16", - device_map=get_kbit_device_map(), - quantization_config=quantization_config, - ) - - def reward_func(prompts, completions, **kwargs): - # simple nonsensical reward - return [-((len(c) - 25) ** 2) + 100 for c in completions] - - training_args = GRPOConfig( - output_dir=self.tmp_dir, - learning_rate=0.1, - per_device_train_batch_size=1, # Minimal batch size - gradient_accumulation_steps=2, # Maintain effective batch size - num_generations=2, - max_completion_length=8, # Much shorter completions - max_prompt_length=None, # Don't limit prompt 
length for VLM - bf16=True, # Use bfloat16 precision - max_steps=1, # Only do 1 training step to save time and memory - report_to="none", - logging_strategy="no", - ) - lora_config = LoraConfig( - task_type="CAUSAL_LM", - r=4, # Much lower rank for minimal memory - lora_alpha=8, # Reduced alpha proportionally - lora_dropout=0.1, - target_modules=["q_proj", "v_proj"], # Minimal target modules - # For VLM models, we typically want to freeze the vision encoder - # and only adapt the language model parameters - modules_to_save=None, - ) - - try: - trainer = GRPOTrainer( - model=model, - processing_class=processor, - reward_funcs=[reward_func], - args=training_args, - train_dataset=dataset, - peft_config=lora_config, - ) - - assert isinstance(trainer.model, PeftModel) - - previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} - - trainer.train() - - assert trainer.state.log_history[-1]["train_loss"] is not None - - # Check that LoRA parameters have changed - # For VLM models, we're more permissive about which parameters can change - lora_params_changed = False - for n, param in previous_trainable_params.items(): - new_param = trainer.model.get_parameter(n) - if "lora" in n.lower(): # LoRA parameters should change - if not torch.equal(param, new_param): - lora_params_changed = True - - # At least some LoRA parameters should have changed during training - assert lora_params_changed, "No LoRA parameters were updated during training." - - except torch.OutOfMemoryError as e: - pytest.skip(f"Skipping VLM training test due to insufficient GPU memory: {e}") - except Exception as e: - # Check for other memory-related errors - if any(keyword in str(e).lower() for keyword in ["memory", "cuda", "out of memory", "insufficient"]): - pytest.skip(f"Skipping VLM training test due to hardware constraints: {e}") - else: - raise - - release_memory(model, trainer) - - @require_vllm - @require_bitsandbytes - @require_peft - def test_vlm_processor_vllm_colocate_mode(self): - """ - Test that VLM processors work with vLLM in colocate mode. 
- - This test uses multiple memory optimization techniques to ensure it runs on limited hardware: - - LoRA (Low-Rank Adaptation) with minimal rank (r=4) - - 4-bit quantization with BitsAndBytesConfig - - Gradient checkpointing - - bfloat16 precision - - Minimal batch sizes and sequence lengths - - Very low GPU memory utilization (5%) - """ - dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") - - config = GRPOConfig( - output_dir=self.tmp_dir, - per_device_train_batch_size=1, # Minimal batch size - gradient_accumulation_steps=2, # Make effective batch size 2, divisible by num_generations - num_generations=2, - max_completion_length=4, # Very short completions to reduce memory - max_prompt_length=32, # Very short prompts to reduce memory - use_vllm=True, # Enable vLLM - vllm_mode="colocate", # Use colocate mode to avoid server dependency - vllm_gpu_memory_utilization=0.05, # Use minimal GPU memory (5%) - gradient_checkpointing=True, # Enable gradient checkpointing to save memory - bf16=True, # Use bfloat16 to reduce memory - report_to="none", - logging_strategy="no", - ) - - # Create a VLM processor - processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct", use_fast=True, padding_side="left") - - # Verify processor has both required attributes for VLM detection - assert hasattr(processor, "tokenizer") - assert hasattr(processor, "image_processor") - - def dummy_reward_func(completions, **kwargs): - return [1.0] * len(completions) - - # Use LoRA configuration for memory efficiency - lora_config = LoraConfig( - r=4, # Very low rank for minimal memory - lora_alpha=8, - target_modules=["q_proj", "v_proj"], # Minimal target modules - lora_dropout=0.1, - bias="none", - task_type="CAUSAL_LM", - ) - - # Use 4-bit quantization for further memory reduction - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_quant_type="nf4", - bnb_4bit_use_double_quant=True, - ) - - original_env = {} - required_env_vars = { - "RANK": "0", - "LOCAL_RANK": "0", - "WORLD_SIZE": "1", - "LOCAL_WORLD_SIZE": "1", - "MASTER_ADDR": "localhost", - "MASTER_PORT": "12355", - } - - for key, value in required_env_vars.items(): - original_env[key] = os.environ.get(key) - os.environ[key] = value - - try: - # Test VLM processor with vLLM colocate mode - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - try: - # Load model with quantization for memory efficiency - model = AutoModelForCausalLM.from_pretrained( - "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", - quantization_config=quantization_config, - dtype=torch.bfloat16, - ) - - trainer = GRPOTrainer( - model=model, - reward_funcs=dummy_reward_func, - args=config, - train_dataset=dataset, - processing_class=processor, # VLM processor - peft_config=lora_config, # Use LoRA for memory efficiency - ) - - # Should detect VLM processor correctly and allow vLLM - assert trainer.use_vllm, "vLLM should be enabled for VLM processors in colocate mode" - assert trainer.vllm_mode == "colocate", "Should use colocate mode" - - # Check if signature columns were set properly - if trainer._signature_columns is not None: - # Should include 'image' in signature columns for VLM processors - assert "image" in trainer._signature_columns, ( - "Should include 'image' in signature columns for VLM" - ) - - # Should not emit any warnings about VLM incompatibility - incompatibility_warnings = [ - str(w_item.message) - for w_item in w - if "does not 
support VLMs" in str(w_item.message) - or "not compatible" in str(w_item.message).lower() - ] - assert len(incompatibility_warnings) == 0, ( - f"Should not emit VLM incompatibility warnings, but got: {incompatibility_warnings}" - ) - - # Test passes if we get this far without exceptions - - except Exception as e: - # If vLLM fails to initialize due to hardware constraints or other issues, that's expected - if any( - keyword in str(e).lower() - for keyword in [ - "outofmemoryerror", - "cuda", - "memory", - "insufficient", - "no such device", - "free memory", - "gpu memory utilization", - "decrease gpu memory", - ] - ): - pytest.skip(f"Skipping vLLM colocate test due to hardware constraints: {e}") - elif "KeyError" in str(e) and "RANK" in str(e): - pytest.skip(f"Skipping vLLM colocate test due to environment setup issues: {e}") - elif "ValueError" in str(e) and "memory" in str(e).lower(): - pytest.skip(f"Skipping vLLM colocate test due to memory constraints: {e}") - else: - raise - finally: - # Restore original environment variables - for key, original_value in original_env.items(): - if original_value is None: - os.environ.pop(key, None) - else: - os.environ[key] = original_value - - release_memory(model, trainer) - - @require_vllm - def test_training_vllm(self): - """Test that training works with vLLM for generation.""" - dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") - - training_args = GRPOConfig( - output_dir=self.tmp_dir, - learning_rate=0.1, # increase the learning rate to speed up the test - per_device_train_batch_size=3, # reduce the batch size to reduce memory usage - num_generations=3, # reduce the number of generations to reduce memory usage - max_completion_length=8, # reduce the completion length to reduce memory usage - report_to="none", - logging_strategy="no", - use_vllm=True, - ) - - try: - trainer = GRPOTrainer( - model="Qwen/Qwen2.5-0.5B-Instruct", # tiny models are too small for vLLM - reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", - args=training_args, - train_dataset=dataset, - ) - - previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} - - trainer.train() - - assert trainer.state.log_history[-1]["train_loss"] is not None - - # Check that the params have changed - for n, param in previous_trainable_params.items(): - new_param = trainer.model.get_parameter(n) - assert not torch.equal(param, new_param), f"Parameter {n} has not changed." - - except Exception as e: - # If vLLM fails to initialize due to hardware constraints or other issues, that's expected - if any( - keyword in str(e).lower() - for keyword in [ - "outofmemoryerror", - "cuda", - "memory", - "insufficient", - "no such device", - "free memory", - "gpu memory utilization", - "decrease gpu memory", - ] - ): - pytest.skip(f"Skipping vLLM training test due to hardware constraints: {e}") - elif "KeyError" in str(e) and "RANK" in str(e): - pytest.skip(f"Skipping vLLM training test due to environment setup issues: {e}") - elif "ValueError" in str(e) and "memory" in str(e).lower(): - pytest.skip(f"Skipping vLLM training test due to memory constraints: {e}") - else: - raise - - release_memory(trainer.model, trainer) diff --git a/tests/slow/test_sft_slow.py b/tests/slow/test_sft_slow.py deleted file mode 100755 index dddf124e947..00000000000 --- a/tests/slow/test_sft_slow.py +++ /dev/null @@ -1,467 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import torch -from accelerate.utils.memory import release_memory -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig -from transformers.testing_utils import backend_empty_cache, torch_device -from transformers.utils import is_peft_available - -from trl import SFTConfig, SFTTrainer - -from ..testing_utils import ( - TrlTestCase, - require_bitsandbytes, - require_liger_kernel, - require_peft, - require_torch_accelerator, - require_torch_multi_accelerator, -) -from .testing_constants import DEVICE_MAP_OPTIONS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST, PACKING_OPTIONS - - -if is_peft_available(): - from peft import LoraConfig, PeftModel - - -@pytest.mark.slow -@require_torch_accelerator -@require_peft -class TestSFTTrainerSlow(TrlTestCase): - def setup_method(self): - self.train_dataset = load_dataset("stanfordnlp/imdb", split="train[:10%]") - self.eval_dataset = load_dataset("stanfordnlp/imdb", split="test[:10%]") - self.max_length = 128 - self.peft_config = LoraConfig( - lora_alpha=16, - lora_dropout=0.1, - r=8, - bias="none", - task_type="CAUSAL_LM", - ) - - def teardown_method(self): - gc.collect() - backend_empty_cache(torch_device) - gc.collect() - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_sft_trainer_str(self, model_name, packing): - """ - Simply tests if passing a simple str to `SFTTrainer` loads and runs the trainer as expected. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - ) - - trainer = SFTTrainer( - model_name, - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_sft_trainer_transformers(self, model_name, packing): - """ - Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected. 
- """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_peft - def test_sft_trainer_peft(self, model_name, packing): - """ - Simply tests if passing a transformers model + peft config to `SFTTrainer` loads and runs the trainer as - expected. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - fp16=True, - packing=packing, - max_length=self.max_length, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_sft_trainer_transformers_mp(self, model_name, packing): - """ - Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed - precision. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - fp16=True, # this is sufficient to enable amp - packing=packing, - max_length=self.max_length, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_checkpointing_kwargs): - """ - Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed - precision + different scenarios of gradient_checkpointing. 
- """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - fp16=True, # this is sufficient to enable amp - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_peft - def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient_checkpointing_kwargs): - """ - Simply tests if passing a transformers model + PEFT to `SFTTrainer` loads and runs the trainer as expected in - mixed precision + different scenarios of gradient_checkpointing. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - fp16=True, # this is sufficient to enable amp - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("device_map", DEVICE_MAP_OPTIONS) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_torch_multi_accelerator - def test_sft_trainer_transformers_mp_gc_device_map( - self, model_name, packing, gradient_checkpointing_kwargs, device_map - ): - """ - Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed - precision + different scenarios of gradient_checkpointing (single, multi-gpu, etc). 
- """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - fp16=True, # this is sufficient to enable amp - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - ) - - model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_peft - @require_bitsandbytes - def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gradient_checkpointing_kwargs): - """ - Simply tests if passing a transformers model + PEFT + bnb to `SFTTrainer` loads and runs the trainer as - expected in mixed precision + different scenarios of gradient_checkpointing. - """ - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - packing=packing, - max_length=self.max_length, - fp16=True, # this is sufficient to enable amp - gradient_checkpointing=True, - gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, - ) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) - - model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_peft - @require_bitsandbytes - def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): - """ - Simply tests if using setup_chat_format with a transformers model + peft + bnb config to `SFTTrainer` loads and - runs the trainer as expected. 
- """ - train_dataset = load_dataset("trl-internal-testing/dolly-chatml-sft", split="train") - - training_args = SFTConfig( - packing=packing, - max_length=self.max_length, - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=10, - fp16=True, - ) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) - - model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - trainer = SFTTrainer( - model, - args=training_args, - processing_class=tokenizer, - train_dataset=train_dataset, - peft_config=self.peft_config, - ) - - assert isinstance(trainer.model, PeftModel) - - trainer.train() - - release_memory(model, trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_liger_kernel - def test_sft_trainer_with_liger(self, model_name, packing): - """ - Tests if passing use_liger=True to SFTConfig loads and runs the trainer with AutoLigerKernelForCausalLM as - expected. - """ - import importlib - - def cleanup_liger_patches(trainer): - """Clean up liger_kernel patches by reloading the model's specific module""" - try: - # Get the specific module that was used by the trainer's model - module_path = trainer.model.__module__ - reload_module = importlib.import_module(module_path) - importlib.reload(reload_module) - except Exception: - pass # Continue if reload fails - - training_args = SFTConfig( - output_dir=self.tmp_dir, - logging_strategy="no", - report_to="none", - per_device_train_batch_size=2, - max_steps=2, - packing=packing, - max_length=self.max_length, - use_liger_kernel=True, - ) - - trainer = SFTTrainer( - model_name, - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - ) - - # Ensure cleanup of liger patches after the test - try: - trainer.train() - release_memory(trainer.model, trainer) - finally: - cleanup_liger_patches(trainer) - - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) - @require_torch_accelerator - def test_train_offloading(self, model_name, packing): - """Test that activation offloading works with SFTTrainer.""" - # Initialize the trainer - training_args = SFTConfig( - output_dir=self.tmp_dir, - activation_offloading=True, - report_to="none", - per_device_train_batch_size=2, - max_steps=2, - packing=packing, - max_length=self.max_length, - ) - trainer = SFTTrainer( - model=model_name, args=training_args, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset - ) - - # Save the initial parameters to compare them later - previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} - - # Train the model - trainer.train() - - # Check that the training loss is not None - assert trainer.state.log_history[-1]["train_loss"] is not None - - # Check the params have changed - for n, param in previous_trainable_params.items(): - new_param = trainer.model.get_parameter(n) - assert not torch.allclose(param, new_param), f"Parameter {n} has not changed" - - release_memory(trainer.model, trainer) diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py index fa7038167d4..212ba644591 100644 --- a/tests/test_dpo_trainer.py +++ b/tests/test_dpo_trainer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under 
the License. +import gc import re import sys from unittest.mock import MagicMock @@ -19,6 +20,7 @@ import numpy as np import pytest import torch +from accelerate.utils.memory import release_memory from datasets import Dataset, features, load_dataset from transformers import ( AutoModelForCausalLM, @@ -26,19 +28,28 @@ AutoModelForSeq2SeqLM, AutoProcessor, AutoTokenizer, + BitsAndBytesConfig, PreTrainedTokenizerBase, is_vision_available, ) -from transformers.testing_utils import get_device_properties +from transformers.testing_utils import backend_empty_cache, get_device_properties, torch_device +from transformers.utils import is_peft_available from trl import DPOConfig, DPOTrainer, FDivergenceType +from .slow.testing_constants import ( + DPO_LOSS_TYPES, + DPO_PRECOMPUTE_LOGITS, + GRADIENT_CHECKPOINTING_KWARGS, + MODELS_TO_TEST, +) from .testing_utils import ( TrlTestCase, require_bitsandbytes, require_liger_kernel, require_no_wandb, require_peft, + require_torch_accelerator, require_torch_gpu_if_bnb_not_multi_backend_enabled, require_vision, ) @@ -47,6 +58,9 @@ if is_vision_available(): from PIL import Image +if is_peft_available(): + from peft import LoraConfig, PeftModel + class TestTokenizeRow(TrlTestCase): def setup_method(self): @@ -1526,3 +1540,187 @@ def test_f_divergence_type(self, f_divergence_type, as_string: bool): # Serialization: TrainingArguments.to_dict should yield the enum's string value configparser_dict = training_args.to_dict() assert configparser_dict["f_divergence_type"] == f_divergence_type.value + + +# Slow tests moved from tests/slow/test_dpo_slow.py + + +@pytest.mark.slow +@require_torch_accelerator +@require_peft +class TestDPOTrainerSlow(TrlTestCase): + def setup_method(self): + self.dataset = load_dataset("trl-internal-testing/zen", "standard_preference") + self.peft_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=8, + bias="none", + task_type="CAUSAL_LM", + ) + self.max_length = 128 + + def teardown_method(self): + gc.collect() + backend_empty_cache(torch_device) + gc.collect() + + @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) + @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits): + """ + A test that tests the simple usage of `DPOTrainer` using a bare model in full precision. 
+ """ + model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + training_args = DPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=2, + max_steps=2, + remove_unused_columns=False, + gradient_accumulation_steps=2, + learning_rate=9e-1, + eval_strategy="steps", + fp16=True, + logging_strategy="no", + report_to="none", + beta=0.1, + loss_type=loss_type, + precompute_ref_log_probs=pre_compute_logits, + max_length=self.max_length, + ) + + # dpo train lora model + trainer = DPOTrainer( + model=model, + ref_model=None, + args=training_args, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["test"], + processing_class=tokenizer, + ) + + # train the model + trainer.train() + + # save trained model or adapter + trainer.save_model() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) + @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @require_peft + def test_dpo_peft_model(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): + """ + A test that tests the simple usage of `DPOTrainer` using a peft model in full precision + different scenarios + of gradient checkpointing. + """ + model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + training_args = DPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=2, + max_steps=2, + remove_unused_columns=False, + gradient_accumulation_steps=2, + learning_rate=9e-1, + eval_strategy="steps", + fp16=True, + logging_strategy="no", + report_to="none", + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + generate_during_eval=False, + loss_type=loss_type, + precompute_ref_log_probs=pre_compute_logits, + beta=0.1, + max_length=self.max_length, + ) + + # dpo train lora model + trainer = DPOTrainer( + model=model, + ref_model=None, + args=training_args, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["test"], + processing_class=tokenizer, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + assert trainer.ref_model is None + + # train the model + trainer.train() + + # save trained model or adapter + trainer.save_model() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) + @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @require_bitsandbytes + @require_peft + def test_dpo_peft_model_qlora(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): + """ + A test that tests the simple usage of `DPOTrainer` using QLoRA + different scenarios of gradient checkpointing. 
+ """ + quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) + + model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + training_args = DPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=2, + max_steps=2, + remove_unused_columns=False, + gradient_accumulation_steps=2, + learning_rate=9e-1, + eval_strategy="steps", + fp16=True, + logging_strategy="no", + report_to="none", + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + beta=0.1, + generate_during_eval=False, + loss_type=loss_type, + precompute_ref_log_probs=pre_compute_logits, + max_length=self.max_length, + ) + + # dpo train lora model + trainer = DPOTrainer( + model=model, + ref_model=None, + args=training_args, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["test"], + processing_class=tokenizer, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + assert trainer.ref_model is None + + # train the model + trainer.train() + + # save trained model or adapter + trainer.save_model() + + release_memory(model, trainer) diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index caf637b69b3..0cb24d60a50 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -12,23 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc +import os +import warnings from unittest.mock import patch +import numpy as np import pytest import torch -from datasets import load_dataset +import transformers +from accelerate.utils.memory import release_memory +from datasets import Dataset, Features, Image, Value, load_dataset +from packaging.version import Version from transformers import ( AutoModelForCausalLM, AutoModelForImageTextToText, AutoModelForSequenceClassification, + AutoProcessor, AutoTokenizer, + BitsAndBytesConfig, ) +from transformers.testing_utils import backend_empty_cache, torch_device from transformers.utils import is_peft_available from trl import GRPOConfig, GRPOTrainer from trl.experimental.gspo_token import GRPOTrainer as GSPOTokenTrainer - -from .testing_utils import TrlTestCase, require_liger_kernel, require_peft, require_vision, require_vllm +from trl.trainer.utils import get_kbit_device_map + +from .slow.testing_constants import MODELS_TO_TEST +from .testing_utils import ( + TrlTestCase, + require_bitsandbytes, + require_flash_attn, + require_liger_kernel, + require_peft, + require_torch_accelerator, + require_vision, + require_vllm, +) if is_peft_available(): @@ -1750,3 +1771,506 @@ def test_training(self): for n, param in previous_trainable_params.items(): new_param = trainer.model.get_parameter(n) assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
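The slow-test classes consolidated below all rely on the same accelerator memory-hygiene pattern, which is why this patch adds the gc, release_memory, backend_empty_cache, and torch_device imports to the main test files. For reference, a minimal sketch of that pattern, distilled from the moved tests, follows; the class and test names are illustrative only and are not part of the patch:

import gc

import pytest
from accelerate.utils.memory import release_memory
from transformers import AutoModelForCausalLM
from transformers.testing_utils import backend_empty_cache, torch_device


@pytest.mark.slow
class TestAcceleratorMemoryHygiene:
    def teardown_method(self):
        # Reclaim accelerator memory between parametrized slow-test runs so they
        # can share a single device without accumulating allocations.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_pattern(self):
        model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        # ... build a trainer around the model and call trainer.train() here ...
        # Drop strong references and empty the device cache once the test finishes.
        release_memory(model)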
+ + +# Slow tests moved from tests/slow/test_grpo_slow.py +@pytest.mark.slow +@require_torch_accelerator +class TestGRPOTrainerSlow(TrlTestCase): + def setup_method(self): + self.train_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + self.eval_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="test") + self.max_length = 128 + + def teardown_method(self): + gc.collect() + backend_empty_cache(torch_device) + gc.collect() + + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_liger_kernel + def test_training_with_liger_grpo_loss(self, model_name): + training_args = GRPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=3, + num_generations=3, + use_liger_loss=True, + max_completion_length=self.max_length, + report_to="none", + logging_strategy="no", + loss_type="bnpo", # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620 + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + trainer = GRPOTrainer( + model=model, + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + processing_class=tokenizer, + ) + from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss + + assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss) + + previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()} + + trainer.train() + + for n, param in previous_trainable_params.items(): + new_param = model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
+ + release_memory(model, trainer) + + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_liger_kernel + @require_peft + def test_training_with_liger_grpo_loss_and_peft(self, model_name): + from peft import LoraConfig, TaskType + + training_args = GRPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=3, + num_generations=3, + use_liger_loss=True, + max_completion_length=self.max_length, + report_to="none", + logging_strategy="no", + loss_type="bnpo", # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620 + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token + + # Configure PEFT with LoRA + peft_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=False, + r=8, + lora_alpha=32, + lora_dropout=0.1, + target_modules=["q_proj", "v_proj"], + ) + + trainer = GRPOTrainer( + model=model, + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + processing_class=tokenizer, + peft_config=peft_config, + ) + from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss + + assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss) + + # Verify PEFT adapter is properly initialized + from peft import PeftModel + + assert isinstance(trainer.model, PeftModel), "Model should be wrapped with PEFT" + + # Store adapter weights before training + previous_trainable_params = { + n: param.clone() for n, param in trainer.model.named_parameters() if param.requires_grad + } + assert len(previous_trainable_params) > 0, "No trainable parameters found in PEFT model" + + trainer.train() + + # Verify adapter weights have changed after training + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + + release_memory(model, trainer) + + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_training_with_transformers_paged(self, model_name): + """Test that training works with transformers paged implementation (requires GPU).""" + if Version(transformers.__version__) < Version("4.57.0"): + pytest.xfail("Upstream bug in transformers (GH#40692). 
Fix merged; awaiting release >= 4.57.0") + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, # increase the learning rate to speed up the test + per_device_train_batch_size=3, # reduce the batch size to reduce memory usage + num_generations=3, # reduce the number of generations to reduce memory usage + max_completion_length=8, # reduce the completion length to reduce memory usage + use_transformers_paged=True, # Enable transformers paged implementation + report_to="none", + logging_strategy="no", + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + + trainer = GRPOTrainer( + model=model, + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=self.train_dataset, + ) + + previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + + # Check that the params have changed + for n, param in previous_trainable_params.items(): + new_param = model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + + release_memory(model, trainer) + + @pytest.mark.parametrize( + "model_name", + [ + "HuggingFaceTB/SmolVLM-Instruct", # Only test the smaller model to avoid OOM + ], + ) + @require_flash_attn + @require_bitsandbytes + @require_peft + def test_vlm_training(self, model_name): + """ + Test VLM training with aggressive memory optimization. + + This test uses multiple memory reduction techniques: + - 4-bit quantization with double quantization + - LoRA with very low rank (r=4) + - Minimal batch size (1) with gradient accumulation + - Small images (64x64 instead of 224x224) + - Short sequences (max_completion_length=8) + - Only 4 training samples + - Only 1 training step + - Gradient checkpointing and bfloat16 + """ + + # Create processor once outside the data generator + processor = AutoProcessor.from_pretrained(model_name, use_fast=True, padding_side="left") + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is in the image?"}, + ], + }, + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + def data_gen(num_samples): + for _ in range(num_samples): + yield { + "prompt": prompt, + "image": np.random.uniform(low=0.0, high=255.0, size=(64, 64, 3)).astype( + np.uint8 + ), # Much smaller images + } + + dataset = Dataset.from_generator( + data_gen, gen_kwargs={"num_samples": 4}, features=Features(image=Image(), prompt=Value(dtype="string")) + ) + # reduce memory requirements as much as possible + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype="bfloat16", + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_storage="bfloat16", + ) + model = AutoModelForImageTextToText.from_pretrained( + model_name, + attn_implementation="flash_attention_2", + dtype="bfloat16", + device_map=get_kbit_device_map(), + quantization_config=quantization_config, + ) + + def reward_func(prompts, completions, **kwargs): + # simple nonsensical reward + return [-((len(c) - 25) ** 2) + 100 for c in completions] + + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, + per_device_train_batch_size=1, # Minimal batch size + gradient_accumulation_steps=2, # Maintain effective batch size + num_generations=2, + max_completion_length=8, # Much shorter completions + max_prompt_length=None, # Don't limit prompt 
length for VLM + bf16=True, # Use bfloat16 precision + max_steps=1, # Only do 1 training step to save time and memory + report_to="none", + logging_strategy="no", + ) + lora_config = LoraConfig( + task_type="CAUSAL_LM", + r=4, # Much lower rank for minimal memory + lora_alpha=8, # Reduced alpha proportionally + lora_dropout=0.1, + target_modules=["q_proj", "v_proj"], # Minimal target modules + # For VLM models, we typically want to freeze the vision encoder + # and only adapt the language model parameters + modules_to_save=None, + ) + + try: + trainer = GRPOTrainer( + model=model, + processing_class=processor, + reward_funcs=[reward_func], + args=training_args, + train_dataset=dataset, + peft_config=lora_config, + ) + + assert isinstance(trainer.model, PeftModel) + + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + + # Check that LoRA parameters have changed + # For VLM models, we're more permissive about which parameters can change + lora_params_changed = False + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + if "lora" in n.lower(): # LoRA parameters should change + if not torch.equal(param, new_param): + lora_params_changed = True + + # At least some LoRA parameters should have changed during training + assert lora_params_changed, "No LoRA parameters were updated during training." + + except torch.OutOfMemoryError as e: + pytest.skip(f"Skipping VLM training test due to insufficient GPU memory: {e}") + except Exception as e: + # Check for other memory-related errors + if any(keyword in str(e).lower() for keyword in ["memory", "cuda", "out of memory", "insufficient"]): + pytest.skip(f"Skipping VLM training test due to hardware constraints: {e}") + else: + raise + + release_memory(model, trainer) + + @require_vllm + @require_bitsandbytes + @require_peft + def test_vlm_processor_vllm_colocate_mode(self): + """ + Test that VLM processors work with vLLM in colocate mode. 
+ + This test uses multiple memory optimization techniques to ensure it runs on limited hardware: + - LoRA (Low-Rank Adaptation) with minimal rank (r=4) + - 4-bit quantization with BitsAndBytesConfig + - Gradient checkpointing + - bfloat16 precision + - Minimal batch sizes and sequence lengths + - Very low GPU memory utilization (5%) + """ + dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + + config = GRPOConfig( + output_dir=self.tmp_dir, + per_device_train_batch_size=1, # Minimal batch size + gradient_accumulation_steps=2, # Make effective batch size 2, divisible by num_generations + num_generations=2, + max_completion_length=4, # Very short completions to reduce memory + max_prompt_length=32, # Very short prompts to reduce memory + use_vllm=True, # Enable vLLM + vllm_mode="colocate", # Use colocate mode to avoid server dependency + vllm_gpu_memory_utilization=0.05, # Use minimal GPU memory (5%) + gradient_checkpointing=True, # Enable gradient checkpointing to save memory + bf16=True, # Use bfloat16 to reduce memory + report_to="none", + logging_strategy="no", + ) + + # Create a VLM processor + processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct", use_fast=True, padding_side="left") + + # Verify processor has both required attributes for VLM detection + assert hasattr(processor, "tokenizer") + assert hasattr(processor, "image_processor") + + def dummy_reward_func(completions, **kwargs): + return [1.0] * len(completions) + + # Use LoRA configuration for memory efficiency + lora_config = LoraConfig( + r=4, # Very low rank for minimal memory + lora_alpha=8, + target_modules=["q_proj", "v_proj"], # Minimal target modules + lora_dropout=0.1, + bias="none", + task_type="CAUSAL_LM", + ) + + # Use 4-bit quantization for further memory reduction + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + ) + + original_env = {} + required_env_vars = { + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + "LOCAL_WORLD_SIZE": "1", + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12355", + } + + for key, value in required_env_vars.items(): + original_env[key] = os.environ.get(key) + os.environ[key] = value + + try: + # Test VLM processor with vLLM colocate mode + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + try: + # Load model with quantization for memory efficiency + model = AutoModelForCausalLM.from_pretrained( + "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", + quantization_config=quantization_config, + dtype=torch.bfloat16, + ) + + trainer = GRPOTrainer( + model=model, + reward_funcs=dummy_reward_func, + args=config, + train_dataset=dataset, + processing_class=processor, # VLM processor + peft_config=lora_config, # Use LoRA for memory efficiency + ) + + # Should detect VLM processor correctly and allow vLLM + assert trainer.use_vllm, "vLLM should be enabled for VLM processors in colocate mode" + assert trainer.vllm_mode == "colocate", "Should use colocate mode" + + # Check if signature columns were set properly + if trainer._signature_columns is not None: + # Should include 'image' in signature columns for VLM processors + assert "image" in trainer._signature_columns, ( + "Should include 'image' in signature columns for VLM" + ) + + # Should not emit any warnings about VLM incompatibility + incompatibility_warnings = [ + str(w_item.message) + for w_item in w + if "does not 
support VLMs" in str(w_item.message) + or "not compatible" in str(w_item.message).lower() + ] + assert len(incompatibility_warnings) == 0, ( + f"Should not emit VLM incompatibility warnings, but got: {incompatibility_warnings}" + ) + + # Test passes if we get this far without exceptions + + except Exception as e: + # If vLLM fails to initialize due to hardware constraints or other issues, that's expected + if any( + keyword in str(e).lower() + for keyword in [ + "outofmemoryerror", + "cuda", + "memory", + "insufficient", + "no such device", + "free memory", + "gpu memory utilization", + "decrease gpu memory", + ] + ): + pytest.skip(f"Skipping vLLM colocate test due to hardware constraints: {e}") + elif "KeyError" in str(e) and "RANK" in str(e): + pytest.skip(f"Skipping vLLM colocate test due to environment setup issues: {e}") + elif "ValueError" in str(e) and "memory" in str(e).lower(): + pytest.skip(f"Skipping vLLM colocate test due to memory constraints: {e}") + else: + raise + finally: + # Restore original environment variables + for key, original_value in original_env.items(): + if original_value is None: + os.environ.pop(key, None) + else: + os.environ[key] = original_value + + release_memory(model, trainer) + + @require_vllm + def test_training_vllm(self): + """Test that training works with vLLM for generation.""" + dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, # increase the learning rate to speed up the test + per_device_train_batch_size=3, # reduce the batch size to reduce memory usage + num_generations=3, # reduce the number of generations to reduce memory usage + max_completion_length=8, # reduce the completion length to reduce memory usage + report_to="none", + logging_strategy="no", + use_vllm=True, + ) + + try: + trainer = GRPOTrainer( + model="Qwen/Qwen2.5-0.5B-Instruct", # tiny models are too small for vLLM + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=dataset, + ) + + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + + # Check that the params have changed + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + + except Exception as e: + # If vLLM fails to initialize due to hardware constraints or other issues, that's expected + if any( + keyword in str(e).lower() + for keyword in [ + "outofmemoryerror", + "cuda", + "memory", + "insufficient", + "no such device", + "free memory", + "gpu memory utilization", + "decrease gpu memory", + ] + ): + pytest.skip(f"Skipping vLLM training test due to hardware constraints: {e}") + elif "KeyError" in str(e) and "RANK" in str(e): + pytest.skip(f"Skipping vLLM training test due to environment setup issues: {e}") + elif "ValueError" in str(e) and "memory" in str(e).lower(): + pytest.skip(f"Skipping vLLM training test due to memory constraints: {e}") + else: + raise + + release_memory(trainer.model, trainer) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index c8c22b93987..3ac03d15e63 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -12,20 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import pathlib from unittest.mock import MagicMock import pytest import torch import transformers +from accelerate.utils.memory import release_memory from datasets import load_dataset from packaging.version import parse as parse_version -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from transformers.testing_utils import backend_empty_cache, torch_device from transformers.utils import is_peft_available from trl import SFTConfig, SFTTrainer from trl.trainer.sft_trainer import DataCollatorForLanguageModeling, dft_loss +from .slow.testing_constants import DEVICE_MAP_OPTIONS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST, PACKING_OPTIONS from .testing_utils import ( TrlTestCase, ignore_warnings, @@ -33,6 +37,8 @@ require_flash_attn, require_liger_kernel, require_peft, + require_torch_accelerator, + require_torch_multi_accelerator, require_vision, ) @@ -1723,3 +1729,432 @@ def test_prompt_tuning_peft_model(self): assert not torch.allclose(param, new_param), f"Parameter {n} has not changed" else: raise ValueError(f"Unexpected parameter {n} in model: {trainer.model}") + + +# Slow tests moved from tests/slow/test_sft_slow.py +@pytest.mark.slow +@require_torch_accelerator +@require_peft +class TestSFTTrainerSlow(TrlTestCase): + def setup_method(self): + self.train_dataset = load_dataset("stanfordnlp/imdb", split="train[:10%]") + self.eval_dataset = load_dataset("stanfordnlp/imdb", split="test[:10%]") + self.max_length = 128 + self.peft_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=8, + bias="none", + task_type="CAUSAL_LM", + ) + + def teardown_method(self): + gc.collect() + backend_empty_cache(torch_device) + gc.collect() + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_sft_trainer_str(self, model_name, packing): + """ + Simply tests if passing a simple str to `SFTTrainer` loads and runs the trainer as expected. + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + ) + + trainer = SFTTrainer( + model_name, + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_sft_trainer_transformers(self, model_name, packing): + """ + Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected. + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_peft + def test_sft_trainer_peft(self, model_name, packing): + """ + Simply tests if passing a transformers model + peft config to `SFTTrainer` loads and runs the trainer as + expected. 
+ """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + fp16=True, + packing=packing, + max_length=self.max_length, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_sft_trainer_transformers_mp(self, model_name, packing): + """ + Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed + precision. + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + fp16=True, # this is sufficient to enable amp + packing=packing, + max_length=self.max_length, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_checkpointing_kwargs): + """ + Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed + precision + different scenarios of gradient_checkpointing. + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + fp16=True, # this is sufficient to enable amp + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_peft + def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient_checkpointing_kwargs): + """ + Simply tests if passing a transformers model + PEFT to `SFTTrainer` loads and runs the trainer as expected in + mixed precision + different scenarios of gradient_checkpointing. 
+ """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + fp16=True, # this is sufficient to enable amp + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("device_map", DEVICE_MAP_OPTIONS) + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_torch_multi_accelerator + def test_sft_trainer_transformers_mp_gc_device_map( + self, model_name, packing, gradient_checkpointing_kwargs, device_map + ): + """ + Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed + precision + different scenarios of gradient_checkpointing (single, multi-gpu, etc). + """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + fp16=True, # this is sufficient to enable amp + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + + model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_peft + @require_bitsandbytes + def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gradient_checkpointing_kwargs): + """ + Simply tests if passing a transformers model + PEFT + bnb to `SFTTrainer` loads and runs the trainer as + expected in mixed precision + different scenarios of gradient_checkpointing. 
+ """ + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + packing=packing, + max_length=self.max_length, + fp16=True, # this is sufficient to enable amp + gradient_checkpointing=True, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + + quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) + + model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_peft + @require_bitsandbytes + def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): + """ + Simply tests if using setup_chat_format with a transformers model + peft + bnb config to `SFTTrainer` loads and + runs the trainer as expected. + """ + train_dataset = load_dataset("trl-internal-testing/dolly-chatml-sft", split="train") + + training_args = SFTConfig( + packing=packing, + max_length=self.max_length, + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=10, + fp16=True, + ) + + quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) + + model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trainer = SFTTrainer( + model, + args=training_args, + processing_class=tokenizer, + train_dataset=train_dataset, + peft_config=self.peft_config, + ) + + assert isinstance(trainer.model, PeftModel) + + trainer.train() + + release_memory(model, trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_liger_kernel + def test_sft_trainer_with_liger(self, model_name, packing): + """ + Tests if passing use_liger=True to SFTConfig loads and runs the trainer with AutoLigerKernelForCausalLM as + expected. 
+ """ + import importlib + + def cleanup_liger_patches(trainer): + """Clean up liger_kernel patches by reloading the model's specific module""" + try: + # Get the specific module that was used by the trainer's model + module_path = trainer.model.__module__ + reload_module = importlib.import_module(module_path) + importlib.reload(reload_module) + except Exception: + pass # Continue if reload fails + + training_args = SFTConfig( + output_dir=self.tmp_dir, + logging_strategy="no", + report_to="none", + per_device_train_batch_size=2, + max_steps=2, + packing=packing, + max_length=self.max_length, + use_liger_kernel=True, + ) + + trainer = SFTTrainer( + model_name, + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + + # Ensure cleanup of liger patches after the test + try: + trainer.train() + release_memory(trainer.model, trainer) + finally: + cleanup_liger_patches(trainer) + + @pytest.mark.parametrize("packing", PACKING_OPTIONS) + @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @require_torch_accelerator + def test_train_offloading(self, model_name, packing): + """Test that activation offloading works with SFTTrainer.""" + # Initialize the trainer + training_args = SFTConfig( + output_dir=self.tmp_dir, + activation_offloading=True, + report_to="none", + per_device_train_batch_size=2, + max_steps=2, + packing=packing, + max_length=self.max_length, + ) + trainer = SFTTrainer( + model=model_name, args=training_args, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset + ) + + # Save the initial parameters to compare them later + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + # Train the model + trainer.train() + + # Check that the training loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + # Check the params have changed + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + assert not torch.allclose(param, new_param), f"Parameter {n} has not changed" + + release_memory(trainer.model, trainer) From 73411d28b8a44fb5981fd73e9d048bdd363c8b4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 31 Oct 2025 01:23:53 +0000 Subject: [PATCH 2/3] drop testing contants --- .../test_grpo_with_replay_buffer_trainer.py | 14 ++ tests/slow/__init__.py | 13 -- tests/slow/testing_constants.py | 26 ---- tests/test_dpo_trainer.py | 53 ++++--- tests/test_grpo_trainer.py | 25 +++- tests/test_sft_trainer.py | 129 ++++++++++++++---- 6 files changed, 169 insertions(+), 91 deletions(-) delete mode 100644 tests/slow/__init__.py delete mode 100644 tests/slow/testing_constants.py diff --git a/tests/experimental/test_grpo_with_replay_buffer_trainer.py b/tests/experimental/test_grpo_with_replay_buffer_trainer.py index cad66f8034c..26d8ab7e7ba 100644 --- a/tests/experimental/test_grpo_with_replay_buffer_trainer.py +++ b/tests/experimental/test_grpo_with_replay_buffer_trainer.py @@ -1,3 +1,17 @@ +# Copyright 2020-2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import torch from datasets import load_dataset diff --git a/tests/slow/__init__.py b/tests/slow/__init__.py deleted file mode 100644 index a3170185781..00000000000 --- a/tests/slow/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/slow/testing_constants.py b/tests/slow/testing_constants.py deleted file mode 100644 index 1dc30320c7f..00000000000 --- a/tests/slow/testing_constants.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2020-2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -MODELS_TO_TEST = [ - "trl-internal-testing/tiny-LlamaForCausalLM-3.2", - "trl-internal-testing/tiny-MistralForCausalLM-0.2", -] - -# We could have also not declared these variables but let's be verbose -PACKING_OPTIONS = [True, False] -GRADIENT_CHECKPOINTING_KWARGS = [None, {"use_reentrant": False}, {"use_reentrant": True}] -DEVICE_MAP_OPTIONS = [{"": 0}, "auto"] - -DPO_LOSS_TYPES = ["sigmoid", "ipo"] -DPO_PRECOMPUTE_LOGITS = [True, False] diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py index 212ba644591..d729ffc09cd 100644 --- a/tests/test_dpo_trainer.py +++ b/tests/test_dpo_trainer.py @@ -37,12 +37,6 @@ from trl import DPOConfig, DPOTrainer, FDivergenceType -from .slow.testing_constants import ( - DPO_LOSS_TYPES, - DPO_PRECOMPUTE_LOGITS, - GRADIENT_CHECKPOINTING_KWARGS, - MODELS_TO_TEST, -) from .testing_utils import ( TrlTestCase, require_bitsandbytes, @@ -1542,9 +1536,6 @@ def test_f_divergence_type(self, f_divergence_type, as_string: bool): assert configparser_dict["f_divergence_type"] == f_divergence_type.value -# Slow tests moved from tests/slow/test_dpo_slow.py - - @pytest.mark.slow @require_torch_accelerator @require_peft @@ -1565,9 +1556,15 @@ def teardown_method(self): backend_empty_cache(torch_device) gc.collect() - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @pytest.mark.parametrize("pre_compute_logits", [True, False]) + @pytest.mark.parametrize("loss_type", ["sigmoid", "ipo"]) + @pytest.mark.parametrize( + "model_id", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits): """ A test that tests the simple usage of `DPOTrainer` using a bare model in full precision. 
@@ -1611,10 +1608,18 @@ def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits): release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("pre_compute_logits", [True, False]) + @pytest.mark.parametrize("loss_type", ["sigmoid", "ipo"]) + @pytest.mark.parametrize( + "model_id", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft def test_dpo_peft_model(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): """ @@ -1667,10 +1672,18 @@ def test_dpo_peft_model(self, model_id, loss_type, pre_compute_logits, gradient_ release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("pre_compute_logits", DPO_PRECOMPUTE_LOGITS) - @pytest.mark.parametrize("loss_type", DPO_LOSS_TYPES) - @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("pre_compute_logits", [True, False]) + @pytest.mark.parametrize("loss_type", ["sigmoid", "ipo"]) + @pytest.mark.parametrize( + "model_id", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_bitsandbytes @require_peft def test_dpo_peft_model_qlora(self, model_id, loss_type, pre_compute_logits, gradient_checkpointing_kwargs): diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index 0cb24d60a50..a5a39d86642 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -39,7 +39,6 @@ from trl.experimental.gspo_token import GRPOTrainer as GSPOTokenTrainer from trl.trainer.utils import get_kbit_device_map -from .slow.testing_constants import MODELS_TO_TEST from .testing_utils import ( TrlTestCase, require_bitsandbytes, @@ -1787,7 +1786,13 @@ def teardown_method(self): backend_empty_cache(torch_device) gc.collect() - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_liger_kernel def test_training_with_liger_grpo_loss(self, model_name): training_args = GRPOConfig( @@ -1827,7 +1832,13 @@ def test_training_with_liger_grpo_loss(self, model_name): release_memory(model, trainer) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_liger_kernel @require_peft def test_training_with_liger_grpo_loss_and_peft(self, model_name): @@ -1891,7 +1902,13 @@ def test_training_with_liger_grpo_loss_and_peft(self, model_name): release_memory(model, trainer) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def 
test_training_with_transformers_paged(self, model_name): """Test that training works with transformers paged implementation (requires GPU).""" if Version(transformers.__version__) < Version("4.57.0"): diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index dcced1b4f99..d1b03bd06d1 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -29,7 +29,6 @@ from trl import SFTConfig, SFTTrainer from trl.trainer.sft_trainer import DataCollatorForLanguageModeling, dft_loss -from .slow.testing_constants import DEVICE_MAP_OPTIONS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST, PACKING_OPTIONS from .testing_utils import ( TrlTestCase, ignore_warnings, @@ -1755,8 +1754,14 @@ def teardown_method(self): backend_empty_cache(torch_device) gc.collect() - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_sft_trainer_str(self, model_name, packing): """ Simply tests if passing a simple str to `SFTTrainer` loads and runs the trainer as expected. @@ -1780,8 +1785,14 @@ def test_sft_trainer_str(self, model_name, packing): trainer.train() - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_sft_trainer_transformers(self, model_name, packing): """ Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected. 
@@ -1811,8 +1822,14 @@ def test_sft_trainer_transformers(self, model_name, packing): release_memory(model, trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft def test_sft_trainer_peft(self, model_name, packing): """ @@ -1848,8 +1865,14 @@ def test_sft_trainer_peft(self, model_name, packing): release_memory(model, trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_sft_trainer_transformers_mp(self, model_name, packing): """ Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed @@ -1881,9 +1904,17 @@ def test_sft_trainer_transformers_mp(self, model_name, packing): release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_checkpointing_kwargs): """ Simply tests if passing a transformers model to `SFTTrainer` loads and runs the trainer as expected in mixed @@ -1917,9 +1948,17 @@ def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_chec release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient_checkpointing_kwargs): """ @@ -1957,10 +1996,18 @@ def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient release_memory(model, trainer) - @pytest.mark.parametrize("device_map", DEVICE_MAP_OPTIONS) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("device_map", [{"": 0}, "auto"]) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + 
"trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_torch_multi_accelerator def test_sft_trainer_transformers_mp_gc_device_map( self, model_name, packing, gradient_checkpointing_kwargs, device_map @@ -1997,9 +2044,17 @@ def test_sft_trainer_transformers_mp_gc_device_map( release_memory(model, trainer) - @pytest.mark.parametrize("gradient_checkpointing_kwargs", GRADIENT_CHECKPOINTING_KWARGS) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize( + "gradient_checkpointing_kwargs", [None, {"use_reentrant": False}, {"use_reentrant": True}] + ) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft @require_bitsandbytes def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gradient_checkpointing_kwargs): @@ -2040,8 +2095,14 @@ def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gr release_memory(model, trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_peft @require_bitsandbytes def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): @@ -2081,8 +2142,14 @@ def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): release_memory(model, trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_liger_kernel def test_sft_trainer_with_liger(self, model_name, packing): """ @@ -2126,8 +2193,14 @@ def cleanup_liger_patches(trainer): finally: cleanup_liger_patches(trainer) - @pytest.mark.parametrize("packing", PACKING_OPTIONS) - @pytest.mark.parametrize("model_name", MODELS_TO_TEST) + @pytest.mark.parametrize("packing", [True, False]) + @pytest.mark.parametrize( + "model_name", + [ + "trl-internal-testing/tiny-LlamaForCausalLM-3.2", + "trl-internal-testing/tiny-MistralForCausalLM-0.2", + ], + ) @require_torch_accelerator def test_train_offloading(self, model_name, packing): """Test that activation offloading works with SFTTrainer.""" From 8705710814df3be2437c9f1012ce0349f8d1c636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 31 Oct 2025 01:24:44 +0000 Subject: [PATCH 3/3] remove comments --- tests/test_grpo_trainer.py | 1 - tests/test_sft_trainer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index a5a39d86642..7e42dbb7a6e 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -1772,7 +1772,6 @@ def test_training(self): assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
-# Slow tests moved from tests/slow/test_grpo_slow.py @pytest.mark.slow @require_torch_accelerator class TestGRPOTrainerSlow(TrlTestCase): diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index d1b03bd06d1..7fe7da9dd07 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1732,7 +1732,6 @@ def test_prompt_tuning_peft_model(self): raise ValueError(f"Unexpected parameter {n} in model: {trainer.model}") -# Slow tests moved from tests/slow/test_sft_slow.py @pytest.mark.slow @require_torch_accelerator @require_peft
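With the tests/slow/ package deleted, the slow cases are distinguished from the fast ones only by the @pytest.mark.slow marker on the Test*Slow classes added above. How that marker is registered and selected is outside this patch series; purely as an illustrative sketch (the conftest.py hooks and the --runslow flag below are hypothetical, not part of TRL), a root conftest.py could keep slow-marked tests deselected by default and expose an opt-in flag:

# conftest.py -- hypothetical sketch, not part of this patch series.
# Skips any test carrying the "slow" marker unless --runslow is passed.
import pytest


def pytest_addoption(parser):
    # Opt-in flag: slow tests stay skipped in a default run.
    parser.addoption("--runslow", action="store_true", default=False, help="run tests marked as slow")


def pytest_configure(config):
    # Register the marker so pytest does not warn about an unknown mark.
    config.addinivalue_line("markers", "slow: slow integration tests, skipped unless --runslow is given")


def pytest_collection_modifyitems(config, items):
    if config.getoption("--runslow"):
        return  # --runslow given: leave slow tests selected
    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
    for item in items:
        if "slow" in item.keywords:
            item.add_marker(skip_slow)

Under this sketch, a run such as `pytest --runslow -m slow tests/test_sft_trainer.py` would exercise only the relocated slow tests, while a plain `pytest tests/` collects them but reports them as skipped.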