From 3caf3d2e42f0225d9a8951f63168e4a46b48864f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Jan 2024 18:25:38 +1100 Subject: [PATCH 1/5] Update sft_trainer.mdx --- docs/source/sft_trainer.mdx | 39 ++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/docs/source/sft_trainer.mdx b/docs/source/sft_trainer.mdx index fcad71ebb6..477fa637ce 100644 --- a/docs/source/sft_trainer.mdx +++ b/docs/source/sft_trainer.mdx @@ -413,44 +413,48 @@ Note however, that the amount of performance gain is _dataset dependent_ and in ### Accelerate fine-tuning 2x using `unsloth` -You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) and even full-finetuning (1.1x faster) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama as well) and Mistral architectures. -First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth#installation-instructions---conda). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLlamaModel` or `FastMistralModel` as follows: +You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. 
Some benchmarks on 1x A100 listed below: + +| 1 A100 40GB | Dataset | 🤗 Hugging Face | 🤗 + Flash Attention 2 | 🦥 Unsloth | 🦥 VRAM reduction | +|-----------------|-----------|------------------|------------------------|-----------------|-------------------| +| Code Llama 34b | Slim Orca | 1x | 1.01x | **1.94x** | -22.7% | +| Llama-2 7b | Slim Orca | 1x | 0.96x | **1.87x** | -39.3% | +| Mistral 7b | Slim Orca | 1x | 1.17x | **1.88x** | -65.9% | +| Tiny Llama 1.1b | Alpaca | 1x | 1.55x | **2.74x** | -57.8% | + +First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows: ```python import torch - from transformers import TrainingArguments from trl import SFTTrainer -from unsloth import FastLlamaModel, FastMistralModel +from unsloth import FastLanguageModel -max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number. -dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+ -load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False. +max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number -# Load Llama model -model, tokenizer = FastLlamaModel.from_pretrained( - model_name = "unsloth/llama-2-7b", # Supports any llama model eg meta-llama/Llama-2-7b-hf +# Load model +model, tokenizer = FastLanguageModel.from_pretrained( + model_name = "unsloth/mistral-7b", max_seq_length = max_seq_length, - dtype = dtype, - load_in_4bit = load_in_4bit, + dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+ + load_in_4bit = True, # Use 4bit quantization to reduce memory usage. 
Can be False # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf ) # Do model patching and add fast LoRA weights -model = FastLlamaModel.get_peft_model( +model = FastLanguageModel.get_peft_model( model, r = 16, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",], lora_alpha = 16, - lora_dropout = 0, # Currently only supports dropout = 0 - bias = "none", # Currently only supports bias = "none" + lora_dropout = 0, # Dropout = 0 is currently optimized + bias = "none", # Bias = "none" is currently optimized use_gradient_checkpointing = True, random_state = 3407, - max_seq_length = max_seq_length, ) -args = TrainingArguments(output_dir="./output") +args = TrainingArguments(output_dir = "./output") trainer = SFTTrainer( model = model, @@ -459,7 +463,6 @@ trainer = SFTTrainer( dataset_text_field = "text", max_seq_length = max_seq_length, ) - trainer.train() ``` From 58e83156dbfd9f71a02870e9394a3f97f99dec76 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Jan 2024 18:31:44 +1100 Subject: [PATCH 2/5] Update sft_trainer.mdx --- docs/source/sft_trainer.mdx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/sft_trainer.mdx b/docs/source/sft_trainer.mdx index 477fa637ce..f2ace18b8c 100644 --- a/docs/source/sft_trainer.mdx +++ b/docs/source/sft_trainer.mdx @@ -415,12 +415,12 @@ Note however, that the amount of performance gain is _dataset dependent_ and in You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. 
Some benchmarks on 1x A100 listed below: -| 1 A100 40GB | Dataset | 🤗 Hugging Face | 🤗 + Flash Attention 2 | 🦥 Unsloth | 🦥 VRAM reduction | -|-----------------|-----------|------------------|------------------------|-----------------|-------------------| -| Code Llama 34b | Slim Orca | 1x | 1.01x | **1.94x** | -22.7% | -| Llama-2 7b | Slim Orca | 1x | 0.96x | **1.87x** | -39.3% | -| Mistral 7b | Slim Orca | 1x | 1.17x | **1.88x** | -65.9% | -| Tiny Llama 1.1b | Alpaca | 1x | 1.55x | **2.74x** | -57.8% | +| 1 A100 40GB | Dataset | 🤗 | 🤗 + Flash Attention | 🦥 Unsloth | 🦥 VRAM saved | +|-----------------|-----------|-----|----------------------|-----------------|----------------| +| Code Llama 34b | Slim Orca | 1x | 1.01x | **1.94x** | -22.7% | +| Llama-2 7b | Slim Orca | 1x | 0.96x | **1.87x** | -39.3% | +| Mistral 7b | Slim Orca | 1x | 1.17x | **1.88x** | -65.9% | +| Tiny Llama 1.1b | Alpaca | 1x | 1.55x | **2.74x** | -57.8% | First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). 
Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows: From e87738eb1048fc51cf4d2aa1cfd84e8b6d03208b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Jan 2024 18:39:16 +1100 Subject: [PATCH 3/5] Update dpo_trainer.mdx --- docs/source/dpo_trainer.mdx | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx index 422361c826..a53f508cba 100644 --- a/docs/source/dpo_trainer.mdx +++ b/docs/source/dpo_trainer.mdx @@ -103,41 +103,43 @@ While training and evaluating we record the following reward metrics: ## Accelerate DPO fine-tuning using `unsloth` -You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) and even full-finetuning (1.1x faster) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is compatible with `DPOTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama as well) and Mistral architectures. -First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth#installation-instructions---conda). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLlamaModel` or `FastMistralModel` as follows: +You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `DPOTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. 
Some benchmarks for DPO listed below: + +| GPU | Model | Dataset | 🤗 | 🤗 + Flash Attention | 🦥 Unsloth | 🦥 VRAM saved | +|----------|-----------------|-----------|------|----------------------|-----------------|----------------| +| A100 40G | Zephyr 7b | Ultra Chat| 1x | 1.24x | **1.88x** | -11.6% | +| Tesla T4 | Zephyr 7b | Ultra Chat| 1x | 1.09x | **1.55x** | -18.6% | + +First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows: ```python import torch - from transformers import TrainingArguments from trl import DPOTrainer -from unsloth import FastLlamaModel, FastMistralModel +from unsloth import FastLanguageModel max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number. -dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+ -load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False. -# Load Llama model -model, tokenizer = FastLlamaModel.from_pretrained( - model_name = "unsloth/llama-2-7b", # Supports any llama model eg meta-llama/Llama-2-7b-hf +# Load model +model, tokenizer = FastLanguageModel.from_pretrained( + model_name = "unsloth/zephyr-sft", max_seq_length = max_seq_length, - dtype = dtype, - load_in_4bit = load_in_4bit, + dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+ + load_in_4bit = True, # Use 4bit quantization to reduce memory usage. Can be False. 
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf ) # Do model patching and add fast LoRA weights -model = FastLlamaModel.get_peft_model( +model = FastLanguageModel.get_peft_model( model, r = 16, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",], lora_alpha = 16, - lora_dropout = 0, # Currently only supports dropout = 0 - bias = "none", # Currently only supports bias = "none" + lora_dropout = 0, # Dropout = 0 is currently optimized + bias = "none", # Bias = "none" is currently optimized use_gradient_checkpointing = True, random_state = 3407, - max_seq_length = max_seq_length, ) args = TrainingArguments(output_dir="./output") @@ -150,7 +152,6 @@ dpo_trainer = DPOTrainer( train_dataset=train_dataset, tokenizer=tokenizer, ) - dpo_trainer.train() ``` From 6d2e0cdf718b32f01ea244f162a082a26e2a1ff6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Jan 2024 19:01:43 +1100 Subject: [PATCH 4/5] Update dpo_trainer.mdx --- docs/source/dpo_trainer.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx index a53f508cba..30ac6447d8 100644 --- a/docs/source/dpo_trainer.mdx +++ b/docs/source/dpo_trainer.mdx @@ -105,10 +105,10 @@ While training and evaluating we record the following reward metrics: You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `DPOTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. 
Some benchmarks for DPO listed below: -| GPU | Model | Dataset | 🤗 | 🤗 + Flash Attention | 🦥 Unsloth | 🦥 VRAM saved | -|----------|-----------------|-----------|------|----------------------|-----------------|----------------| -| A100 40G | Zephyr 7b | Ultra Chat| 1x | 1.24x | **1.88x** | -11.6% | -| Tesla T4 | Zephyr 7b | Ultra Chat| 1x | 1.09x | **1.55x** | -18.6% | +| GPU | Model | Dataset | 🤗 | 🤗 + Flash Attention 2 | 🦥 Unsloth | 🦥 VRAM saved | +|----------|-----------------|-----------|------|------------------------|-----------------|----------------| +| A100 40G | Zephyr 7b | Ultra Chat| 1x | 1.24x | **1.88x** | -11.6% | +| Tesla T4 | Zephyr 7b | Ultra Chat| 1x | 1.09x | **1.55x** | -18.6% | First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows: From ab1c99c8c8908da56c5051f6c1a27169ce614b00 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Jan 2024 19:02:17 +1100 Subject: [PATCH 5/5] Update sft_trainer.mdx --- docs/source/sft_trainer.mdx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/sft_trainer.mdx b/docs/source/sft_trainer.mdx index f2ace18b8c..0f9e145019 100644 --- a/docs/source/sft_trainer.mdx +++ b/docs/source/sft_trainer.mdx @@ -415,12 +415,12 @@ Note however, that the amount of performance gain is _dataset dependent_ and in You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. 
Some benchmarks on 1x A100 listed below: -| 1 A100 40GB | Dataset | 🤗 | 🤗 + Flash Attention | 🦥 Unsloth | 🦥 VRAM saved | -|-----------------|-----------|-----|----------------------|-----------------|----------------| -| Code Llama 34b | Slim Orca | 1x | 1.01x | **1.94x** | -22.7% | -| Llama-2 7b | Slim Orca | 1x | 0.96x | **1.87x** | -39.3% | -| Mistral 7b | Slim Orca | 1x | 1.17x | **1.88x** | -65.9% | -| Tiny Llama 1.1b | Alpaca | 1x | 1.55x | **2.74x** | -57.8% | +| 1 A100 40GB | Dataset | 🤗 | 🤗 + Flash Attention 2 | 🦥 Unsloth | 🦥 VRAM saved | +|-----------------|-----------|-----|-------------------------|-----------------|----------------| +| Code Llama 34b | Slim Orca | 1x | 1.01x | **1.94x** | -22.7% | +| Llama-2 7b | Slim Orca | 1x | 0.96x | **1.87x** | -39.3% | +| Mistral 7b | Slim Orca | 1x | 1.17x | **1.88x** | -65.9% | +| Tiny Llama 1.1b | Alpaca | 1x | 1.55x | **2.74x** | -57.8% | First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows: