From 3caf3d2e42f0225d9a8951f63168e4a46b48864f Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 10 Jan 2024 18:25:38 +1100
Subject: [PATCH 1/5] Update sft_trainer.mdx

---
 docs/source/sft_trainer.mdx | 39 ++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/docs/source/sft_trainer.mdx b/docs/source/sft_trainer.mdx
index fcad71ebb6..477fa637ce 100644
--- a/docs/source/sft_trainer.mdx
+++ b/docs/source/sft_trainer.mdx
@@ -413,44 +413,48 @@ Note however, that the amount of performance gain is _dataset dependent_ and in
 
 ### Accelerate fine-tuning 2x using `unsloth`
 
-You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) and even full-finetuning (1.1x faster) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama as well) and Mistral architectures.
-First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth#installation-instructions---conda). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLlamaModel` or `FastMistralModel` as follows:
+You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks on 1x A100 listed below:
+
+| 1 A100 40GB     | Dataset   | 🤗 Hugging Face | 🤗 + Flash Attention 2 | 🦥 Unsloth     | 🦥 VRAM reduction |
+|-----------------|-----------|------------------|------------------------|-----------------|-------------------|
+| Code Llama 34b  | Slim Orca | 1x               | 1.01x                  | **1.94x**       | -22.7%            |
+| Llama-2 7b      | Slim Orca | 1x               | 0.96x                  | **1.87x**       | -39.3%            |
+| Mistral 7b      | Slim Orca | 1x               | 1.17x                  | **1.88x**       | -65.9%            |
+| Tiny Llama 1.1b | Alpaca    | 1x               | 1.55x                  | **2.74x**       | -57.8%            |
+
+First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:
 
 ```python
 import torch
-
 from transformers import TrainingArguments
 from trl import SFTTrainer
-from unsloth import FastLlamaModel, FastMistralModel
+from unsloth import FastLanguageModel
 
-max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number.
-dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number
 
-# Load Llama model
-model, tokenizer = FastLlamaModel.from_pretrained(
-    model_name = "unsloth/llama-2-7b", # Supports any llama model eg meta-llama/Llama-2-7b-hf
+# Load model
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/mistral-7b",
     max_seq_length = max_seq_length,
-    dtype = dtype,
-    load_in_4bit = load_in_4bit,
+    dtype = None, # None for auto detection. Float16 for Tesla T4 / V100; Bfloat16 for Ampere+
+    load_in_4bit = True, # Use 4bit quantization to reduce memory usage. Can be False
     # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
 )
 
 # Do model patching and add fast LoRA weights
-model = FastLlamaModel.get_peft_model(
+model = FastLanguageModel.get_peft_model(
     model,
     r = 16,
     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                       "gate_proj", "up_proj", "down_proj",],
     lora_alpha = 16,
-    lora_dropout = 0, # Currently only supports dropout = 0
-    bias = "none",    # Currently only supports bias = "none"
+    lora_dropout = 0, # Dropout = 0 is currently optimized
+    bias = "none",    # Bias = "none" is currently optimized
     use_gradient_checkpointing = True,
     random_state = 3407,
-    max_seq_length = max_seq_length,
 )
 
-args = TrainingArguments(output_dir="./output")
+args = TrainingArguments(output_dir = "./output")
 
 trainer = SFTTrainer(
     model = model,
@@ -459,7 +463,6 @@ trainer = SFTTrainer(
     dataset_text_field = "text",
     max_seq_length = max_seq_length,
 )
-
 trainer.train()
 ```
 

From 58e83156dbfd9f71a02870e9394a3f97f99dec76 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 10 Jan 2024 18:31:44 +1100
Subject: [PATCH 2/5] Update sft_trainer.mdx

---
 docs/source/sft_trainer.mdx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/sft_trainer.mdx b/docs/source/sft_trainer.mdx
index 477fa637ce..f2ace18b8c 100644
--- a/docs/source/sft_trainer.mdx
+++ b/docs/source/sft_trainer.mdx
@@ -415,12 +415,12 @@ Note however, that the amount of performance gain is _dataset dependent_ and in
 
 You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks on 1x A100 listed below:
 
-| 1 A100 40GB     | Dataset   | 🤗 Hugging Face | 🤗 + Flash Attention 2 | 🦥 Unsloth     | 🦥 VRAM reduction |
-|-----------------|-----------|------------------|------------------------|-----------------|-------------------|
-| Code Llama 34b  | Slim Orca | 1x               | 1.01x                  | **1.94x**       | -22.7%            |
-| Llama-2 7b      | Slim Orca | 1x               | 0.96x                  | **1.87x**       | -39.3%            |
-| Mistral 7b      | Slim Orca | 1x               | 1.17x                  | **1.88x**       | -65.9%            |
-| Tiny Llama 1.1b | Alpaca    | 1x               | 1.55x                  | **2.74x**       | -57.8%            |
+| 1 A100 40GB     | Dataset   | 🤗  | 🤗 + Flash Attention | 🦥 Unsloth     | 🦥 VRAM saved |
+|-----------------|-----------|-----|----------------------|-----------------|----------------|
+| Code Llama 34b  | Slim Orca | 1x  | 1.01x                | **1.94x**       | -22.7%         |
+| Llama-2 7b      | Slim Orca | 1x  | 0.96x                | **1.87x**       | -39.3%         |
+| Mistral 7b      | Slim Orca | 1x  | 1.17x                | **1.88x**       | -65.9%         |
+| Tiny Llama 1.1b | Alpaca    | 1x  | 1.55x                | **2.74x**       | -57.8%         |
 
 First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:
 

From e87738eb1048fc51cf4d2aa1cfd84e8b6d03208b Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 10 Jan 2024 18:39:16 +1100
Subject: [PATCH 3/5] Update dpo_trainer.mdx

---
 docs/source/dpo_trainer.mdx | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx
index 422361c826..a53f508cba 100644
--- a/docs/source/dpo_trainer.mdx
+++ b/docs/source/dpo_trainer.mdx
@@ -103,41 +103,43 @@ While training and evaluating we record the following reward metrics:
 
 ## Accelerate DPO fine-tuning using `unsloth`
 
-You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) and even full-finetuning (1.1x faster) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is compatible with `DPOTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama as well) and Mistral architectures.
-First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth#installation-instructions---conda). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLlamaModel` or `FastMistralModel` as follows:
+You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `DPOTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks for DPO are listed below:
+
+|  GPU     | Model           | Dataset   | 🤗  | 🤗 + Flash Attention | 🦥 Unsloth     | 🦥 VRAM saved  |
+|----------|-----------------|-----------|------|----------------------|-----------------|----------------|
+| A100 40G | Zephyr 7b       | Ultra Chat| 1x   | 1.24x                | **1.88x**       | -11.6%         |
+| Tesla T4 | Zephyr 7b       | Ultra Chat| 1x   | 1.09x                | **1.55x**       | -18.6%         |
+
+First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:
 
 ```python
 import torch
-
 from transformers import TrainingArguments
 from trl import DPOTrainer
-from unsloth import FastLlamaModel, FastMistralModel
+from unsloth import FastLanguageModel
 
 max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number.
-dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
 
-# Load Llama model
-model, tokenizer = FastLlamaModel.from_pretrained(
-    model_name = "unsloth/llama-2-7b", # Supports any llama model eg meta-llama/Llama-2-7b-hf
+# Load model
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/zephyr-sft",
     max_seq_length = max_seq_length,
-    dtype = dtype,
-    load_in_4bit = load_in_4bit,
+    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+    load_in_4bit = True, # Use 4bit quantization to reduce memory usage. Can be False.
     # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
 )
 
 # Do model patching and add fast LoRA weights
-model = FastLlamaModel.get_peft_model(
+model = FastLanguageModel.get_peft_model(
     model,
     r = 16,
     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                       "gate_proj", "up_proj", "down_proj",],
     lora_alpha = 16,
-    lora_dropout = 0, # Currently only supports dropout = 0
-    bias = "none",    # Currently only supports bias = "none"
+    lora_dropout = 0, # Dropout = 0 is currently optimized
+    bias = "none",    # Bias = "none" is currently optimized
     use_gradient_checkpointing = True,
     random_state = 3407,
-    max_seq_length = max_seq_length,
 )
 
 args = TrainingArguments(output_dir="./output")
@@ -150,7 +152,6 @@ dpo_trainer = DPOTrainer(
     train_dataset=train_dataset,
     tokenizer=tokenizer,
 )
-
 dpo_trainer.train()
 ```
 

From 6d2e0cdf718b32f01ea244f162a082a26e2a1ff6 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 10 Jan 2024 19:01:43 +1100
Subject: [PATCH 4/5] Update dpo_trainer.mdx

---
 docs/source/dpo_trainer.mdx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx
index a53f508cba..30ac6447d8 100644
--- a/docs/source/dpo_trainer.mdx
+++ b/docs/source/dpo_trainer.mdx
@@ -105,10 +105,10 @@ While training and evaluating we record the following reward metrics:
 
 You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `DPOTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks for DPO are listed below:
 
-|  GPU     | Model           | Dataset   | 🤗  | 🤗 + Flash Attention | 🦥 Unsloth     | 🦥 VRAM saved  |
-|----------|-----------------|-----------|------|----------------------|-----------------|----------------|
-| A100 40G | Zephyr 7b       | Ultra Chat| 1x   | 1.24x                | **1.88x**       | -11.6%         |
-| Tesla T4 | Zephyr 7b       | Ultra Chat| 1x   | 1.09x                | **1.55x**       | -18.6%         |
+|  GPU     | Model           | Dataset   | 🤗  | 🤗 + Flash Attention 2 | 🦥 Unsloth     | 🦥 VRAM saved  |
+|----------|-----------------|-----------|------|------------------------|-----------------|----------------|
+| A100 40G | Zephyr 7b       | Ultra Chat| 1x   | 1.24x                  | **1.88x**       | -11.6%         |
+| Tesla T4 | Zephyr 7b       | Ultra Chat| 1x   | 1.09x                  | **1.55x**       | -18.6%         |
 
 First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:
 

From ab1c99c8c8908da56c5051f6c1a27169ce614b00 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 10 Jan 2024 19:02:17 +1100
Subject: [PATCH 5/5] Update sft_trainer.mdx

---
 docs/source/sft_trainer.mdx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/sft_trainer.mdx b/docs/source/sft_trainer.mdx
index f2ace18b8c..0f9e145019 100644
--- a/docs/source/sft_trainer.mdx
+++ b/docs/source/sft_trainer.mdx
@@ -415,12 +415,12 @@ Note however, that the amount of performance gain is _dataset dependent_ and in
 
 You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks on 1x A100 listed below:
 
-| 1 A100 40GB     | Dataset   | 🤗  | 🤗 + Flash Attention | 🦥 Unsloth     | 🦥 VRAM saved |
-|-----------------|-----------|-----|----------------------|-----------------|----------------|
-| Code Llama 34b  | Slim Orca | 1x  | 1.01x                | **1.94x**       | -22.7%         |
-| Llama-2 7b      | Slim Orca | 1x  | 0.96x                | **1.87x**       | -39.3%         |
-| Mistral 7b      | Slim Orca | 1x  | 1.17x                | **1.88x**       | -65.9%         |
-| Tiny Llama 1.1b | Alpaca    | 1x  | 1.55x                | **2.74x**       | -57.8%         |
+| 1 A100 40GB     | Dataset   | 🤗  | 🤗 + Flash Attention 2 | 🦥 Unsloth     | 🦥 VRAM saved |
+|-----------------|-----------|-----|-------------------------|-----------------|----------------|
+| Code Llama 34b  | Slim Orca | 1x  | 1.01x                   | **1.94x**       | -22.7%         |
+| Llama-2 7b      | Slim Orca | 1x  | 0.96x                   | **1.87x**       | -39.3%         |
+| Mistral 7b      | Slim Orca | 1x  | 1.17x                   | **1.88x**       | -65.9%         |
+| Tiny Llama 1.1b | Alpaca    | 1x  | 1.55x                   | **2.74x**       | -57.8%         |
 
 First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows: