From b58107601ffff6fc27cf709f8ba0468e2ec4b248 Mon Sep 17 00:00:00 2001
From: janEbert
-✅ Scratch implementations ✅ flash attention ✅ fp4/8/16/32 ✅ LoRA, QLoRA, Adapter
-✅ No abstractions ✅ FSDP ✅ 1-1000+ GPUs/TPUs ✅ 20+ LLMs
+✅ From scratch implementations ✅ No abstractions ✅ Beginner friendly
+✅ flash attention ✅ fp4/8/16/32 ✅ LoRA, QLoRA, Adapter
+✅ FSDP ✅ 1-1000+ GPUs/TPUs ✅ 20+ LLMs
@@ -27,7 +28,7 @@ Uses the latest state-of-the-art techniques:
Deploy •
Evaluate •
Features •
- Training recipes (YAML) •
+ Recipes (YAML) •
Tutorials
+Some of the state of the art features supported: + ✅ From scratch implementations ✅ No abstractions ✅ Beginner friendly ✅ flash attention ✅ fp4/8/16/32 ✅ LoRA, QLoRA, Adapter ✅ FSDP ✅ 1-1000+ GPUs/TPUs ✅ 20+ LLMs From d02d042abff8e4e4e82a47ce8d220fd4732ed889 Mon Sep 17 00:00:00 2001 From: William FalconDate: Wed, 26 Jun 2024 13:24:37 -0400 Subject: [PATCH 05/21] Update README.md --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9b8e302966..05954d943a 100644 --- a/README.md +++ b/README.md @@ -43,16 +43,16 @@ Some of the state of the art features supported: # Use, finetune, pretrain, deploy LLMs Lightning fast ⚡⚡ -Easily work with 20+ LLMs. Every LLM is **implemented from scratch** with **no abstractions** and **full control**. +**Easily work with 20+ LLMs:** Every LLM is implemented from scratch with **no abstractions** and **full control**, making them blazing fast, minimal, and performant at an enterprise scale. -As a result, LitGPT large-language-models (LLMs) are blazing fast, minimal, and performant at enterprise scale. +**Key features**: -✅ Apache 2.0 compliance to enable unlimited enterprise use. -✅ Easy debugging/hacking with no abstraction layers and single file implementations. -✅ Optimized model architectures to maximize performance, reduce costs, and speed up training. -✅ Highly-optimized [recipe configs](#training-recipes) we have tested at enterprise scale. +✅ **Enterprise ready -** Apache 2.0 for unlimited enterprise use. +✅ **Developer friendly -** Easy debugging with no abstraction layers and single file implementations. +✅ **Optimized performance -** Architectures designed to maximize performance, reduce costs, and speed up training. +✅ **Proven recipes -** Highly-optimized training/finetuning recipes tested at enterprise scale. -Use the Python API to load and use any model. Use the command-line interface to run advanced workflows to [finetune](#finetune-an-llm), [pretrain](#pretrain-an-llm), [evaluate](#use-an-llm), and [deploy](#deploy-an-llm) [20+ LLMs](#choose-from-20-llms) **on your own data**. Workflows feature highly-optimized [training recipes](#training-recipes) for the world's most powerful open-source large language models (LLMs). +Use the Python API to load any model, or the command-line interface for advanced workflows to [finetune](#finetune-an-llm), [pretrain](#pretrain-an-llm), [evaluate](#use-an-llm), and [deploy](#deploy-an-llm) LLMs on your own data. Our workflows feature highly-optimized [training recipes](#training-recipes) for the world's most powerful open-source large language models. From 0b48b4ffe8e6ff942226179752844ab539c6ab01 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 26 Jun 2024 13:25:12 -0400 Subject: [PATCH 06/21] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 05954d943a..9c096df97d 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Some of the state of the art features supported: ✅ **Enterprise ready -** Apache 2.0 for unlimited enterprise use. ✅ **Developer friendly -** Easy debugging with no abstraction layers and single file implementations. -✅ **Optimized performance -** Architectures designed to maximize performance, reduce costs, and speed up training. +✅ **Optimized performance -** Models designed to maximize performance, reduce costs, and speed up training. ✅ **Proven recipes -** Highly-optimized training/finetuning recipes tested at enterprise scale. 
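The Python API mentioned just below comes down to a short load-and-generate flow. A minimal sketch, assuming the `litgpt.LLM` interface behind the quick-start example excerpted later in this diff (only its final `print(text)` line and output appear there); the prompt string is illustrative:

```python
from litgpt import LLM

# Download (if needed) and load a checkpoint by its Hugging Face identifier
llm = LLM.load("microsoft/phi-2")

# Generate a completion for a prompt (prompt text is illustrative)
text = llm.generate("Fix the grammar: Every summer, the family enjoy a vacation to the mountains.")
print(text)
# e.g. "Corrected Sentence: Every summer, the family enjoys a vacation to the mountains."
```

The same checkpoints are reachable from the command line via `litgpt chat`, `litgpt finetune`, and `litgpt serve`.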
Use the Python API to load any model, or the command-line interface for advanced workflows to [finetune](#finetune-an-llm), [pretrain](#pretrain-an-llm), [evaluate](#use-an-llm), and [deploy](#deploy-an-llm) LLMs on your own data. Our workflows feature highly-optimized [training recipes](#training-recipes) for the world's most powerful open-source large language models. From d511f9317d98106095bcf6130ea17ae159c7d8e6 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 26 Jun 2024 14:34:55 -0400 Subject: [PATCH 07/21] Shorten the workflow commands --- README.md | 133 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 9c096df97d..f6e8ae9c91 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,6 @@ print(text) # Corrected Sentence: Every summer, the family enjoys a vacation to the mountains. ``` -[Explore the full Python API docs](tutorials/python-api.md). - - - - ✅ Optimized for fast inference ✅ Quantization ✅ Runs on low-memory GPUs @@ -99,10 +94,11 @@ pip install -e '.[all]' ``` ---- +[Explore the full Python API docs](tutorials/python-api.md). +--- # Choose from 20+ LLMs LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials/download_model_weights.md) without layers of abstraction: @@ -156,15 +152,15 @@ LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials ---- - +--- + # Advanced workflows Use the command line interface to run advanced workflows such as pretraining or finetuning on your own data. -## All commands -After installing LitGPT, select the model and action you want to take on that model (finetune, pretrain, evaluate, deploy, etc...): +## All workflows +After installing LitGPT, select the model and workflow to run (finetune, pretrain, evaluate, deploy, etc...): ```bash # ligpt [action] [model] @@ -178,12 +174,20 @@ litgpt serve meta-llama/Meta-Llama-3-8B-Instruct -### Finetune an LLM -[Finetune](tutorials/finetune.md) a model to specialize it on your own custom dataset: +---- + +## Finetune an LLM + - + ++ + + +Finetuning is the process of taking a pretrained AI model and further training it on a smaller, specialized dataset tailored to a specific task or application. + @@ -204,15 +208,26 @@ litgpt finetune microsoft/phi-2 \ litgpt chat out/custom-model/final ``` +[Read the full finetuning docs](tutorials/finetune.md) + -### Pretrain an LLM +---- -[Train an LLM from scratch](tutorials/pretrain.md) on your own data via pretraining: +## Pretrain an LLM +- + ++ + + +Pretraining is the process of teaching an AI model by exposing it to a large amount of data before it is fine-tuned for specific tasks. + +++ +[Read the full pretraining docs](tutorials/pretrain.md) -### Continue pretraining an LLM +---- -[Continued pretraining](tutorials/pretrain.md#continued-pretraining-on-custom-data) is another way of finetuning that specializes an already pretrained model by training on custom data: +## Continue pretraining an LLM +Show code:
@@ -233,19 +248,34 @@ litgpt pretrain EleutherAI/pythia-160m \ --train.max_tokens 10_000_000 \ --out_dir out/custom-model -# 3) Chat with the model +# 3) Test the model litgpt chat out/custom-model/final ``` ++ + + ++ + + +Continued pretraining is another way of finetuning that specializes an already pretrained model by training on custom data: + +++ +[Read the full continued pretraining docs](tutorials/pretrain.md#continued-pretraining-on-custom-data) + -### Evaluate an LLM +---- -If you want to [evaluate](tutorials/evaluation.md) a downloaded, finetuned, or pretrained LLM on popular benchmark tasks, such as MMLU and Truthful QA, run the following command: +## Evaluate an LLM +Evaluate an LLM to test its performance on various tasks to see how well it understands and generates text. Simply put, we can evaluate things like how well would it do in college-level chemistry, coding, etc... (MMLU, Truthful QA, etc...) + +Show code:
- + @@ -267,30 +297,50 @@ litgpt pretrain EleutherAI/pythia-160m \ --train.max_tokens 10_000_000 \ --out_dir out/custom-model -# 3) Chat with the model +# 3) Test the model litgpt chat out/custom-model/final ``` +++ [Read the full evaluation docs](tutorials/evaluation.md). -### Deploy an LLM -Once you're ready to [deploy](tutorials/deploy.md) a finetuned LLM, run this command: +---- + +## Deploy an LLM +Show code:
```bash litgpt evaluate microsoft/phi-2 --tasks 'truthfulqa_mc2,mmlu' ``` +- + ++ + + +Deploy a pretrained or finetune LLM to use it in real-world applications. Deploy, automatically sets up a web server that can be accessed by a website or app. + ++[Read the full deploy docs](tutorials/deploy.md). @@ -320,14 +371,26 @@ print(response.json()["output"]) ---- -### Use an LLM for inference - -Use LLMs for [inference](tutorials/deploy.md) to test its chatting capabilities, run evaluations, or extract embeddings, etc. -Here's an example showing how to use the Phi-2 LLM. +## Test an LLM +Show code:
@@ -313,6 +363,7 @@ response = requests.post( ) print(response.json()["output"]) ``` +- + ++ + + +Test how well the model works via an interactive chat. Use the `chat` command to chat, extract embeddings, etc... + +++ +Here's an example showing how to use the Phi-2 LLM. + +Show code:
+++ +[Read the full chat docs](tutorials/inference.md). + +---- + # State-of-the-art features ✅ State-of-the-art optimizations: Flash Attention v2, multi-GPU support via fully-sharded data parallelism, [optional CPU offloading](tutorials/oom.md#do-sharding-across-multiple-gpus), and [TPU and XLA support](extensions/xla). From 7e9f7dcdf4f981472f13631581bea18c31b7c2c1 Mon Sep 17 00:00:00 2001 From: William FalconShow code:
@@ -345,11 +408,15 @@ litgpt chat microsoft/phi-2 ``` The download of certain models requires an additional access token. You can read more about this in the [download](tutorials/download_model_weights.md#specific-models-and-access-tokens) documentation. -For more information on the different inference options, refer to the [inference](tutorials/inference.md) tutorial. ----- +Date: Wed, 26 Jun 2024 14:40:14 -0400 Subject: [PATCH 08/21] Workflows menu --- README.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index f6e8ae9c91..00b454bcc7 100644 --- a/README.md +++ b/README.md @@ -157,8 +157,21 @@ LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials --- # Advanced workflows + + + Finetune • + Pretrain • + Continued pretraining • + Evaluate • + Deploy • + Test +
+ + + Use the command line interface to run advanced workflows such as pretraining or finetuning on your own data. + ## All workflows After installing LitGPT, select the model and workflow to run (finetune, pretrain, evaluate, deploy, etc...): @@ -274,10 +287,6 @@ Continued pretraining is another way of finetuning that specializes an already pShow code:
- - - - ```bash @@ -383,16 +392,12 @@ print(response.json()["output"]) Test how well the model works via an interactive chat. Use the `chat` command to chat, extract embeddings, etc... --- -Here's an example showing how to use the Phi-2 LLM.Show code:
-Show code:
+Here's an example showing how to use the Phi-2 LLM. ```bash # 1) List all available models in litgpt From 4b479046fa7bb24813c44b72bc0c61886e38911c Mon Sep 17 00:00:00 2001 From: William FalconDate: Wed, 26 Jun 2024 14:49:06 -0400 Subject: [PATCH 09/21] Simplify featured projects --- README.md | 83 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 00b454bcc7..ddab6a1086 100644 --- a/README.md +++ b/README.md @@ -620,67 +620,77 @@ litgpt finetune \ -# Community - -We welcome all individual contributors, regardless of their level of experience or hardware. Your contributions are valuable, and we are excited to see what you can accomplish in this collaborative and supportive environment. +---- -- [Request a feature](https://github.com/Lightning-AI/litgpt/issues) -- [Submit your first contribution](https://lightning.ai/pages/community/tutorial/how-to-contribute-to-litgpt/) -- [Join our Discord](https://discord.gg/VptPCZkGNa) +# Project highlights - +LitGPT powers many great AI projects, initiatives, challenges and of course enterprises. Please submit a pull request to be considered for a feature. -# Tutorials + ++📊 SAMBA: Simple Hybrid State Space Models for Efficient Unlimited Context Language Modeling
-🚀 [Get started](tutorials/0_to_litgpt.md) -⚡️ [Finetuning, incl. LoRA, QLoRA, and Adapters](tutorials/finetune.md) -🤖 [Pretraining](tutorials/pretrain.md) -💬 [Model evaluation](tutorials/evaluation.md) -📘 [Supported and custom datasets](tutorials/prepare_dataset.md) -🧹 [Quantization](tutorials/quantize.md) -🤯 [Tips for dealing with out-of-memory (OOM) errors](tutorials/oom.md) -🧑🏽💻 [Using cloud TPUs](extensions/xla) +The [Samba](https://github.com/microsoft/Samba) project by researchers at Microsoft is built on top of the LitGPT code base and combines state space models with sliding window attention, which outperforms pure state space models. - ++- +🏆 NeurIPS 2023 Large Language Model Efficiency Challenge: 1 LLM + 1 GPU + 1 Day
-## Projects using LitGPT +The LitGPT repository was the official starter kit for the [NeurIPS 2023 LLM Efficiency Challenge](https://llm-efficiency-challenge.github.io), which is a competition focused on finetuning an existing non-instruction tuned LLM for 24 hours on a single GPU. -Check out the projects below that use and build on LitGPT. If you have a project you'd like to add to this section, please don't hesitate to open a pull request. ++-**🏆 NeurIPS 2023 Large Language Model Efficiency Challenge: 1 LLM + 1 GPU + 1 Day** +🦙 TinyLlama: An Open-Source Small Language Model
-📊 **SAMBA: Simple Hybrid State Space Models for Efficient Unlimited Context Language Modeling** -The [Samba](https://github.com/microsoft/Samba) project by researchers at Microsoft is built on top of the LitGPT code base and combines state space models with sliding window attention, which outperforms pure state space models. +LitGPT powered the [TinyLlama project](https://github.com/jzhang38/TinyLlama) and [TinyLlama: An Open-Source Small Language Model](https://arxiv.org/abs/2401.02385) research paper. - ++- +🍪 MicroLlama: MicroLlama-300M
-The LitGPT repository was the official starter kit for the [NeurIPS 2023 LLM Efficiency Challenge](https://llm-efficiency-challenge.github.io), which is a competition focused on finetuning an existing non-instruction tuned LLM for 24 hours on a single GPU. +[MicroLlama](https://github.com/keeeeenw/MicroLlama) is a 300M Llama model pretrained on 50B tokens powered by TinyLlama and LitGPT. ++-**🍪 MicroLlama: MicroLlama-300M** +---- + +# Community -[MicroLlama](https://github.com/keeeeenw/MicroLlama) is a 300M Llama model pretrained on 50B tokens powered by TinyLlama and LitGPT. +We welcome all individual contributors, regardless of their level of experience or hardware. Your contributions are valuable, and we are excited to see what you can accomplish in this collaborative and supportive environment. + +- [Request a feature](https://github.com/Lightning-AI/litgpt/issues) +- [Submit your first contribution](https://lightning.ai/pages/community/tutorial/how-to-contribute-to-litgpt/) +- [Join our Discord](https://discord.gg/VptPCZkGNa) -**🔬 Pre-training Small Base LMs with Fewer Tokens** +# Tutorials -The research paper ["Pre-training Small Base LMs with Fewer Tokens"](https://arxiv.org/abs/2404.08634), which utilizes LitGPT, develops smaller base language models by inheriting a few transformer blocks from larger models and training on a tiny fraction of the data used by the larger models. It demonstrates that these smaller models can perform comparably to larger models despite using significantly less training data and resources. +🚀 [Get started](tutorials/0_to_litgpt.md) +⚡️ [Finetuning, incl. LoRA, QLoRA, and Adapters](tutorials/finetune.md) +🤖 [Pretraining](tutorials/pretrain.md) +💬 [Model evaluation](tutorials/evaluation.md) +📘 [Supported and custom datasets](tutorials/prepare_dataset.md) +🧹 [Quantization](tutorials/quantize.md) +🤯 [Tips for dealing with out-of-memory (OOM) errors](tutorials/oom.md) +🧑🏽💻 [Using cloud TPUs](extensions/xla) -## Acknowledgements +---- + +### Acknowledgements This implementation extends on [Lit-LLaMA](https://github.com/lightning-AI/lit-llama) and [nanoGPT](https://github.com/karpathy/nanoGPT), and it's **powered by [Lightning Fabric](https://lightning.ai/docs/fabric/stable/) ⚡**. @@ -690,14 +700,11 @@ This implementation extends on [Lit-LLaMA](https://github.com/lightning-AI/lit-l - [@Microsoft](https://github.com/microsoft) for [LoRA](https://github.com/microsoft/LoRA) - [@tridao](https://github.com/tridao) for [Flash Attention 2](https://github.com/Dao-AILab/flash-attention) - - -## License +### License LitGPT is released under the [Apache 2.0](https://github.com/Lightning-AI/litgpt/blob/main/LICENSE) license. - -## Citation +### Citation If you use LitGPT in your research, please cite the following work: From c032d88867cddb5894f23aef45e435921ad0a65e Mon Sep 17 00:00:00 2001 From: William Falcon🔬 Pre-training Small Base LMs with Fewer Tokens
-**🦙 TinyLlama: An Open-Source Small Language Model** +The research paper ["Pre-training Small Base LMs with Fewer Tokens"](https://arxiv.org/abs/2404.08634), which utilizes LitGPT, develops smaller base language models by inheriting a few transformer blocks from larger models and training on a tiny fraction of the data used by the larger models. It demonstrates that these smaller models can perform comparably to larger models despite using significantly less training data and resources. -LitGPT powered the [TinyLlama project](https://github.com/jzhang38/TinyLlama) and [TinyLlama: An Open-Source Small Language Model](https://arxiv.org/abs/2401.02385) research paper. +Date: Wed, 26 Jun 2024 14:55:22 -0400 Subject: [PATCH 10/21] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ddab6a1086..6c87da8be9 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,9 @@ Some of the state of the art features supported: # Use, finetune, pretrain, deploy LLMs Lightning fast ⚡⚡ -**Easily work with 20+ LLMs:** Every LLM is implemented from scratch with **no abstractions** and **full control**, making them blazing fast, minimal, and performant at an enterprise scale. +Easily work with 20+ LLMs. + +Every LLM is implemented from scratch with **no abstractions** and **full control**, making them blazing fast, minimal, and performant at enterprise scale. **Key features**: @@ -72,19 +74,17 @@ print(text) # Corrected Sentence: Every summer, the family enjoys a vacation to the mountains. ``` + + ✅ Optimized for fast inference ✅ Quantization ✅ Runs on low-memory GPUs ✅ No layers of internal abstractions ✅ Optimized for production scale - - -[Read the full pretraining docs](tutorials/pretrain.md) +[Read the full evaluation docs](tutorials/evaluation.md). ---- -## Continue pretraining an LLM +## Test an LLMAdvanced install options
- - Install from source: ```bash @@ -100,7 +100,7 @@ pip install -e '.[all]' --- # Choose from 20+ LLMs -LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials/download_model_weights.md) without layers of abstraction: +Every model is written from scratch to maximize performance and remove layers of abstraction: | Model | Model size | Author | Reference | |----|----|----|----| From f88cfa9edb3e74a95ec15bc3ec56199d93d52269 Mon Sep 17 00:00:00 2001 From: Mike JensenDate: Thu, 27 Jun 2024 09:18:00 -0600 Subject: [PATCH 11/21] Tutorial `convert_from_litgpt` doc fix to remove `output_dir` command argument (#1533) Co-authored-by: Sebastian Raschka --- tutorials/0_to_litgpt.md | 3 +-- tutorials/convert_lit_models.md | 8 +++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tutorials/0_to_litgpt.md b/tutorials/0_to_litgpt.md index e75e188bbb..82ba0efae0 100644 --- a/tutorials/0_to_litgpt.md +++ b/tutorials/0_to_litgpt.md @@ -516,8 +516,7 @@ Output: Example input. Sometimes, it can be useful to convert LitGPT model weights for third-party and external tools. For example, we can convert a LitGPT model to the Hugging Face format and save it via `.safetensors` files, which we can do as follows: ```bash -litgpt convert_from_litgpt microsoft/phi-2 \ - --output_dir out/converted_model/ +litgpt convert_from_litgpt microsoft/phi-2 out/converted_model/ ``` Certain tools like the `.from_pretrained` method in Hugging Face `transformers` also require the original `config.json` file that originally came with the downloaded model: diff --git a/tutorials/convert_lit_models.md b/tutorials/convert_lit_models.md index 9001db496a..53b24c2bdb 100644 --- a/tutorials/convert_lit_models.md +++ b/tutorials/convert_lit_models.md @@ -4,9 +4,8 @@ LitGPT weights need to be converted to a format that Hugging Face understands wi We provide a helpful command to convert models LitGPT models back to their equivalent Hugging Face Transformers format: -```sh -litgpt convert_from_litgpt checkpoint_dir \ - --output_dir converted_dir +```bash +litgpt convert_from_litgpt checkpoint_dir converted_dir ``` These paths are just placeholders, you will need to customize them based on which finetuning or pretraining command you ran and its configuration. @@ -98,8 +97,7 @@ litgpt merge_lora $finetuned_dir/final 4. 
Convert the finetuning model back into a HF format: ```bash -litgpt convert from_litgpt $finetuned_dir/final/ \ - --output_dir out/hf-tinyllama/converted \ +litgpt convert_from_litgpt $finetuned_dir/final/ out/hf-tinyllama/converted ``` From d3eeb0274030b14e7d4ad65540ba0bd3ff1cbee7 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Thu, 27 Jun 2024 16:14:07 -0500 Subject: [PATCH 12/21] Test against eager attention on GPTNeoX (#1537) --- tests/test_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_model.py b/tests/test_model.py index 1cad36a8db..1e54bff75d 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -84,6 +84,7 @@ def test_against_gpt_neox_model(rotary_pct, batch_size, n_embd, parallel_residua rotary_pct=ours_config.rotary_percentage, vocab_size=ours_config.padded_vocab_size, use_parallel_residual=ours_config.parallel_residual, + attn_implementation="eager", ) state_dict = {} From 2f2ea8ca44d1fe41bdeb3a06f49da06930272984 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Thu, 27 Jun 2024 16:28:16 -0500 Subject: [PATCH 13/21] Bump to 0.4.3.dev0 version (#1536) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3119a52b14..d8f2f290cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "litgpt" -version = "0.4.2" +version = "0.4.3.dev0" description = "Hackable implementation of state-of-the-art open-source LLMs" authors = [ { name = "Lightning AI", email = "contact@lightning.ai" }, From 32a9c181e54e885d9867aa6baaed6f9482bed6ff Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 28 Jun 2024 19:10:08 -0400 Subject: [PATCH 14/21] Update README.md --- README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/README.md b/README.md index 6c87da8be9..384ac8a1c1 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,6 @@ **20+ high-performance LLM implementations with recipes to pretrain, finetune, deploy at scale.** -Some of the state of the art features supported: - ✅ From scratch implementations ✅ No abstractions ✅ Beginner friendly ✅ flash attention ✅ fp4/8/16/32 ✅ LoRA, QLoRA, Adapter ✅ FSDP ✅ 1-1000+ GPUs/TPUs ✅ 20+ LLMs @@ -43,19 +41,13 @@ Some of the state of the art features supported: # Use, finetune, pretrain, deploy LLMs Lightning fast ⚡⚡ -Easily work with 20+ LLMs. - Every LLM is implemented from scratch with **no abstractions** and **full control**, making them blazing fast, minimal, and performant at enterprise scale. -**Key features**: - ✅ **Enterprise ready -** Apache 2.0 for unlimited enterprise use. ✅ **Developer friendly -** Easy debugging with no abstraction layers and single file implementations. ✅ **Optimized performance -** Models designed to maximize performance, reduce costs, and speed up training. ✅ **Proven recipes -** Highly-optimized training/finetuning recipes tested at enterprise scale. -Use the Python API to load any model, or the command-line interface for advanced workflows to [finetune](#finetune-an-llm), [pretrain](#pretrain-an-llm), [evaluate](#use-an-llm), and [deploy](#deploy-an-llm) LLMs on your own data. Our workflows feature highly-optimized [training recipes](#training-recipes) for the world's most powerful open-source large language models. 
- # Quick start From ceabea78ebe1b66f2fa9b4fdf902cf22cf2ad1b6 Mon Sep 17 00:00:00 2001 From: Sebastian RaschkaDate: Mon, 1 Jul 2024 13:33:06 -0500 Subject: [PATCH 15/21] Fix for LM Eval harness 0.4.3 (#1542) --- litgpt/eval/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litgpt/eval/evaluate.py b/litgpt/eval/evaluate.py index da9604ccc0..f1f5bc8191 100644 --- a/litgpt/eval/evaluate.py +++ b/litgpt/eval/evaluate.py @@ -20,7 +20,7 @@ def prepare_results(results, save_filepath, print_results=True): print(make_table(results, "groups")) json_result = json.dumps( - results, indent=2, ensure_ascii=False + results, indent=2, ensure_ascii=False, default=str ) save_filepath.open("w", encoding="utf-8").write(json_result) From 0663b470ed21024b99c7cd515f98b411d6041349 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Mon, 1 Jul 2024 13:55:54 -0500 Subject: [PATCH 16/21] Add `Phi-3-mini-4k-instruct` checkpoint (#1341) Co-authored-by: Andrei-Aksionov Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> --- README.md | 3 +- litgpt/config.py | 16 +++ litgpt/prompts.py | 9 ++ litgpt/scripts/convert_hf_checkpoint.py | 56 ++++++-- litgpt/scripts/convert_lit_checkpoint.py | 65 ++++++--- litgpt/tokenizer.py | 7 + tests/test_convert_hf_checkpoint.py | 102 +++++++++++++++ tests/test_convert_lit_checkpoint.py | 160 ++++++++++++++++++----- tests/test_model.py | 55 ++++++++ tests/test_prompts.py | 3 +- tests/test_tokenizer.py | 5 +- tutorials/download_model_weights.md | 4 +- 12 files changed, 418 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 384ac8a1c1..c517b7d5e1 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,8 @@ Every model is written from scratch to maximize performance and remove layers of | Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) | | Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) | | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | -| Phi | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) | +| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) | +| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 
2023](https://arxiv.org/abs/2304.01373) | | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) | diff --git a/litgpt/config.py b/litgpt/config.py index 67ee91fa62..423d4f2c26 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -1444,6 +1444,22 @@ def norm_class(self) -> Type: lm_head_bias=True, gelu_approximate="tanh", ), + # https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/config.json + dict( + name="Phi-3-mini-4k-instruct", + hf_config=dict(org="microsoft", name="Phi-3-mini-4k-instruct"), + vocab_size=32000, + padded_vocab_size=32064, + block_size=4096, + n_embd=3072, + n_layer=32, + rotary_percentage=1.0, + bias=False, + norm_class_name="RMSNorm", + intermediate_size=8192, + mlp_class_name="LLaMAMLP", + parallel_residual=False, + ), ] configs.extend(phi) diff --git a/litgpt/prompts.py b/litgpt/prompts.py index e9adc764e2..53eae02c69 100644 --- a/litgpt/prompts.py +++ b/litgpt/prompts.py @@ -309,6 +309,12 @@ def apply(self, prompt: str, **kwargs: str) -> str: return f"Instruct: {prompt}\nOutput:" +class Phi3(PromptStyle): + def apply(self, prompt: str, **kwargs: str) -> str: + return f' <|user|>\n{prompt}<|end|>\n<|assistant|>\n' + + + class TinyLlama(PromptStyle): def apply(self, prompt: str, **kwargs: str) -> str: return ( @@ -352,6 +358,7 @@ def apply(self, prompt: str, **kwargs: str) -> str: "codellama": CodeLlama, "phi-1": Phi1, "phi-2": Phi2, + "phi-3": Phi3, "tinyllama": TinyLlama, "gemma": Gemma, "h2oai": H2Oai, @@ -392,6 +399,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle: return Phi1() if re.search("phi-2", model_name): return Phi2() + if re.search("Phi-3", model_name): + return Phi3() if re.search(r"tiny-llama.*chat", model_name): return TinyLlama() if re.search(r"(Code)?Gemma.*-it", model_name): diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py index edca3f3850..464e0ba157 100644 --- a/litgpt/scripts/convert_hf_checkpoint.py +++ b/litgpt/scripts/convert_hf_checkpoint.py @@ -12,12 +12,7 @@ from lightning.fabric.utilities.load import _NotYetLoadedTensor as NotYetLoadedTensor from litgpt import Config -from litgpt.utils import ( - extend_checkpoint_dir, - lazy_load, - incremental_save, - save_config -) +from litgpt.utils import extend_checkpoint_dir, incremental_save, lazy_load, save_config def copy_weights_gpt_neox( @@ -235,13 +230,36 @@ def copy_weights_phi( "lm_head.bias": "lm_head.bias", } + if config.name.startswith("Phi-3"): + weight_map.update( + { + "model.layers.{}.self_attn.qkv_proj.weight": "transformer.h.{}.attn.attn.weight", + "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight", + 'model.layers.{}.post_attention_layernorm.weight': "transformer.h.{}.norm_2.weight", + "model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.proj.weight", + "model.norm.weight": "transformer.ln_f.weight", + } + ) + for name, param in hf_weights.items(): if name.startswith("model.layers."): from_name, l = layer_template(name, 2) qkv = qkv_weights.setdefault(l, defaultdict(dict)) + if "qkv_proj" in from_name: + weight = load_param(param, f"layer {l} qkv", dtype) + weight = qkv_reassemble(weight, config) + to_name = weight_map[from_name].format(l) + state_dict[to_name] = weight + continue if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")): weight_name, weight_type = from_name.split(".")[-2:] qkv[weight_type][weight_name] = param + elif from_name.endswith("gate_up_proj.weight"): + weight = load_param(param, 
f"layer {l} gate_up_proj", dtype) + fc_1, fc_2 = weight.chunk(2, dim=0) + state_dict[f"transformer.h.{l}.mlp.fc_1.weight"] = fc_1 + state_dict[f"transformer.h.{l}.mlp.fc_2.weight"] = fc_2 + continue to_name = weight_map[from_name] if to_name is None: continue @@ -272,6 +290,24 @@ def copy_weights_phi( del qkv_weights[i][weight_type] +def qkv_reassemble(param: Union[torch.Tensor, NotYetLoadedTensor], config: Config) -> torch.Tensor: + """Reassemble from a normal to an interleaved placement in a QKV matrix. + [Q, Q, ..., K, K, ..., V, V, ...] --> [Q, K, V, Q, K, V, ...] + """ + q, k, v = param.split( + ( + config.n_head * config.head_size, + config.n_query_groups * config.head_size, + config.n_query_groups * config.head_size, + ) + ) + qs = q.split(config.n_head // config.n_query_groups * config.head_size) + ks = k.split(config.head_size) + vs = v.split(config.head_size) + interleaved = [t for group in zip(qs, ks, vs) for t in group] + return torch.cat(interleaved) + + def layer_template(layer_name: str, idx: int) -> Tuple[str, int]: split = layer_name.split(".") number = int(split[idx]) @@ -321,14 +357,14 @@ def convert_hf_checkpoint( if "falcon" in model_name: copy_fn = partial(copy_weights_falcon, model_name) - elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): + elif model_name.lower().startswith("phi"): # holder to reconstitute the split q, k, v qkv_weights = {} - copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) - elif "phi" in model_name: + copy_fn = partial(copy_weights_phi, config, qkv_weights) + elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): # holder to reconstitute the split q, k, v qkv_weights = {} - copy_fn = partial(copy_weights_phi, config, qkv_weights) + copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) else: copy_fn = copy_weights_gpt_neox diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index 2f0bd3538b..c67aaea6c6 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -1,6 +1,7 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
import gc +from collections import defaultdict from functools import partial from pathlib import Path from pprint import pprint @@ -11,11 +12,7 @@ from litgpt import Config from litgpt.scripts.convert_hf_checkpoint import layer_template, load_param -from litgpt.utils import ( - extend_checkpoint_dir, - incremental_save, - lazy_load -) +from litgpt.utils import extend_checkpoint_dir, incremental_save, lazy_load def copy_weights_falcon( @@ -192,24 +189,49 @@ def copy_weights_phi( "lm_head.bias": "lm_head.bias", } + if config.name.startswith("Phi-3"): + weight_map.update( + { + "transformer.h.{}.attn.attn.weight": "model.layers.{}.self_attn.qkv_proj.weight", + "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight", + "transformer.h.{}.norm_2.weight": 'model.layers.{}.post_attention_layernorm.weight', + "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.down_proj.weight", + "transformer.ln_f.weight": "model.norm.weight", + } + ) + gate_up_proj_weights = defaultdict(dict) + + for name, param in lit_weights.items(): if name.endswith((".attn.attn.weight", ".attn.attn.bias")): - from_name, l = layer_template(name, 2) - weight_type = name.split(".")[-1] # weight or bias - q = f"model.layers.{l}.self_attn.q_proj.{weight_type}" - k = f"model.layers.{l}.self_attn.k_proj.{weight_type}" - v = f"model.layers.{l}.self_attn.v_proj.{weight_type}" + from_name, l_idx = layer_template(name, 2) qkv = load_param(param, name, None) qp, kp, vp = qkv_split(qkv, config) - for to_name, param in zip((q, k, v), (qp, kp, vp)): + if config.name.startswith("Phi-3"): + qkv_reassembled = torch.concat([qp, kp, vp], dim=0) + to_name = weight_map[from_name].format(l_idx) if saver is not None: - param = saver.store_early(param) - state_dict[to_name] = param + qkv_reassembled = saver.store_early(qkv_reassembled) + state_dict[to_name] = qkv_reassembled + else: + weight_type = name.split(".")[-1] # weight or bias + q = f"model.layers.{l_idx}.self_attn.q_proj.{weight_type}" + k = f"model.layers.{l_idx}.self_attn.k_proj.{weight_type}" + v = f"model.layers.{l_idx}.self_attn.v_proj.{weight_type}" + for to_name, param in zip((q, k, v), (qp, kp, vp)): + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + elif name.endswith((".fc_1.weight", ".fc_2.weight")): + from_name, l_idx = layer_template(name, 2) + weight = load_param(param, name, None) + weight_name = name.split(".")[-2] + gate_up_proj_weights[l_idx][weight_name] = weight else: if "transformer.h" in name: - from_name, l = layer_template(name, 2) + from_name, l_idx = layer_template(name, 2) to_name = weight_map[from_name] - to_name = to_name.format(l) + to_name = to_name.format(l_idx) else: to_name = weight_map[name] param = load_param(param, name, None) @@ -217,6 +239,15 @@ def copy_weights_phi( param = saver.store_early(param) state_dict[to_name] = param + if config.name.startswith("Phi-3"): + for i in list(gate_up_proj_weights): + fc_1_weight = gate_up_proj_weights[i]["fc_1"] + fc_2_weight = gate_up_proj_weights[i]["fc_2"] + weight = torch.concat([fc_1_weight, fc_2_weight], dim=0) + layer_name = f"model.layers.{i}.mlp.gate_up_proj.weight" + state_dict[layer_name] = weight + del gate_up_proj_weights[i] + def qkv_split( param: Union[torch.Tensor, NotYetLoadedTensor], config: Config @@ -256,11 +287,11 @@ def convert_lit_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: if "falcon" in config.name: copy_fn = partial(copy_weights_falcon, config.name) + elif config.name.lower().startswith("phi"): + copy_fn = 
partial(copy_weights_phi, config) elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): untie_weights = "Gemma" in config.name copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) - elif "phi" in config.name: - copy_fn = partial(copy_weights_phi, config) else: copy_fn = copy_weights_gpt_neox diff --git a/litgpt/tokenizer.py b/litgpt/tokenizer.py index 9144173fbc..fbee6818fe 100644 --- a/litgpt/tokenizer.py +++ b/litgpt/tokenizer.py @@ -13,6 +13,7 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None: if not checkpoint_dir.exists(): raise NotADirectoryError(f"The checkpoint directory does not exist: {str(checkpoint_dir)}") + self.model_name = checkpoint_dir.stem self.use_bos = self.check_if_bos_token_used(checkpoint_dir) self.bos_id = None self.eos_id = None @@ -114,4 +115,10 @@ def encode( def decode(self, tensor: torch.Tensor) -> str: tokens = [tensor.item()] if tensor.ndim == 0 else tensor.tolist() + # Phi-3 tokenizer strips any spaces if to decode a single token at a time. + # https://github.com/huggingface/transformers/issues/31643 + if self.model_name.startswith("Phi-3") and len(tokens) == 1: + dummy_token_id = 33 # \x1e + dummy_token = self.processor.decode([dummy_token_id]) + return self.processor.decode([dummy_token_id] + tokens).replace(dummy_token, "") return self.processor.decode(tokens) diff --git a/tests/test_convert_hf_checkpoint.py b/tests/test_convert_hf_checkpoint.py index a83d1fe8a7..08749e521d 100644 --- a/tests/test_convert_hf_checkpoint.py +++ b/tests/test_convert_hf_checkpoint.py @@ -116,3 +116,105 @@ def test_convert_hf_checkpoint(tmp_path): # ensure that the config dict can be loaded config = Config.from_file(tmp_path / "model_config.yaml") assert isinstance(config, Config) + + +def test_qkv_reassemble(): + from litgpt import Config + from litgpt.scripts.convert_hf_checkpoint import qkv_reassemble + + # MHA + config = Config(n_embd=4, n_head=4) + qkv = torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [16, 17, 18, 19], # key + [20, 21, 22, 23], # key + [24, 25, 26, 27], # key + [28, 29, 30, 31], # key + [32, 33, 34, 35], # value + [36, 37, 38, 39], # value + [40, 41, 42, 43], # value + [44, 45, 46, 47], # value + ] + ) + qkv_interleaved = qkv_reassemble(qkv, config) + torch.testing.assert_close( + qkv_interleaved, + torch.tensor( + [ + [0, 1, 2, 3], # query + [16, 17, 18, 19], # key + [32, 33, 34, 35], # value + [4, 5, 6, 7], # query + [20, 21, 22, 23], # key + [36, 37, 38, 39], # value + [8, 9, 10, 11], # query + [24, 25, 26, 27], # key + [40, 41, 42, 43], # value + [12, 13, 14, 15], # query + [28, 29, 30, 31], # key + [44, 45, 46, 47], # value + ] + ), + ) + + # GQA + config = Config(n_embd=4, n_head=4, n_query_groups=2) + qkv = torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [16, 17, 18, 19], # key + [20, 21, 22, 23], # key + [24, 25, 26, 27], # value + [28, 29, 30, 31], # value + ] + ) + qkv_interleaved = qkv_reassemble(qkv, config) + torch.testing.assert_close( + qkv_interleaved, + torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [16, 17, 18, 19], # key + [24, 25, 26, 27], # value + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [20, 21, 22, 23], # key + [28, 29, 30, 31], # value + ] + ), + ) + + # MQA + config = Config(n_embd=4, n_head=4, n_query_groups=1) + qkv = torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [8, 9, 10, 11], 
# query + [12, 13, 14, 15], # query + [16, 17, 18, 19], # key + [20, 21, 22, 23], # value + ] + ) + qkv_interleaved = qkv_reassemble(qkv, config) + torch.testing.assert_close( + qkv_interleaved, + torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [16, 17, 18, 19], # key + [20, 21, 22, 23], # value + ] + ), + ) diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py index e5b1b889c0..be7f64e6e9 100644 --- a/tests/test_convert_lit_checkpoint.py +++ b/tests/test_convert_lit_checkpoint.py @@ -257,6 +257,49 @@ def test_against_hf_phi(model_name): torch.testing.assert_close(ours_y, theirs_y) +@torch.inference_mode() +@pytest.mark.parametrize("model_name", ("Phi-3-mini-4k-instruct",)) +def test_against_hf_phi_3(model_name): + from transformers.models.phi3.configuration_phi3 import Phi3Config + from transformers.models.phi3.modeling_phi3 import Phi3ForCausalLM + + ours_config = Config.from_name( + model_name, padded_vocab_size=10000, n_layer=2, n_head=4, n_embd=256 + ) + T = 5 + theirs_config = Phi3Config( + attention_bias=ours_config.bias, + head_dim=ours_config.head_size, + hidden_size=ours_config.n_embd, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + num_key_value_heads=ours_config.n_query_groups, + pad_token_id=ours_config.padded_vocab_size - 1, + partial_rotary_factor=ours_config.rotary_percentage, + rms_norm_eps=ours_config.norm_eps, + rope_theta=ours_config.rope_base, + vocab_size=ours_config.padded_vocab_size, + ) + + ours_model = GPT(ours_config) + ours_state_dict = ours_model.state_dict() + theirs_state_dict = {} + copy_weights_phi(ours_config, theirs_state_dict, ours_state_dict) + theirs_model = Phi3ForCausalLM(theirs_config) + # strict=False because we don't save the rotary embeddings inv frequency + keys = theirs_model.load_state_dict(theirs_state_dict, strict=False) + assert not keys.unexpected_keys + assert all("inv_freq" in k for k in keys.missing_keys) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"] + torch.testing.assert_close(ours_y, theirs_y) + @torch.inference_mode() def test_against_original_stablelm_zephyr_3b(): T = 5 @@ -364,52 +407,97 @@ def test_check_conversion_supported_lora(): def test_qkv_split(): # MHA config = Config(n_embd=4, n_head=4) - qkv = torch.tensor( + qkv_interleaved = torch.tensor( [ - [0, 1, 2, 3], - [4, 5, 6, 7], - [8, 9, 10, 11], - [12, 13, 14, 15], - [16, 17, 18, 19], - [20, 21, 22, 23], - [24, 25, 26, 27], - [28, 29, 30, 31], - [32, 33, 34, 35], - [36, 37, 38, 39], - [40, 41, 42, 43], - [44, 45, 46, 47], + [0, 1, 2, 3], # query + [16, 17, 18, 19], # key + [32, 33, 34, 35], # value + [4, 5, 6, 7], # query + [20, 21, 22, 23], # key + [36, 37, 38, 39], # value + [8, 9, 10, 11], # query + [24, 25, 26, 27], # key + [40, 41, 42, 43], # value + [12, 13, 14, 15], # query + [28, 29, 30, 31], # key + [44, 45, 46, 47], # value ] ) - q, k, v = qkv_split(qkv, config) - torch.testing.assert_close(q, torch.tensor([[0, 1, 2, 3], [12, 13, 14, 15], [24, 25, 26, 27], [36, 37, 38, 39]])) - torch.testing.assert_close(k, torch.tensor([[4, 5, 6, 7], [16, 17, 18, 19], [28, 29, 30, 31], [40, 41, 42, 43]])) - torch.testing.assert_close(v, torch.tensor([[8, 9, 10, 11], [20, 21, 22, 23], [32, 33, 34, 35], [44, 45, 46, 47]])) + qkv = 
torch.cat(qkv_split(qkv_interleaved, config)) + torch.testing.assert_close( + qkv, + torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [16, 17, 18, 19], # key + [20, 21, 22, 23], # key + [24, 25, 26, 27], # key + [28, 29, 30, 31], # key + [32, 33, 34, 35], # value + [36, 37, 38, 39], # value + [40, 41, 42, 43], # value + [44, 45, 46, 47], # value + ] + ), + ) # GQA config = Config(n_embd=4, n_head=4, n_query_groups=2) - qkv = torch.tensor( + qkv_interleaved = torch.tensor( [ - [0, 1, 2, 3], - [4, 5, 6, 7], - [8, 9, 10, 11], - [12, 13, 14, 15], - [16, 17, 18, 19], - [20, 21, 22, 23], - [24, 25, 26, 27], - [28, 29, 30, 31], + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [16, 17, 18, 19], # key + [24, 25, 26, 27], # value + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [20, 21, 22, 23], # key + [28, 29, 30, 31], # value ] ) - q, k, v = qkv_split(qkv, config) - torch.testing.assert_close(q, torch.tensor([[0, 1, 2, 3], [4, 5, 6, 7], [16, 17, 18, 19], [20, 21, 22, 23]])) - torch.testing.assert_close(k, torch.tensor([[8, 9, 10, 11], [24, 25, 26, 27]])) - torch.testing.assert_close(v, torch.tensor([[12, 13, 14, 15], [28, 29, 30, 31]])) + qkv = torch.cat(qkv_split(qkv_interleaved, config)) + torch.testing.assert_close( + qkv, + torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [16, 17, 18, 19], # key + [20, 21, 22, 23], # key + [24, 25, 26, 27], # value + [28, 29, 30, 31], # value + ] + ), + ) # MQA config = Config(n_embd=4, n_head=4, n_query_groups=1) - qkv = torch.tensor( - [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] + qkv_interleaved = torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [16, 17, 18, 19], # key + [20, 21, 22, 23], # value + ] + ) + qkv = torch.cat(qkv_split(qkv_interleaved, config)) + torch.testing.assert_close( + qkv, + torch.tensor( + [ + [0, 1, 2, 3], # query + [4, 5, 6, 7], # query + [8, 9, 10, 11], # query + [12, 13, 14, 15], # query + [16, 17, 18, 19], # key + [20, 21, 22, 23], # value + ] + ), ) - q, k, v = qkv_split(qkv, config) - torch.testing.assert_close(q, torch.tensor([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]])) - torch.testing.assert_close(k, torch.tensor([[16, 17, 18, 19]])) - torch.testing.assert_close(v, torch.tensor([[20, 21, 22, 23]])) diff --git a/tests/test_model.py b/tests/test_model.py index 1e54bff75d..e4558887b1 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -314,6 +314,61 @@ def test_against_hf_phi(model_name, device, dtype): torch.testing.assert_close(ours_y, theirs_y) +@torch.inference_mode() +@pytest.mark.parametrize("model_name", ("Phi-3-mini-4k-instruct",)) +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[pytest.mark.xfail(raises=AssertionError, strict=False), RunIf(min_cuda_gpus=1)], + ), + ], +) +def test_against_hf_phi_3(model_name, device, dtype): + from transformers.models.phi3.configuration_phi3 import Phi3Config + from transformers.models.phi3.modeling_phi3 import Phi3ForCausalLM + + torch.set_default_dtype(dtype) + + ours_config = Config.from_name( + model_name, padded_vocab_size=10000, n_layer=2, n_head=4, n_embd=256, + ) + T = 5 + theirs_config = Phi3Config( + attention_bias=ours_config.bias, + 
head_dim=ours_config.head_size, + hidden_size=ours_config.n_embd, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + num_key_value_heads=ours_config.n_query_groups, + pad_token_id=ours_config.padded_vocab_size - 1, + partial_rotary_factor=ours_config.rotary_percentage, + rms_norm_eps=ours_config.norm_eps, + rope_theta=ours_config.rope_base, + torch_dtype=dtype, + vocab_size=ours_config.padded_vocab_size, + ) + + theirs_model = Phi3ForCausalLM(theirs_config).to(device) + theirs_state_dict = theirs_model.state_dict() + state_dict = {} + copy_weights_phi(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + + @torch.inference_mode() @pytest.mark.parametrize( ("device", "dtype"), diff --git a/tests/test_prompts.py b/tests/test_prompts.py index 31452fc390..184b642b76 100644 --- a/tests/test_prompts.py +++ b/tests/test_prompts.py @@ -6,8 +6,8 @@ from litgpt import Config from litgpt.prompts import ( Alpaca, - Llama3, Default, + Llama3, PromptStyle, has_prompt_style, load_prompt_style, @@ -62,6 +62,7 @@ def test_prompt_style_from_config(): "CodeLlama-70b-Instruct-hf", "phi-1_5", "phi-2", + "Phi-3-mini-4k-instruct", "Mistral-7B-Instruct-v0.1", "Mistral-7B-Instruct-v0.2", "tiny-llama-1.1b-chat", diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 575b005d86..8e8d8f7561 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -12,7 +12,7 @@ from litgpt.tokenizer import Tokenizer -@pytest.mark.flaky(reruns=5, rerun_except=["AssertionError", "assert"]) +@pytest.mark.flaky(reruns=5, rerun_except=["AssertionError", "assert", "TypeError"]) @pytest.mark.parametrize("config", config_module.configs, ids=[c["hf_config"]["name"] for c in config_module.configs]) def test_tokenizer_against_hf(config): access_token = os.getenv("HF_TOKEN") @@ -75,6 +75,9 @@ def test_tokenizer_against_hf(config): # TODO: there's a encoding difference with this model. why? note that the decoding is equal # "Hello": 10994, "▁Hello": 15043 assert [15043 if t == 10994 else t for t in actual.tolist()] == expected + elif config.name.startswith("Phi-3"): + # Phi-3 tokenizer adds `bos` twice + assert [ours.bos_id] + actual.tolist() == expected else: assert actual.tolist() == expected assert ours.decode(actual) == theirs.decode(expected, skip_special_tokens=True) diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 55f2edf9d1..fc23d184be 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -23,7 +23,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) | | Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) | | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | -| Phi | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) | +| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 
2023](https://arxiv.org/abs/2309.05463) | +| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) | @@ -116,6 +117,7 @@ meta-llama/Meta-Llama-3-8B meta-llama/Meta-Llama-3-8B-Instruct microsoft/phi-1_5 microsoft/phi-2 +microsoft/Phi-3-mini-4k-instruct mistralai/Mistral-7B-Instruct-v0.1 mistralai/Mistral-7B-Instruct-v0.2 mistralai/Mistral-7B-v0.1 From d371478e55e40bd4c5df4526c73b6b15ed8ace2a Mon Sep 17 00:00:00 2001 From: William FalconDate: Tue, 2 Jul 2024 08:05:20 -0400 Subject: [PATCH 17/21] Update README.md --- README.md | 204 +++++++++++++++++++++++++++--------------------------- 1 file changed, 103 insertions(+), 101 deletions(-) diff --git a/README.md b/README.md index c517b7d5e1..7b979679f0 100644 --- a/README.md +++ b/README.md @@ -198,20 +198,24 @@ Finetuning is the process of taking a pretrained AI model and further training i ```bash +# 0) setup your dataset +curl -L https://huggingface.co/datasets/ksaw008/finance_alpaca/resolve/main/finance_alpaca.json -o my_custom_dataset.json + # 1) Download a pretrained model litgpt download microsoft/phi-2 # 2) Finetune the model -curl -L https://huggingface.co/datasets/ksaw008/finance_alpaca/resolve/main/finance_alpaca.json -o my_custom_dataset.json - litgpt finetune microsoft/phi-2 \ --data JSON \ --data.json_path my_custom_dataset.json \ --data.val_split_fraction 0.1 \ --out_dir out/custom-model -# 3) Chat with the model +# 3) Test the model litgpt chat out/custom-model/final + +# 4) Deploy the model +litgpt serve out/custom-model/final ``` [Read the full finetuning docs](tutorials/finetune.md) @@ -220,126 +224,119 @@ litgpt chat out/custom-model/final ---- -## Pretrain an LLM +## Deploy an LLM -Pretraining is the process of teaching an AI model by exposing it to a large amount of data before it is fine-tuned for specific tasks. +Deploy a pretrained or finetune LLM to use it in real-world applications. Deploy, automatically sets up a web server that can be accessed by a website or app. + +```bash +# Deploy an out-of-the-box LLM +litgpt download microsoft/phi-2 +litgpt serve microsoft/phi-2 + +# deploy your own trained model +litgpt serve path/to/microsoft/phi-2/checkpoint +``` -# 1) Download a tokenizer -litgpt download EleutherAI/pythia-160m \ - --tokenizer_only True +[Read the full deploy docs](tutorials/deploy.md). -# 2) Pretrain the model -litgpt pretrain EleutherAI/pythia-160m \ - --tokenizer_dir EleutherAI/pythia-160m \ - --data TextFiles \ - --data.train_data_path "custom_texts/" \ - --train.max_tokens 10_000_000 \ - --out_dir out/custom-model + -# 3) Test the model -litgpt chat out/custom-model/final +---- + +## Evaluate an LLM +Evaluate an LLM to test its performance on various tasks to see how well it understands and generates text. Simply put, we can evaluate things like how well would it do in college-level chemistry, coding, etc... (MMLU, Truthful QA, etc...) + +```bash +litgpt evaluate microsoft/phi-2 --tasks 'truthfulqa_mc2,mmlu' ``` -Show code:
-```bash -mkdir -p custom_texts -curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_texts/book1.txt -curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt +Test the server in a separate terminal and integrate the model API into your AI product: +```python +# 3) Use the server (in a separate Python session) +import requests, json +response = requests.post( + "http://127.0.0.1:8000/predict", + json={"prompt": "Fix typos in the following sentence: Exampel input"} +) +print(response.json()["output"]) +``` +- ++ +Test how well the model works via an interactive chat. Use the `chat` command to chat, extract embeddings, etc... -Continued pretraining is another way of finetuning that specializes an already pretrained model by training on custom data: - --- -[Read the full continued pretraining docs](tutorials/pretrain.md#continued-pretraining-on-custom-data) +Show code:
- - - +Here's an example showing how to use the Phi-2 LLM: ```bash -mkdir -p custom_texts -curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_texts/book1.txt -curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt - -# 1) Download a pretrained model -litgpt download EleutherAI/pythia-160m - -# 2) Continue pretraining the model -litgpt pretrain EleutherAI/pythia-160m \ - --tokenizer_dir EleutherAI/pythia-160m \ - --initial_checkpoint_dir EleutherAI/pythia-160m \ - --data TextFiles \ - --data.train_data_path "custom_texts/" \ - --train.max_tokens 10_000_000 \ - --out_dir out/custom-model +litgpt chat microsoft/phi-2 -# 3) Test the model -litgpt chat out/custom-model/final +>> Prompt: What do Llamas eat? ``` -+Full code:
----- - -## Evaluate an LLM -Evaluate an LLM to test its performance on various tasks to see how well it understands and generates text. Simply put, we can evaluate things like how well would it do in college-level chemistry, coding, etc... (MMLU, Truthful QA, etc...) +```bash +# 1) Download the LLM +litgpt download list +litgpt download microsoft/phi-2 ---[Read the full evaluation docs](tutorials/evaluation.md). +[Read the full chat docs](tutorials/inference.md). ----- +---- -## Deploy an LLM +## Pretrain an LLM -Deploy a pretrained or finetune LLM to use it in real-world applications. Deploy, automatically sets up a web server that can be accessed by a website or app. +Pretraining is the process of teaching an AI model by exposing it to a large amount of data before it is fine-tuned for specific tasks.Show code:
+# 2) Test the model +litgpt chat microsoft/phi-2 -```bash -litgpt evaluate microsoft/phi-2 --tasks 'truthfulqa_mc2,mmlu' +>> Prompt: What do Llamas eat? ``` +The download of certain models requires an additional access token. You can read more about this in the [download](tutorials/download_model_weights.md#specific-models-and-access-tokens) documentation. +-[Read the full deploy docs](tutorials/deploy.md). +[Read the full pretraining docs](tutorials/pretrain.md) ----- +---- -## Test an LLM +## Continue pretraining an LLMShow code:
@@ -347,73 +344,78 @@ Deploy a pretrained or finetune LLM to use it in real-world applications. Deploy ```bash -# locate the checkpoint to your finetuned or pretrained model and call the `serve` command: -litgpt serve microsoft/phi-2 +mkdir -p custom_texts +curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_texts/book1.txt +curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt -# Alternative: if you haven't finetuned, download any checkpoint to deploy it: -litgpt download microsoft/phi-2 -litgpt serve microsoft/phi-2 -``` +# 1) Download a tokenizer +litgpt download EleutherAI/pythia-160m \ + --tokenizer_only True -Test the server in a separate terminal and integrate the model API into your AI product: -```python -# 3) Use the server (in a separate Python session) -import requests, json -response = requests.post( - "http://127.0.0.1:8000/predict", - json={"prompt": "Fix typos in the following sentence: Exampel input"} -) -print(response.json()["output"]) +# 2) Pretrain the model +litgpt pretrain EleutherAI/pythia-160m \ + --tokenizer_dir EleutherAI/pythia-160m \ + --data TextFiles \ + --data.train_data_path "custom_texts/" \ + --train.max_tokens 10_000_000 \ + --out_dir out/custom-model + +# 3) Test the model +litgpt chat out/custom-model/final ```- +- -Test how well the model works via an interactive chat. Use the `chat` command to chat, extract embeddings, etc... +Continued pretraining is another way of finetuning that specializes an already pretrained model by training on custom data:-[Read the full chat docs](tutorials/inference.md). +[Read the full continued pretraining docs](tutorials/pretrain.md#continued-pretraining-on-custom-data) ----- +---- # State-of-the-art features From a29608b018d0e2011c24873c33bbe90808e10f5f Mon Sep 17 00:00:00 2001 From: William FalconShow code:
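The pretraining recipe introduced above reads plain-text files from `custom_texts/` via `--data TextFiles`. A sketch of preparing that folder in Python instead of the `mkdir`/`curl` commands, reusing the same two Project Gutenberg URLs; it assumes the `TextFiles` data module simply consumes every `.txt` file under `--data.train_data_path`:

```python
# Build the custom_texts/ folder used by the pretraining example above.
# Equivalent to the mkdir/curl commands; the Gutenberg URLs are the ones from the patch.
from pathlib import Path
from urllib.request import urlretrieve

books = {
    "book1.txt": "https://www.gutenberg.org/cache/epub/24440/pg24440.txt",
    "book2.txt": "https://www.gutenberg.org/cache/epub/26393/pg26393.txt",
}
out_dir = Path("custom_texts")
out_dir.mkdir(parents=True, exist_ok=True)
for name, url in books.items():
    urlretrieve(url, out_dir / name)  # plain-text Project Gutenberg downloads
print(f"Saved {len(books)} files to {out_dir}/ for `litgpt pretrain --data TextFiles`")
```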
-Here's an example showing how to use the Phi-2 LLM. ```bash -# 1) List all available models in litgpt -litgpt download list +mkdir -p custom_texts +curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_texts/book1.txt +curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt -# 2) Download a pretrained model -litgpt download microsoft/phi-2 +# 1) Download a pretrained model +litgpt download EleutherAI/pythia-160m -# 3) Chat with the model -litgpt chat microsoft/phi-2 +# 2) Continue pretraining the model +litgpt pretrain EleutherAI/pythia-160m \ + --tokenizer_dir EleutherAI/pythia-160m \ + --initial_checkpoint_dir EleutherAI/pythia-160m \ + --data TextFiles \ + --data.train_data_path "custom_texts/" \ + --train.max_tokens 10_000_000 \ + --out_dir out/custom-model ->> Prompt: What do Llamas eat? +# 3) Test the model +litgpt chat out/custom-model/final ``` -The download of certain models requires an additional access token. You can read more about this in the [download](tutorials/download_model_weights.md#specific-models-and-access-tokens) documentation. -Date: Tue, 2 Jul 2024 08:24:47 -0400 Subject: [PATCH 18/21] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7b979679f0..3ae942189d 100644 --- a/README.md +++ b/README.md @@ -22,9 +22,9 @@ Lightning AI • Quick start • Models • - Finetune/pretrain • + Finetune • Deploy • - Evaluate • + All workflows • Features • Recipes (YAML) • Tutorials @@ -149,7 +149,7 @@ Every model is written from scratch to maximize performance and remove layers of --- -# Advanced workflows +# Workflows Finetune • @@ -237,7 +237,7 @@ litgpt serve out/custom-model/final Deploy a pretrained or finetune LLM to use it in real-world applications. Deploy, automatically sets up a web server that can be accessed by a website or app. ```bash -# Deploy an out-of-the-box LLM +# deploy an out-of-the-box LLM litgpt download microsoft/phi-2 litgpt serve microsoft/phi-2 @@ -246,7 +246,7 @@ litgpt serve path/to/microsoft/phi-2/checkpoint ```
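The continued-pretraining recipe shown in the patch above caps training at `--train.max_tokens 10_000_000`. A rough sanity-check sketch for how much text `custom_texts/` actually holds before launching a run; word count is only a crude proxy for tokens, since exact counts depend on the model's tokenizer:

```python
# Crude corpus-size check for the custom_texts/ folder used in the recipes above.
# Words are a proxy only; real token counts require the model's tokenizer.
from pathlib import Path

total_words = sum(
    len(path.read_text(encoding="utf-8", errors="ignore").split())
    for path in Path("custom_texts").glob("*.txt")
)
print(f"~{total_words:,} words in custom_texts/ (recipe limit: --train.max_tokens 10_000_000)")
```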
-Show code:
+Show code to query server:
From 2de8aee2300f62773be89d94df746745aa451ab5 Mon Sep 17 00:00:00 2001 From: William FalconDate: Tue, 2 Jul 2024 08:33:19 -0400 Subject: [PATCH 19/21] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3ae942189d..b6a096da84 100644 --- a/README.md +++ b/README.md @@ -61,9 +61,9 @@ Load and use any of the [20+ LLMs](#choose-from-20-llms): from litgpt import LLM llm = LLM.load("microsoft/phi-2") -text = llm.generate("Correct the spelling: Every summer, the familly enjoys a trip to the mountains.") +text = llm.generate("Fix the spelling: Every fall, the familly goes to the mountains.") print(text) -# Corrected Sentence: Every summer, the family enjoys a vacation to the mountains. +# Corrected Sentence: Every fall, the family goes to the mountains. ``` From c22e5dfd66d7e04fb64298375f6805243812162f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 2 Jul 2024 08:40:11 -0400 Subject: [PATCH 20/21] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b6a096da84..878d04b49d 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ **20+ high-performance LLM implementations with recipes to pretrain, finetune, deploy at scale.** -✅ From scratch implementations ✅ No abstractions ✅ Beginner friendly -✅ flash attention ✅ fp4/8/16/32 ✅ LoRA, QLoRA, Adapter -✅ FSDP ✅ 1-1000+ GPUs/TPUs ✅ 20+ LLMs +✅ From scratch implementations ✅ No abstractions ✅ Beginner friendly +✅ Flash attention ✅ FSDP ✅ LoRA, QLoRA, Adapter +✅ Reduce GPU memory (fp4/8/16/32) ✅ 1-1000+ GPUs/TPUs ✅ 20+ LLMsFrom 2da3dd2edc1cac0724578a8f4f94941e010e0164 Mon Sep 17 00:00:00 2001 From: Sebastian RaschkaDate: Tue, 2 Jul 2024 10:07:07 -0500 Subject: [PATCH 21/21] Update phi-3 prompt template (#1544) --- litgpt/prompts.py | 2 +- tests/test_tokenizer.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/litgpt/prompts.py b/litgpt/prompts.py index 53eae02c69..85191bfcdb 100644 --- a/litgpt/prompts.py +++ b/litgpt/prompts.py @@ -311,7 +311,7 @@ def apply(self, prompt: str, **kwargs: str) -> str: class Phi3(PromptStyle): def apply(self, prompt: str, **kwargs: str) -> str: - return f' <|user|>\n{prompt}<|end|>\n<|assistant|>\n' + return f'<|system|>\nYou are a helpful assistant.<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n' diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 8e8d8f7561..481e4ea3d1 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -75,9 +75,6 @@ def test_tokenizer_against_hf(config): # TODO: there's a encoding difference with this model. why? note that the decoding is equal # "Hello": 10994, "▁Hello": 15043 assert [15043 if t == 10994 else t for t in actual.tolist()] == expected - elif config.name.startswith("Phi-3"): - # Phi-3 tokenizer adds `bos` twice - assert [ours.bos_id] + actual.tolist() == expected else: assert actual.tolist() == expected assert ours.decode(actual) == theirs.decode(expected, skip_special_tokens=True)
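Patch 21 above updates the Phi-3 prompt template in `litgpt/prompts.py` to prepend a system turn. A standalone sketch of what the new template produces for a user prompt; the f-string is copied from the patch, while the `phi3_prompt` helper name is illustrative only:

```python
# Mirrors the updated Phi3.apply() template from the patch above, outside of LitGPT.
def phi3_prompt(prompt: str) -> str:
    return f"<|system|>\nYou are a helpful assistant.<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

print(phi3_prompt("What do Llamas eat?"))
# <|system|>
# You are a helpful assistant.<|end|>
# <|user|>
# What do Llamas eat?<|end|>
# <|assistant|>
```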