Deprecate Support for Dolly, Nous-Hermes, Redpajama-Incite, Vicuna and H2O Danube Models. #1821

Merged 4 commits on Nov 11, 2024
README.md (1 addition, 6 deletions)
@@ -116,8 +116,6 @@ Every model is written from scratch to maximize performance and remove layers of
|----|----|----|----|
| CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
| Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
-| Danube2 | 1.8B | H2O.ai | [H2O.ai](https://h2o.ai/platform/danube-1-8b/) |
-| Dolly | 3B, 7B, 12B | Databricks | [Conover et al. 2023](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) |
| Falcon | 7B, 40B, 180B | TII UAE | [TII 2023](https://falconllm.tii.ae) |
| FreeWilly2 (Stable Beluga 2) | 70B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) |
| Function Calling Llama 2 | 7B | Trelis | [Trelis et al. 2023](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) |
@@ -126,23 +124,20 @@ Every model is written from scratch to maximize performance and remove layers of
| Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
| Llama 3.1 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
| Llama 3.2 | 1B, 3B | Meta AI | [Meta AI 2024](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/) |
-| LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) |
| Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
| MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
| Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
-| Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
| OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
| Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
-| RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
| StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
| StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) |
| StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
| TinyLlama | 1.1B | Zhang et al. | [Zhang et al. 2023](https://github.com/jzhang38/TinyLlama) |
-| Vicuna | 7B, 13B, 33B | LMSYS | [Li et al. 2023](https://lmsys.org/blog/2023-03-30-vicuna/) |


**Tip**: You can list all available models by running the `litgpt download list` command.
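For example, to list the catalog and then fetch one of the remaining supported checkpoints (the Pythia model is shown purely for illustration):

```bash
litgpt download list
litgpt download EleutherAI/pythia-160m
```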

litgpt/config.py (0 additions, 332 deletions)
@@ -368,85 +368,6 @@ def norm_class(self) -> Type:
    configs.append(copy)


###################
# databricks Dolly
###################
dolly = [
    # https://huggingface.co/databricks/dolly-v2-3b/blob/main/config.json
    dict(
        name="dolly-v2-3b",
        hf_config=dict(org="databricks", name="dolly-v2-3b"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padded_vocab_size=50280,
    ),
    # https://huggingface.co/databricks/dolly-v2-7b/blob/main/config.json
    dict(
        name="dolly-v2-7b",
        hf_config=dict(org="databricks", name="dolly-v2-7b"),
        block_size=2048,
        n_layer=32,
        padded_vocab_size=50280,
    ),
    # https://huggingface.co/databricks/dolly-v2-12b/blob/main/config.json
    dict(
        name="dolly-v2-12b",
        hf_config=dict(org="databricks", name="dolly-v2-12b"),
        block_size=2048,
        n_layer=36,
        n_embd=5120,
        n_head=40,
        padded_vocab_size=50280,
    ),
]
configs.extend(dolly)
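Each dict above is resolved by name at load time. A minimal sketch of the lookup these entries used to support (assuming litgpt's `Config.from_name` API; after this PR it no longer resolves these names):

```python
from litgpt import Config

# Before this PR, the registered dict was matched by its "name" key and
# turned into a full Config, with unspecified fields left at their defaults.
config = Config.from_name("dolly-v2-3b")
print(config.block_size, config.n_layer, config.n_embd)  # 2048 32 2560
```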


####################################
# togethercomputer RedPajama INCITE
####################################
redpajama_incite = [
    # https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json
    dict(
        name="RedPajama-INCITE-{}-3B-v1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    # https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json
    dict(
        name="RedPajama-INCITE-7B-{}",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    # This redirects to the checkpoint above; kept for those who had already downloaded the old weights.
    dict(
        name="RedPajama-INCITE-{}-7B-v0.1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
]
for c in redpajama_incite:
    for kind in ("Base", "Chat", "Instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)
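As a quick illustration of what the expansion loop above registers, formatting the `{}` placeholder with each kind yields one config per variant:

```python
# Standalone illustration of the placeholder expansion performed above.
template = "RedPajama-INCITE-{}-3B-v1"
for kind in ("Base", "Chat", "Instruct"):
    print(template.format(kind))
# RedPajama-INCITE-Base-3B-v1
# RedPajama-INCITE-Chat-3B-v1
# RedPajama-INCITE-Instruct-3B-v1
```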


#################
# TII UAE Falcon
#################
@@ -569,232 +490,6 @@ def norm_class(self) -> Type:
]
configs.extend(open_LLaMA)


###############
# LMSYS Vicuna
###############
vicuna = [
    # https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json
    dict(
        name="vicuna-7b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        norm_eps=1e-6,
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json
    dict(
        name="vicuna-13b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        norm_eps=1e-6,
        mlp_class_name="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json
    dict(
        name="vicuna-33b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        norm_eps=1e-6,
        mlp_class_name="LLaMAMLP",
        intermediate_size=17920,
    ),
    # https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json
    dict(
        name="vicuna-7b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json
    dict(
        name="vicuna-7b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=4,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json
    dict(
        name="vicuna-13b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json
    dict(
        name="vicuna-13b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=4,
    ),
]
configs.extend(vicuna)
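The v1.5 16k variants pair `block_size=16384` with `rope_condense_ratio=4`: position interpolation divides the rotary position indices by the ratio so the extended sequence still falls inside the positional range the base model was trained on. A minimal sketch of that scaling (assumed semantics, not litgpt's internal code):

```python
import torch

def condensed_positions(seq_len: int, condense_ratio: int) -> torch.Tensor:
    # Scale position indices down so `seq_len` positions occupy the
    # original training range (position interpolation).
    return torch.arange(seq_len, dtype=torch.float32) / condense_ratio

pos = condensed_positions(16384, condense_ratio=4)
print(pos.max().item())  # 4095.75 -- stays within the original 4096-position range
```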


#################
# LMSYS LongChat
#################
long_chat = [
    # https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json
    dict(
        name="longchat-7b-16k",
        hf_config=dict(org="lmsys", name="longchat-7b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        norm_eps=1e-6,
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    ),
    # https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json
    dict(
        name="longchat-13b-16k",
        hf_config=dict(org="lmsys", name="longchat-13b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        norm_eps=1e-6,
        mlp_class_name="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=8,
    ),
]
configs.extend(long_chat)
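The larger `rope_condense_ratio=8` here reflects that these checkpoints stretch a 2048-token base context to 16384. Like the Vicuna entries, both LongChat configs also set `parallel_residual=False`, i.e. the LLaMA-style block layout where the MLP consumes the attention output rather than sharing its input. A self-contained sketch of the two layouts (illustrative only):

```python
import torch

def block_forward(x, attn, mlp, norm1, norm2, parallel_residual: bool):
    if parallel_residual:
        # GPT-NeoX style: attention and MLP branches read the same input.
        return x + attn(norm1(x)) + mlp(norm2(x))
    # LLaMA style (parallel_residual=False, as in these configs):
    x = x + attn(norm1(x))
    return x + mlp(norm2(x))

identity = lambda t: t
out = block_forward(torch.randn(1, 4, 8), identity, identity, identity, identity, False)
print(out.shape)  # torch.Size([1, 4, 8])
```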


######################
# NousResearch Hermes
######################
nous_research = [
    # https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json
    dict(
        name="Nous-Hermes-llama-2-7b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        norm_eps=1e-05,
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json
    dict(
        name="Nous-Hermes-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
        block_size=2048,
        vocab_size=32000,
        padded_vocab_size=32001,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        norm_eps=1e-6,
        mlp_class_name="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b
    dict(
        name="Nous-Hermes-Llama2-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
        vocab_size=32000,
        padded_vocab_size=32032,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        norm_eps=1e-05,
        mlp_class_name="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(nous_research)
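Two of these entries pin `padded_vocab_size` explicitly (32001 and 32032) because the checkpoints ship embedding matrices with extra tokens beyond the tokenizer's 32000; when only `vocab_size` is given, litgpt derives the padded size by rounding up to the nearest `padding_multiple`. A sketch of that rounding (assumed behaviour of the derivation):

```python
def padded_vocab(vocab_size: int, padding_multiple: int) -> int:
    # Round vocab_size up to the nearest multiple of padding_multiple.
    return ((vocab_size + padding_multiple - 1) // padding_multiple) * padding_multiple

print(padded_vocab(32000, 64))  # 32000 -- already a multiple, nothing to pad
print(padded_vocab(32001, 64))  # 32064 -- why 32001 must be pinned explicitly
```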


###############
# Meta LLaMA 2
###############
@@ -1189,33 +884,6 @@ def norm_class(self) -> Type:
]
configs.extend(codegemma)

################
# H2Oai Danube2
################
danube2 = [
    # https://huggingface.co/h2oai/h2o-danube2-1.8b-chat/blob/main/config.json
    dict(
        name="Danube2-1.8b-chat",
        hf_config=dict(org="h2oai", name="h2o-danube2-1.8b-chat"),
        vocab_size=32000,
        n_layer=24,
        n_head=32,
        n_embd=2560,
        block_size=4096,  # should be 8192, but the sliding-window mechanism is not implemented
        intermediate_size=6912,
        padding_multiple=64,
        norm_eps=1e-05,
        rope_base=10000,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="RMSNorm",
        mlp_class_name="LLaMAMLP",
    )
]
configs.extend(danube2)
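The `block_size` comment refers to the checkpoint's sliding-window attention, the Mistral-style scheme where each query attends only to the previous `window` keys; since litgpt had not implemented it, the usable context is capped at 4096 instead of 8192. A minimal sketch of such a mask (illustrative, not litgpt code):

```python
import torch

def sliding_window_mask(seq_len: int, window: int) -> torch.Tensor:
    # True where attention is allowed: causal, and at most `window`
    # positions back from the query (the query's own position counts).
    q = torch.arange(seq_len).unsqueeze(1)  # query positions (rows)
    k = torch.arange(seq_len).unsqueeze(0)  # key positions (columns)
    return (k <= q) & (k > q - window)

print(sliding_window_mask(6, window=3).int())
```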


##########################
# Stability AI FreeWilly2