diff --git a/README.md b/README.md
index fe76091cbc..29f63ce06e 100644
--- a/README.md
+++ b/README.md
@@ -96,13 +96,13 @@ Every model is written from scratch to maximize performance and remove layers of
 | Model | Model size | Author | Reference |
 |----|----|----|----|
-| Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
-| Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
-| Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
-| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
-| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
-| CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
-| Gemma 2 | 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
+| Llama 3 & 3.1 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
+| Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
+| Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
+| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
+| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
+| CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
+| Gemma 2 | 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
 | ... | ... | ... | ... |
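The rows above add the Llama 3.1 checkpoints to the README's featured-models table. As a quick orientation, here is a minimal sketch of pulling one of the new checkpoints with `litgpt download`; the gated `meta-llama` repositories are assumed to require a Hugging Face token, passed here through an `--access_token` placeholder.

```bash
# Sketch: fetch the newly listed Llama 3.1 instruct weights.
# <your_hf_token> is a placeholder; drop the flag if the repo is already accessible to you.
litgpt download meta-llama/Meta-Llama-3.1-8B-Instruct --access_token <your_hf_token>
```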
@@ -124,16 +124,16 @@ Every model is written from scratch to maximize performance and remove layers of
 | Gemma | 2B, 7B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf) |
 | Gemma 2 | 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
 | Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
-| Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
+| Llama 3.1 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) |
-| Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
-| MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama)
+| Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
+| MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
 | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
-| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
-| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219)
+| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
+| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
 | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
@@ -173,11 +173,11 @@ After installing LitGPT, select the model and workflow to run (finetune, pretrai
 ```bash
 # litgpt [action] [model]
-litgpt serve meta-llama/Meta-Llama-3-8B-Instruct
-litgpt finetune meta-llama/Meta-Llama-3-8B-Instruct
-litgpt pretrain meta-llama/Meta-Llama-3-8B-Instruct
-litgpt chat meta-llama/Meta-Llama-3-8B-Instruct
-litgpt evaluate meta-llama/Meta-Llama-3-8B-Instruct
+litgpt serve meta-llama/Meta-Llama-3.1-8B-Instruct
+litgpt finetune meta-llama/Meta-Llama-3.1-8B-Instruct
+litgpt pretrain meta-llama/Meta-Llama-3.1-8B-Instruct
+litgpt chat meta-llama/Meta-Llama-3.1-8B-Instruct
+litgpt evaluate meta-llama/Meta-Llama-3.1-8B-Instruct
 ```
diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md
index 3f54023206..a784c9e7bc 100644
--- a/config_hub/finetune/README.md
+++ b/config_hub/finetune/README.md
@@ -35,6 +35,10 @@ All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset
 | llama-3-8b/lora.yaml | llama-3-8b | 2 | 512 | 1 | 1xA10G | 14.79 min | $0.4 | 19.73 GB | 0.888 | 2.431 | 62.4% |
 | llama-3-8b/lora.yaml | llama-3-8b | 2 | 512 | 1 | 4xA10G | 14.88 min | $1.2 | 19.73 GB | 0.889 | 2.432 | 62.5% |
 | llama-3-8b/qlora.yaml | llama-3-8b | 2 | 512 | 2 | 1xA10G | 22.24 min | $0.7 | 17.41 GB | 0.939 | 2.558 | 62.2% |
+| | | | | | | | | | | | |
+| llama-3.1-8b/full.yaml | llama-3.1-8b | 1 | 512 | 4 | 1xA10G | OOM | OOM | OOM | OOM | OOM | OOM |
+| llama-3.1-8b/lora.yaml | llama-3.1-8b | 2 | 512 | 1 | 1xA10G | 13.36 min | $1.1 | 19.73 GB | 0.878 | 2.406 | xx.xx |
+| llama-3.1-8b/qlora.yaml | llama-3.1-8b | 2 | 512 | 2 | 1xA10G | 21.81 min | $0.7 | 17.41 GB | 0.928 | 2.529 | xx.xx |
 | | | | | | | | | | | | |
 | mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 31.00 min | $0.9 | 20.66 GB | 0.801 | 2.228 | 55.7% |
 | mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 4xA10G | 31.00 min | $2.5 | 20.66 GB | 0.802 | 2.229 | 55.5% |
diff --git a/config_hub/finetune/llama-3.1-8b/full.yaml b/config_hub/finetune/llama-3.1-8b/full.yaml
new file mode 100644
index 0000000000..fa0f1ab718
--- /dev/null
+++ b/config_hub/finetune/llama-3.1-8b/full.yaml
@@ -0,0 +1,112 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3.1-8B
+
+# Directory in which to save checkpoints and logs. (type: , default: out/finetune/full)
+out_dir: out/finetune/full-llama-3.1-8b
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# How many devices/GPUs to use (type: Union[int, str], default: 1)
+devices: 4
+
+# How many nodes to use. (type: int, default: 1)
+num_nodes: 1
+
+# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
+# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
+# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
+# (type: Union[bool, Literal["auto"], Path], default: False)
+resume: false
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 200
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64)
+  global_batch_size: 64
+
+  # Number of samples per data-parallel rank (type: int, default: 1)
+  micro_batch_size: 4
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 25
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 1
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 600)
+  interval: 25
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.1
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
diff --git a/config_hub/finetune/llama-3.1-8b/lora.yaml b/config_hub/finetune/llama-3.1-8b/lora.yaml
new file mode 100644
index 0000000000..26e01b003c
--- /dev/null
+++ b/config_hub/finetune/llama-3.1-8b/lora.yaml
@@ -0,0 +1,136 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3.1-8B
+
+# Directory in which to save checkpoints and logs. (type: , default: out/lora)
+out_dir: out/finetune/lora-llama-3.1-8b
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
+quantize:
+
+# How many devices/GPUs to use. (type: Union[int, str], default: 1)
+devices: 1
+
+# How many nodes to use. (type: int, default: 1)
+num_nodes: 1
+
+# The LoRA rank. (type: int, default: 8)
+lora_r: 32
+
+# The LoRA alpha. (type: int, default: 16)
+lora_alpha: 16
+
+# The LoRA dropout value. (type: float, default: 0.05)
+lora_dropout: 0.05
+
+# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
+lora_query: true
+
+# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
+lora_key: false
+
+# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
+lora_value: true
+
+# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
+lora_projection: false
+
+# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
+lora_mlp: false
+
+# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
+lora_head: false
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 200
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
+  global_batch_size: 8
+
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 1
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 10
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 2
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 100)
+  interval: 100
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.0
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
diff --git a/config_hub/finetune/llama-3.1-8b/qlora.yaml b/config_hub/finetune/llama-3.1-8b/qlora.yaml
new file mode 100644
index 0000000000..e74db4774a
--- /dev/null
+++ b/config_hub/finetune/llama-3.1-8b/qlora.yaml
@@ -0,0 +1,138 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3.1-8B
+
+# Directory in which to save checkpoints and logs. (type: , default: out/lora)
+out_dir: out/finetune/qlora-llama3.1-8b
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
+quantize: bnb.nf4
+
+# How many devices/GPUs to use. (type: Union[int, str], default: 1)
+devices: 1
+
+# How many nodes to use. (type: int, default: 1)
+num_nodes: 1
+
+# The LoRA rank. (type: int, default: 8)
+lora_r: 32
+
+# The LoRA alpha. (type: int, default: 16)
+lora_alpha: 16
+
+# The LoRA dropout value. (type: float, default: 0.05)
+lora_dropout: 0.05
+
+# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
+lora_query: true
+
+# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
+lora_key: false
+
+# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
+lora_value: true
+
+# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
+lora_projection: false
+
+# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
+lora_mlp: false
+
+# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
+lora_head: false
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    val_split_fraction: 0.05
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+    download_dir: data/alpaca2k
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 200
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
+  global_batch_size: 8
+
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 2
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 10
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 2
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 100)
+  interval: 100
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.0
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
diff --git a/litgpt/config.py b/litgpt/config.py
index 9730c71836..0dfe986579 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -877,6 +877,23 @@ def norm_class(self) -> Type:
         intermediate_size=14336,
         rope_base=500000,
     ),
+    dict(
+        name="Llama-3.1-8B{}",
+        hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-8B{}"),
+        block_size=8192,
+        vocab_size=128000,
+        padded_vocab_size=128256,
+        n_layer=32,
+        n_head=32,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=14336,
+        rope_base=500000,
+    ),
     # https://huggingface.co/meta-llama/Meta-Llama-3-70B/blob/main/config.json
     dict(
         name="Llama-3-70B{}",
@@ -896,6 +913,24 @@ def norm_class(self) -> Type:
         intermediate_size=28672,
         rope_base=500000,
     ),
+    dict(
+        name="Llama-3.1-70B{}",
+        hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-70B{}"),
+        block_size=8192,
+        vocab_size=128000,
+        padded_vocab_size=128256,
+        n_layer=80,
+        n_head=64,
+        n_embd=8192,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=28672,
+        rope_base=500000,
+    ),
 ]
 for c in llama_3:
     for kind in ("", "-Instruct"):
diff --git a/tests/test_config.py b/tests/test_config.py
index 4cc97d94e0..b5b5fea0b3 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -101,5 +101,3 @@ def test_head_size(head_size):
     config = Config(head_size)

     assert config.head_size == head_size or config.n_embd // config.n_head
-
-
diff --git a/tests/test_model.py b/tests/test_model.py
index a19d2da46c..22177dc2fe 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -213,6 +213,8 @@ def test_against_original_open_llama_3b(device, dtype):
         {"name": "Llama-2-70b-chat-hf", "n_query_groups": 1},
         {"name": "Llama-3-8B"},
         {"name": "Llama-3-8B-Instruct"},
+        {"name": "Llama-3.1-8B"},
+        {"name": "Llama-3.1-8B-Instruct"},
     ],
 )
 @pytest.mark.parametrize(
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index b81f5ef9bb..9a172fde7d 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -17,7 +17,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Gemma | 2B, 7B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf) |
 | Gemma 2 | 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
 | Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
-| Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
+| Llama 3 & 3.1 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) |
 | Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama)
@@ -117,6 +117,10 @@ meta-llama/Meta-Llama-3-70B
 meta-llama/Meta-Llama-3-70B-Instruct
 meta-llama/Meta-Llama-3-8B
 meta-llama/Meta-Llama-3-8B-Instruct
+meta-llama/Meta-Llama-3.1-70B
+meta-llama/Meta-Llama-3.1-70B-Instruct
+meta-llama/Meta-Llama-3.1-8B
+meta-llama/Meta-Llama-3.1-8B-Instruct
 microsoft/phi-1_5
 microsoft/phi-2
 microsoft/Phi-3-mini-4k-instruct
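Tying the pieces above together, a hedged sketch of how the new repo IDs and config hub files would typically be exercised end to end. The `--config` invocation follows the pattern documented in `config_hub/finetune/README.md`, and the `final` output path is an assumption based on the `out_dir` values in the YAML files above rather than something verified against this branch.

```bash
# Sketch: download the base weights, finetune with the new LoRA config, then chat with the result.
litgpt download meta-llama/Meta-Llama-3.1-8B
litgpt finetune_lora --config config_hub/finetune/llama-3.1-8b/lora.yaml
# Assumed location of the merged checkpoint written by the LoRA run above.
litgpt chat out/finetune/lora-llama-3.1-8b/final
```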