073 scripts for benchmarks #372

Merged: 3 commits, Jun 18, 2024
149 changes: 149 additions & 0 deletions scripts/convert_hf_checkpoint.py
@@ -0,0 +1,149 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copied from https://github.com/pytorch-labs/gpt-fast/blob/main/scripts/convert_hf_checkpoint.py
import json
import re
import shutil
import sys
from pathlib import Path
from typing import Optional

import torch

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from model import ModelArgs


@torch.inference_mode()
def convert_hf_checkpoint(
    *,
    checkpoint_dir: Path = Path("checkpoints/meta-Transformer/Transformer-2-7b-chat-hf"),
    model_name: Optional[str] = None,
) -> None:
    if model_name is None:
        model_name = checkpoint_dir.name

    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth files
    # need to be copied into model.pth.
    # Llama 3 70B can't easily be merged into one model.pth file, though, since the names
    # of the weights in the state dict are the same in each consolidated.NN.pth file
    # (a plain dict merge would overwrite them). Thus, it is not currently supported.
    # Along with this, we need to copy the original/tokenizer.model file into the
    # top-level checkpoint directory.
    is_llama3 = "Llama-3" in model_name
    if is_llama3:
        # Check whether there are multiple original/consolidated.NN.pth files and report
        # an error if so, since merging is unsupported for Llama 3.
        original_dir = checkpoint_dir / "original"
        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
        if len(bin_files) > 1:
            raise ValueError(
                f"Multiple consolidated.NN.pth files found in {original_dir}. "
                "Merging them into one model.pth file is not supported for Llama 3.")


    config = ModelArgs.from_name(model_name)
    print(f"Model config {config.__dict__}")

    # Load the json file containing the weight mapping
    if not is_llama3:
        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"

        assert model_map_json.is_file()

        with open(model_map_json) as json_map:
            bin_index = json.load(json_map)

        weight_map = {
            "model.embed_tokens.weight": "tok_embeddings.weight",
            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
            "model.layers.{}.self_attn.rotary_emb.inv_freq": None,
            "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight",
            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
            "model.norm.weight": "norm.weight",
            "lm_head.weight": "output.weight",
        }
        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
    else:
        # There is no separate pytorch_model.bin.index.json file for Llama 3;
        # instead, we just use all of the original/consolidated.NN.pth files.
        weight_map = None
        original_dir = checkpoint_dir / "original"
        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}


    def permute(w, n_head):
        dim = config.dim
        return (
            w.view(n_head, 2, config.head_dim // 2, dim)
            .transpose(1, 2)
            .reshape(config.head_dim * n_head, dim)
        )

    merged_result = {}
    for file in sorted(bin_files):
        state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
        merged_result.update(state_dict)
    final_result = {}
    if weight_map is not None:
        for key, value in merged_result.items():
            if "layers" in key:
                abstract_key = re.sub(r'(\d+)', '{}', key)
                layer_num = re.search(r'\d+', key).group(0)
                new_key = weight_map[abstract_key]
                if new_key is None:
                    continue
                new_key = new_key.format(layer_num)
            else:
                new_key = weight_map[key]

            final_result[new_key] = value

        for key in tuple(final_result.keys()):
            if "wq" in key:
                q = final_result[key]
                k = final_result[key.replace("wq", "wk")]
                v = final_result[key.replace("wq", "wv")]
                q = permute(q, config.n_head)
                k = permute(k, config.n_local_heads)
                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
                del final_result[key]
                del final_result[key.replace("wq", "wk")]
                del final_result[key.replace("wq", "wv")]
    else:
        final_result = merged_result
    print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
    torch.save(final_result, checkpoint_dir / "model.pth")
    if is_llama3:
        original_dir = checkpoint_dir / "original"
        tokenizer_model = original_dir / "tokenizer.model"
        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Convert HuggingFace checkpoint.')
    parser.add_argument('--checkpoint_dir', type=Path, default=Path("checkpoints/meta-llama/llama-2-7b-chat-hf"))
    parser.add_argument('--model_name', type=str, default=None)

    args = parser.parse_args()
    convert_hf_checkpoint(
        checkpoint_dir=args.checkpoint_dir,
        model_name=args.model_name,
    )
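
To make the mapping above concrete, here is a small self-contained sketch of the renaming and permute logic, run on a toy configuration (the n_head=2, head_dim=4, dim=8 values and the single-entry weight_map are made up for illustration; the real script takes its shapes from ModelArgs):

import re

import torch

# One representative entry from the weight_map above.
weight_map = {"model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight"}

key = "model.layers.0.self_attn.q_proj.weight"
abstract_key = re.sub(r"(\d+)", "{}", key)            # "model.layers.{}.self_attn.q_proj.weight"
layer_num = re.search(r"\d+", key).group(0)           # "0"
new_key = weight_map[abstract_key].format(layer_num)
assert new_key == "layers.0.attention.wq.weight"

def permute(w, n_head, head_dim, dim):
    # Reorder each head's rows from two contiguous rotary halves into
    # interleaved (pairwise) order; the overall shape is unchanged.
    return (
        w.view(n_head, 2, head_dim // 2, dim)
        .transpose(1, 2)
        .reshape(head_dim * n_head, dim)
    )

w = torch.arange(2 * 4 * 8, dtype=torch.float32).reshape(2 * 4, 8)
assert permute(w, n_head=2, head_dim=4, dim=8).shape == w.shape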
32 changes: 32 additions & 0 deletions scripts/download.py
@@ -0,0 +1,32 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copied from https://github.com/pytorch-labs/gpt-fast/blob/main/scripts/download.py
import os
from typing import Optional

from requests.exceptions import HTTPError


def hf_download(repo_id: Optional[str] = None, hf_token: Optional[str] = None) -> None:
    from huggingface_hub import snapshot_download
    os.makedirs(f"checkpoints/{repo_id}", exist_ok=True)
    try:
        snapshot_download(repo_id, local_dir=f"checkpoints/{repo_id}", local_dir_use_symlinks=False, token=hf_token)
    except HTTPError as e:
        if e.response.status_code == 401:
            print("You need to pass a valid `--hf_token=...` to download private checkpoints.")
        else:
            raise e

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Download data from HuggingFace Hub.')
    parser.add_argument('--repo_id', type=str, default="meta-llama/llama-2-7b-chat-hf", help='Repository ID to download from.')
    parser.add_argument('--hf_token', type=str, default=None, help='HuggingFace API token.')

    args = parser.parse_args()
    hf_download(args.repo_id, args.hf_token)
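
For selectively downloading a single model from Python rather than the shell, a minimal sketch (the scripts.download import path is an assumption and requires running from the torchao root; the repo id is taken from prepare.sh):

from scripts.download import hf_download  # assumed importable from the torchao root

# Downloads one model into checkpoints/<repo_id>; pass a real token for gated repos.
hf_download("meta-llama/Meta-Llama-3-8B", hf_token=None)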
4 changes: 4 additions & 0 deletions scripts/prepare.sh
Collaborator:
I think it would be nice if the model name could be passed as a command-line parameter, because then I could selectively download one model at a time. But it's also just two commands, so it's not really a big deal either way.

Contributor Author:
You can use the download.py script to do that (e.g. `python scripts/download.py --repo_id meta-llama/Meta-Llama-3-8B`); prepare.sh is meant to be set up to directly handle the things we advertise as our benchmarks.

@@ -0,0 +1,4 @@
python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf
python scripts/download.py --repo_id meta-llama/Meta-Llama-3-8B
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3-8B
10 changes: 5 additions & 5 deletions test/quantization/test_quant_api.py
@@ -242,7 +242,7 @@ def test_8da4w_gptq_quantizer(self):
        # should be similar to TorchCompileDynamicQuantizer
        precision = torch.bfloat16
        device = "cpu"
-       checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+       checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
        model = Transformer.from_name(checkpoint_path.parent.name)
        checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
        model.load_state_dict(checkpoint, assign=True)
@@ -305,7 +305,7 @@ def test_8da4w_quantizer_eval(self):

        precision = torch.bfloat16
        device = "cpu"
-       checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+       checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
        model = Transformer.from_name(checkpoint_path.parent.name)
        checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
        model.load_state_dict(checkpoint, assign=True)
@@ -341,7 +341,7 @@ def test_gptq_quantizer_int4wo(self):
        torchao._models.llama.model.use_index_put_for_kv_cache = True
        precision = torch.bfloat16
        device = "cuda"
-       checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+       checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
        model = Transformer.from_name(checkpoint_path.parent.name)
        checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
        model.load_state_dict(checkpoint, assign=True)
@@ -402,7 +402,7 @@ def test_quantizer_int4wo(self):
        from torchao._models._eval import TransformerEvalWrapper
        precision = torch.bfloat16
        device = "cuda"
-       checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+       checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
        model = Transformer.from_name(checkpoint_path.parent.name)
        checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
        model.load_state_dict(checkpoint, assign=True)
@@ -438,7 +438,7 @@ def test_eval_wrapper(self):
        from torchao._models._eval import TransformerEvalWrapper
        precision = torch.bfloat16
        device = "cuda"
-       checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+       checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
        model = Transformer.from_name(checkpoint_path.parent.name)
        checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
        model.load_state_dict(checkpoint, assign=True)
12 changes: 12 additions & 0 deletions torchao/_models/llama/README.md
@@ -0,0 +1,12 @@
# Llama Benchmarks

The llama folder contains code and scripts for stable benchmarking of llama models.

To get model weights, go to https://huggingface.co/meta-llama/Llama-2-7b and/or https://huggingface.co/meta-llama/Meta-Llama-3-8B
and follow the steps to gain access.

Then, from the torchao root directory, use `huggingface-cli login` and follow the steps to log in. Next, run `sh ./scripts/prepare.sh` to
download and convert the model weights.

Once that is done, you can execute benchmarks from the torchao/_models/llama directory with `sh benchmarks.sh`. You can also run
benchmarks directly using `generate.py`.
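
For reference, a minimal sketch of loading the converted model.pth the way the tests in this PR do (the Transformer import path and checkpoint location are assumptions):

from pathlib import Path

import torch

from torchao._models.llama.model import Transformer  # assumed import path

precision = torch.bfloat16
checkpoint_path = Path("checkpoints/meta-llama/Meta-Llama-3-8B/model.pth")  # produced by prepare.sh
model = Transformer.from_name(checkpoint_path.parent.name)
checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
model.load_state_dict(checkpoint, assign=True)
model = model.to(dtype=precision)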
2 changes: 1 addition & 1 deletion torchao/_models/llama/benchmarks.sh
@@ -1,4 +1,4 @@
-export CHECKPOINT_PATH=../../../../gpt-fast/checkpoints # path to checkpoints folder
+export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder

export MODEL_REPO=meta-llama/Meta-Llama-3-8B
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt