From 3e45c1c873af64920572d067f6e34bc2ecccdc21 Mon Sep 17 00:00:00 2001
From: lvhan028
Date: Thu, 22 Jun 2023 12:28:46 +0800
Subject: [PATCH 1/2] remove constraints on model name

---
 llmdeploy/serve/fastertransformer/deploy.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/llmdeploy/serve/fastertransformer/deploy.py b/llmdeploy/serve/fastertransformer/deploy.py
index 186d3a56de..a9d2056dc1 100644
--- a/llmdeploy/serve/fastertransformer/deploy.py
+++ b/llmdeploy/serve/fastertransformer/deploy.py
@@ -12,10 +12,6 @@
 import torch
 from sentencepiece import SentencePieceProcessor
 
-supported_models = [
-    'vicuna-7b', 'vicuna-13b', 'llama-7b', 'llama-13b', 'llama-30b',
-    'llama-65b'
-]
 supported_formats = ['llama', 'hf']
 
 
@@ -379,7 +375,7 @@ def main(model_name: str,
 
     Args:
         model_name (str): the name of the to-be-deployed model, such as
-            llama-7b, llama-13b and etc
+            llama-7b, llama-13b, vicuna-7b and etc
         model_path (str): the directory path of the model
         model_format (str): the format of the model, fb or hf. 'fb' stands
             for META's llama format, and 'hf' means huggingface format
@@ -387,10 +383,6 @@ def main(model_name: str,
         dst_path (str): the destination path that saves outputs
         tp (int): the number of GPUs used for tensor parallelism
     """
-    if model_name.lower() not in supported_models:
-        print(f'"{model_name}" is not supported. The supported models are: '
-              f'{supported_models}')
-        exit(-1)
 
     if model_format not in supported_formats:
         print(f'the model format "{model_format}" is not supported. '
@@ -409,7 +401,6 @@ def main(model_name: str,
     if triton_models_path is None:
         exit(-1)
 
-    model_name = model_name.lower()
     if model_format == 'llama':
         res = deploy_llama(model_name, model_path, tokenizer_path,
                            triton_models_path, tp)

From 6e6e18264af0384afcd223d619111c08cc350867 Mon Sep 17 00:00:00 2001
From: lvhan028
Date: Thu, 22 Jun 2023 12:31:55 +0800
Subject: [PATCH 2/2] remove duplicate model converter

---
 examples/cpp/llama/llama_ckpt_convert.py | 210 -----------------------
 1 file changed, 210 deletions(-)
 delete mode 100644 examples/cpp/llama/llama_ckpt_convert.py

diff --git a/examples/cpp/llama/llama_ckpt_convert.py b/examples/cpp/llama/llama_ckpt_convert.py
deleted file mode 100644
index a3ae4d1d00..0000000000
--- a/examples/cpp/llama/llama_ckpt_convert.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import torch
-import fire
-import os.path as osp
-from os import makedirs
-from pathlib import Path
-import safetensors
-from typing import List
-from tqdm import tqdm
-
-
-def import_fb(ckpt_dir: str):
-    checkpoints = []
-    for pattern in ['*.pth', '*.pt']:
-        checkpoints += sorted(Path(ckpt_dir).glob(pattern))
-    print(checkpoints)
-    n_ckpt = len(checkpoints)
-    model_params = {}
-
-    def get_param(name, size):
-        print(name, size)
-        if name not in model_params:
-            model_params[name] = torch.zeros(
-                size, dtype=torch.float16, device='cpu')
-        return model_params[name]
-
-    for i, ckpt_path in enumerate(checkpoints):
-        ckpt = torch.load(ckpt_path, map_location='cpu')
-        for param_name, param_data in ckpt.items():
-            key = param_name.split('.')[-2]
-            if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']:  # column-parallel
-                size = param_data.size(0)
-                param = get_param(
-                    param_name, [size * n_ckpt, param_data.size(1)])
-                param.data[size * i: size * (i + 1), :] = param_data
-            elif key in ['w2', 'wo', 'tok_embeddings']:  # row-parallel
-                size = param_data.size(-1)
-                param = get_param(
-                    param_name, [param_data.size(0), size * n_ckpt])
-                param.data[:, size * i: size * (i + 1)] = param_data
-            elif i == 0:
-                param = get_param(param_name, param_data.size())
-                param.data = param_data
-        del ckpt
-
-    for name, param in model_params.items():
-        # transpose all weights as FasterTransformer is expecting column-major weights
-        # (output_dims, input_dims) -> (input_dims, output_dims)
-        key = name.split('.')[-2]
-        if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
-            param.data = param.data.t()
-
-    # concat qkv projection
-    for i in range(1000):
-        _qkv = [f'layers.{i}.attention.{k}.weight' for k in ['wq', 'wk', 'wv']]
-        try:
-            qkv = tuple(map(model_params.pop, _qkv))
-        except KeyError:
-            break
-        qkv = torch.stack(qkv, dim=1)
-        model_params[f'layers.{i}.attention.w_qkv.weight'] = qkv
-        print(qkv.shape, qkv.dtype)
-
-    return model_params
-
-
-def permute(x: torch.Tensor):
-    SIZE_PER_HEAD = 128
-    if x.shape[-1] > 1:  # qweights
-        dim = x.shape[-1]
-        n_heads = dim // SIZE_PER_HEAD
-        return x.view(-1, n_heads, 2, dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
-    else:  # scales, zeros
-        dim = x.shape[0]
-        n_heads = dim // SIZE_PER_HEAD
-        return x.view(n_heads, 2, dim // n_heads // 2, 1).transpose(1, 2).reshape(dim, 1)
-
-
-def check_zero(x: torch.Tensor):
-    sum = x.flatten().sum().item()
-    assert sum == 0, str(sum)
-
-
-def import_gptq(path: str):
-    model_params = {}
-
-    _qweight = 'weight'
-    _suffixes = [_qweight]
-
-    n_split = 3
-    if True:
-        _params = {}
-        for i in tqdm(range(0, n_split)):
-            filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(i + 1, n_split)
-            _tmp = torch.load(osp.join(path, filename), map_location='cpu')
-            _params.update(_tmp)
-        # print('\n'.join(_params.keys()))
-
-        def get_tensor(name):
-            return _params[name]
-
-        def get_tensor_transposed(name):
-            return _params[name].t()
-
-    # _qweight = 'qweight'
-    # _suffixes = [_qweight, 'bias', 'scales', 'zeros']
-    # with safetensors.safe_open(path, framework='pt') as f:
-    #     get_tensor = f.get_tensor
-    #     # quantized weights are already in column major, no need to transpose
-    #     get_tensor_transposed = get_tensor
-
-    for i in range(1000):
-        try:
-            # attention weights
-            _qkvo = [f'model.layers.{i}.self_attn.{t}_proj' for t in 'qkvo']
-            for suffix in _suffixes:
-                q, k, v, o = map(get_tensor_transposed, map(('{}.' + suffix).format, _qkvo))
-                if suffix == 'bias':
-                    check_zero(q), check_zero(k), check_zero(v), check_zero(o)
-                else:
-                    # q, k has different layout for fb & hf, convert to fb's layout
-                    q = permute(q)
-                    k = permute(k)
-                if suffix == _qweight:  # weight, qweight
-                    # insert a dimension for splitting heads later
-                    # qkv = torch.cat([q[:, None, :], k[:, None, :], v[:, None, :]], dim=1)
-                    qkv = torch.stack((q, k, v), dim=1)
-                else:  # scales, zeros
-                    # qkv = torch.cat([q[None, :], k[None, :], v[None, :]], dim=0).squeeze(dim=-1)
-                    qkv = torch.stack((q, k, v), dim=0).squeeze(dim=-1)
-                for k, v in [('w_qkv', qkv), ('wo', o)]:
-                    model_params[f'layers.{i}.attention.{k}.{suffix}'] = v
-            # ffn weights
-            _w123 = [f'model.layers.{i}.mlp.{t}_proj' for t in ['gate', 'down', 'up']]
-            for suffix in _suffixes:
-                w1, w2, w3 = map(get_tensor_transposed, map(('{}.' + suffix).format, _w123))
-                if suffix == 'bias':
-                    check_zero(w1), check_zero(w2), check_zero(w3)
-                else:
-                    if suffix in ['scales', 'zeros']:
-                        w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), [w1, w2, w3])
-                    for k, v in [('w1', w1), ('w2', w2), ('w3', w3)]:
-                        model_params[f'layers.{i}.feed_forward.{k}.{suffix}'] = v
-            other = [('attention_norm.weight', 'input_layernorm.weight'),
-                     ('ffn_norm.weight', 'post_attention_layernorm.weight')]
-            for ours, theirs in other:
-                model_params[f'layers.{i}.' + ours] = get_tensor(f'model.layers.{i}.' + theirs)
-        except safetensors.SafetensorError:
-            break
-        except KeyError:
-            break
-        print(i)
-
-    other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
-             ('norm.weight', 'model.norm.weight'),
-             ('output.weight', 'lm_head.weight')]
-    for ours, theirs in other:
-        model_params[ours] = get_tensor(theirs)
-
-    return model_params
-
-
-def export(model_params: dict, out_dir: str, n_inference: int):
-    makedirs(out_dir, exist_ok=True)
-
-    def save_bin(param: torch.Tensor, name):
-        print(name, param.shape)
-        if param.dtype in [torch.float, torch.bfloat16]:
-            param = param.half()
-        param.contiguous().numpy().tofile(osp.join(out_dir, name))
-
-    # reverse the spliting axes since the weights are transposed above
-    for param_name, param_data in model_params.items():
-        split_dim = None
-        key, ext = param_name.split('.')[-2:]
-        copy = False
-        if key in ['w1', 'w3', 'w_qkv']:
-            split_dim = -1
-        elif key in ['w2', 'wo']:
-            if ext in ['scales', 'zeros']:
-                copy = True
-            else:
-                split_dim = 0
-        if split_dim is not None:
-            print(f'*** spliting {param_name}, shape={param_data.shape}, split_dim={split_dim}')
-            assert param_data.shape[split_dim] % n_inference == 0
-            split_size = param_data.shape[split_dim] // n_inference
-            splits = torch.split(param_data, split_size, dim=split_dim)
-            for i, split in enumerate(splits):
-                prefix, ext = osp.splitext(param_name)
-                save_bin(split, f'{prefix}.{i}{ext}')
-        elif copy:
-            print(f'### copying {param_name}, shape={param_data.shape}')
-            copies = [param_data] * n_inference
-            for i, copy in enumerate(copies):
-                prefix, ext = osp.splitext(param_name)
-                save_bin(copy, f'{prefix}.{i}{ext}')
-        else:
-            save_bin(param_data, param_name)
-
-
-def main(kind: str, input_path: str, out_dir: str, n_inference: int = 1):
-    if kind == 'fb':
-        model_params = import_fb(input_path)
-    elif kind == 'gptq':
-        model_params = import_gptq(input_path)
-    else:
-        raise RuntimeError(f'Unsupported kind: {kind}')
-
-    export(model_params, out_dir, n_inference)
-
-
-if __name__ == '__main__':
-    fire.Fire(main)
\ No newline at end of file
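
After these two patches, deploy.py no longer validates model_name against a fixed allow-list; only the model_format check against supported_formats remains. Below is a minimal sketch of driving the entry point with a model name that the removed list would have rejected. It assumes the module path llmdeploy.serve.fastertransformer.deploy and the argument names documented in the docstring above; the model name and all paths are placeholders, not values from this patch series.

    # Minimal sketch (assumed import path, placeholder arguments): with the
    # supported_models check removed, an arbitrary model name is accepted as
    # long as model_format is one of supported_formats ('llama' or 'hf').
    from llmdeploy.serve.fastertransformer.deploy import main

    main(model_name='my-finetuned-llama',            # no longer checked against an allow-list
         model_path='/path/to/hf/checkpoint',        # placeholder model directory
         model_format='hf',                          # still must be 'llama' or 'hf'
         tokenizer_path='/path/to/tokenizer.model',  # placeholder tokenizer path
         dst_path='./workspace',                     # placeholder output directory
         tp=1)                                       # number of GPUs for tensor parallelism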