This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[Model Enabling] llama3-8b-instruct-chat Enabling #225

Merged · 10 commits · Apr 19, 2024
13 changes: 13 additions & 0 deletions docs/supported_models.md
@@ -24,6 +24,19 @@ Neural Speed supports the following models:
</tr>
</thead>
<tbody>
<tr>
<td><a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct" target="_blank" rel="noopener noreferrer">Meta-Llama-3-8B-Instruct</a></td>
<td>✅</td>
<td></td>
<td></td>
<td></td>
<td>✅</td>
<td></td>
<td></td>
<td></td>
<td>Latest</td>
<td>8192</td>
</tr>
<tr>
<td><a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
<a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-13B</a>,
169 changes: 101 additions & 68 deletions neural_speed/__init__.py
@@ -20,54 +20,57 @@
from neural_speed.convert import convert_model

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}
vocab_size_map = {"llama3": 128256}
max_request_num_default = 1


def _import_package(model_type):
if model_type == "gptj":
import neural_speed.gptj_cpp as cpp_model
elif model_type == "falcon":
import neural_speed.falcon_cpp as cpp_model
elif model_type == "gptneox":
import neural_speed.gptneox_cpp as cpp_model
elif model_type == "dolly":
import neural_speed.dolly_cpp as cpp_model
elif model_type == "llama" or model_type == "llama2":
import neural_speed.llama_cpp as cpp_model
elif model_type == "mpt":
import neural_speed.mpt_cpp as cpp_model
elif model_type == "gpt_bigcode" or model_type == "starcoder":
import neural_speed.starcoder_cpp as cpp_model
elif model_type == "opt":
import neural_speed.opt_cpp as cpp_model
elif model_type == "bloom":
import neural_speed.bloom_cpp as cpp_model
elif model_type == "chatglm":
import neural_speed.chatglm_cpp as cpp_model
elif model_type == "chatglm2" or model_type == "chatglm3":
import neural_speed.chatglm2_cpp as cpp_model
elif model_type == "baichuan":
import neural_speed.baichuan_cpp as cpp_model
elif model_type == "polyglot":
import neural_speed.polyglot_cpp as cpp_model
elif model_type == "qwen":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "mistral":
import neural_speed.mistral_cpp as cpp_model
elif model_type == "qwen2":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "phi":
import neural_speed.phi_cpp as cpp_model
elif model_type == "gemma":
import neural_speed.gemma_cpp as cpp_model
elif model_type == "stablelm":
import neural_speed.stablelm_cpp as cpp_model
elif model_type == "whisper":
import neural_speed.whisper_cpp as cpp_model
elif model_type == "mixtral":
import neural_speed.mixtral_cpp as cpp_model
else:
raise TypeError("Unsupported model type {}!".format(model_type))
return cpp_model
if model_type == "gptj":
import neural_speed.gptj_cpp as cpp_model
elif model_type == "falcon":
import neural_speed.falcon_cpp as cpp_model
elif model_type == "gptneox":
import neural_speed.gptneox_cpp as cpp_model
elif model_type == "dolly":
import neural_speed.dolly_cpp as cpp_model
elif model_type == "llama" or model_type == "llama2":
import neural_speed.llama_cpp as cpp_model
elif model_type == "mpt":
import neural_speed.mpt_cpp as cpp_model
elif model_type == "gpt_bigcode" or model_type == "starcoder":
import neural_speed.starcoder_cpp as cpp_model
elif model_type == "opt":
import neural_speed.opt_cpp as cpp_model
elif model_type == "bloom":
import neural_speed.bloom_cpp as cpp_model
elif model_type == "chatglm":
import neural_speed.chatglm_cpp as cpp_model
elif model_type == "chatglm2" or model_type == "chatglm3":
import neural_speed.chatglm2_cpp as cpp_model
elif model_type == "baichuan":
import neural_speed.baichuan_cpp as cpp_model
elif model_type == "polyglot":
import neural_speed.polyglot_cpp as cpp_model
elif model_type == "qwen":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "mistral":
import neural_speed.mistral_cpp as cpp_model
elif model_type == "qwen2":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "phi":
import neural_speed.phi_cpp as cpp_model
elif model_type == "gemma":
import neural_speed.gemma_cpp as cpp_model
elif model_type == "stablelm":
import neural_speed.stablelm_cpp as cpp_model
elif model_type == "whisper":
import neural_speed.whisper_cpp as cpp_model
elif model_type == "mixtral":
import neural_speed.mixtral_cpp as cpp_model
else:
raise TypeError("Unsupported model type {}!".format(model_type))
return cpp_model


def _get_model_config(model_name, model_hub="huggingface"):
if model_hub == "modelscope":
@@ -78,6 +81,7 @@ def _get_model_config(model_name, model_hub="huggingface"):
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
return config


def _get_model_type(model_config):
model_type = model_maps.get(model_config.model_type, model_config.model_type)
if model_type == "chatglm" and "chatglm2" in model_config._name_or_path:
@@ -98,21 +102,24 @@ def _get_model_type(model_config):

return model_type


def _filter_model_args(valid_args, **input_kwargs):
invalid_args = []
for k in input_kwargs.keys():
if k not in valid_args:
invalid_args.append(k)
for k in invalid_args:
input_kwargs.pop(k)
return input_kwargs


def get_cpp_module(model_name, model_hub="huggingface"):
model_config = _get_model_config(model_name, model_hub=model_hub)
model_type = _get_model_type(model_config)
cpp_module = _import_package(model_type)
return cpp_module


class Model:

def __init__(self):
@@ -123,6 +130,7 @@ def __init__(self):
self.generate_round = 0
self.max_request_num = -1
self.reinit_from_bin = False
self.tokenizer = None

def init(self,
model_name,
@@ -140,6 +148,11 @@ def init(self,
self.config = _get_model_config(model_name, model_hub=model_hub)
model_type = _get_model_type(self.config)
self.model_type = model_type
if self.__check_llama3():
print("The model_type: Llama3.")
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name)

if self.module is None:
self.module = _import_package(model_type)

@@ -165,8 +178,8 @@

quant_desc = model_config['quantization_config'].get("quant_method", None)
if quant_desc is None:
print("Error: No quant_method info in model config...")
exit(0)
print("Error: No quant_method info in model config...")
exit(0)
quant_bin = "{}/ne_{}_q_{}.bin".format(output_path, model_type, quant_desc)

if not use_quant:
@@ -294,11 +307,12 @@ def get_scratch_size_ratio(size):
else:
generate_kwargs["scratch_size_ratio"] = 35

valid_args = {"max_new_tokens", "n_batch", "ctx_size", "seed", "threads", "repetition_penalty",
"num_beams", "do_sample", "top_k", "top_p", "temperature", "min_new_tokens",
"length_penalty", "early_stopping", "n_keep", "n_discard", "shift_roped_k",
"batch_size","pad_token", "memory_dtype", "continuous_batching", "max_request_num",
"scratch_size_ratio"}
valid_args = {
"max_new_tokens", "n_batch", "ctx_size", "seed", "threads", "repetition_penalty", "num_beams", "do_sample",
"top_k", "top_p", "temperature", "min_new_tokens", "length_penalty", "early_stopping", "n_keep",
"n_discard", "shift_roped_k", "batch_size", "pad_token", "memory_dtype", "continuous_batching",
"max_request_num", "scratch_size_ratio"
}
self.model.init_model(model_path, **_filter_model_args(valid_args, **generate_kwargs))

def quant_model(self, model_type, model_path, out_path, **quant_kwargs):
@@ -314,6 +328,10 @@ def generate(self,
stopping_criteria=None,
**generate_kwargs):
batch_size = input_ids.shape[0]
if self.__check_llama3():
if int(input_ids[0][0]) != self.tokenizer.bos_token_id:
bos_token_tensor = torch.tensor([[self.tokenizer.bos_token_id]])
input_ids = torch.cat((bos_token_tensor, input_ids), dim=1)

max_new_tokens = generate_kwargs.get("max_new_tokens", -1)
self.reinit_from_bin = False
@@ -366,7 +384,7 @@ def generate(self,
if stopping_criteria is not None:
if stopping_criteria(torch.tensor(ret), None):
break
elif ret[0][-1] == self.__get_eos_id() or ret[0][-1] == self.__get_special_eos_id() or \
(max_new_tokens != -1 and out_count >= max_new_tokens):
break
if streamer:
@@ -380,6 +398,19 @@ def generate(self,
def is_token_end(self):
return self.model.is_token_end()

def __check_llama3(self):
if self.model_type == "llama" and self.config.vocab_size == vocab_size_map["llama3"]:
return True
return False

def __get_special_eos_id(self):
if self.__check_llama3():
eot_id = self.tokenizer("<|eot_id|>")["input_ids"][0]

return eot_id

return self.model.get_eos_id()

def __get_eos_id(self):
return self.model.get_eos_id()

@@ -438,7 +469,7 @@ def _cont_batching_input(self, input_ids, pad_token_id=None, padding_side="left"
if padding_side == "left":
del input_list[il][0:count]
elif padding_side == "right":
del input_list[il][len(input_list[il]) - count:]
else:
raise ValueError("padding_side must be 'left' or 'right'.")
assert input_list[il] != [], "there are all pad tokens in batch {}.".format(il)
@@ -465,17 +496,18 @@ def _get_model_input_list(self, input_ids, **kwargs):


class ModelServer:

def __init__(self, model_name, reponse_function, model_path, **server_kwargs):
if not os.path.exists(model_path):
raise ValueError("model file {} does not exist.".format(model_path))
self.module = get_cpp_module(model_name)
valid_args = {"max_new_tokens", "n_batch", "ctx_size", "seed", "threads", "repetition_penalty",
"num_beams", "do_sample", "top_k", "top_p", "temperature", "min_new_tokens",
"length_penalty", "early_stopping", "n_keep", "n_discard", "shift_roped_k",
"batch_size","pad_token", "memory_dtype", "continuous_batching", "max_request_num",
"scratch_size_ratio", "return_prompt", "print_log", "init_cb"}
self.cpp_server = self.module.ModelServer(reponse_function,
model_path,
valid_args = {
"max_new_tokens", "n_batch", "ctx_size", "seed", "threads", "repetition_penalty", "num_beams", "do_sample",
"top_k", "top_p", "temperature", "min_new_tokens", "length_penalty", "early_stopping", "n_keep",
"n_discard", "shift_roped_k", "batch_size", "pad_token", "memory_dtype", "continuous_batching",
"max_request_num", "scratch_size_ratio", "return_prompt", "print_log", "init_cb"
}
self.cpp_server = self.module.ModelServer(reponse_function, model_path,
**_filter_model_args(valid_args, **server_kwargs))

def issueQuery(self, index, token_ids):
@@ -484,4 +516,5 @@ def issueQuery(self, index, token_ids):
def Empty(self):
return self.cpp_server.Empty()


__all__ = ["get_cpp_module", "Model", "ModelServer"]
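For context, a minimal usage sketch of the Llama-3 path enabled above. The init() keyword arguments (weight_dtype, compute_dtype) follow the project README and are assumptions, not part of this diff; the BOS prepending and the <|eot_id|> stop condition are the behaviors added in generate():

from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Build a chat-formatted prompt so the Llama-3 special tokens are present.
messages = [{"role": "user", "content": "What is Neural Speed?"}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

model = Model()
# weight_dtype/compute_dtype are README-style kwargs, assumed here.
model.init(model_name, weight_dtype="int4", compute_dtype="int8")

# generate() prepends the BOS token if it is missing and, for Llama-3,
# also stops on the <|eot_id|> token via __get_special_eos_id().
streamer = TextStreamer(tokenizer, skip_prompt=True)
outputs = model.generate(input_ids, streamer=streamer, max_new_tokens=128)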
2 changes: 1 addition & 1 deletion neural_speed/application/quant_model.cpp
@@ -53,7 +53,7 @@ int main(int argc, char** argv) {
const std::string fname_inp = q_params.model_file;
const std::string fname_out = q_params.out_file;
ne_ftype ftype = quant_params_to_ftype(q_params);
printf("ne_ftype: %d\n", ftype);
printf("%s: quant_params_to_ftype: %d\n", __func__, ftype);
const int nthread = q_params.nthread;

const int64_t t_main_start_us = ne_time_us();
16 changes: 15 additions & 1 deletion neural_speed/convert/__init__.py
@@ -28,6 +28,8 @@
"phi-msft": "phi"
}

llama3_vocab_size = 128256


def convert_model(model, outfile, outtype="f32", format="NE", model_hub="huggingface", use_quantized_model=False):
if model_hub == "modelscope":
@@ -37,11 +39,23 @@ def convert_model(model, outfile, outtype="f32", format="NE", model_hub="hugging
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
model_type = model_maps.get(config.model_type, config.model_type)

cmd = []
if use_quantized_model:
path = Path(Path(__file__).parent.absolute(), "convert_quantized_{}.py".format(model_type))
else:
path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type))

if config.vocab_size == llama3_vocab_size:
path = Path(Path(__file__).parent.absolute(), "convert_llama3.py")
cmd.extend(["python", path])
cmd.extend(["--outfile", outfile])
cmd.extend(["--outtype", outtype])
cmd.extend([model])
cmd.extend(["--vocab-type", "bpe"])
print("cmd:", cmd)
subprocess.run(cmd)
return

cmd.extend(["python", path])
cmd.extend(["--outfile", outfile])
cmd.extend(["--outtype", outtype])
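To make the new llama3 routing concrete, this is roughly the command list that convert_model() assembles when config.vocab_size matches the Llama-3 vocabulary. The script name convert_llama3.py and the --vocab-type bpe flag come from the diff; the model id and output file below are illustrative:

import subprocess
from pathlib import Path

model = "meta-llama/Meta-Llama-3-8B-Instruct"  # illustrative; any checkpoint with vocab_size 128256
outfile = "ne_llama_f32.bin"                   # illustrative output name
outtype = "f32"

# Mirrors the branch above: convert_llama3.py sits next to the other convert scripts in the package.
path = Path("neural_speed/convert/convert_llama3.py")
cmd = ["python", str(path), "--outfile", outfile, "--outtype", outtype, model, "--vocab-type", "bpe"]
subprocess.run(cmd)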