This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[Model Enabling] llama3-8b-instruct-chat Enabling #225

Merged 10 commits on Apr 19, 2024
Changes from 7 commits
13 changes: 13 additions & 0 deletions docs/supported_models.md
@@ -24,6 +24,19 @@ Neural Speed supports the following models:
 </tr>
 </thead>
 <tbody>
+<tr>
+  <td><a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct" target="_blank" rel="noopener noreferrer">Meta-Llama-3-8B-Instruct</a></td>
+  <td>✅</td>
+  <td></td>
+  <td></td>
+  <td></td>
+  <td>✅</td>
+  <td></td>
+  <td></td>
+  <td></td>
+  <td>Latest</td>
+  <td>8192</td>
+</tr>
 <tr>
   <td><a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
   <a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-13B</a>,
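With this row in place, the Llama-3 instruct checkpoint can be used like any other supported model. A minimal inference sketch, assuming the usual Neural Speed Python entry point from the project README; the model id comes from the table above, while the int4/int8 dtype arguments and generation settings are illustrative and not part of this PR:

from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

# The table row above only asserts that Meta-Llama-3-8B-Instruct is supported
# (max tokens 8192); the settings below are assumed defaults for a quick test.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = Model()
model.init(model_name, weight_dtype="int4", compute_dtype="int8")  # assumed kwargs
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)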
2 changes: 1 addition & 1 deletion neural_speed/application/quant_model.cpp
@@ -53,7 +53,7 @@ int main(int argc, char** argv) {
   const std::string fname_inp = q_params.model_file;
   const std::string fname_out = q_params.out_file;
   ne_ftype ftype = quant_params_to_ftype(q_params);
-  printf("ne_ftype: %d\n", ftype);
+  printf("%s: quant_params_to_ftype: %d\n", __func__, ftype);
   const int nthread = q_params.nthread;
 
   const int64_t t_main_start_us = ne_time_us();
16 changes: 15 additions & 1 deletion neural_speed/convert/__init__.py
@@ -28,6 +28,8 @@
"phi-msft": "phi"
}

llama3_vocab_size = 128256


def convert_model(model, outfile, outtype="f32", format="NE", model_hub="huggingface", use_quantized_model=False):
if model_hub == "modelscope":
@@ -37,11 +39,23 @@ def convert_model(model, outfile, outtype="f32", format="NE", model_hub="hugging
     config = AutoConfig.from_pretrained(model, trust_remote_code=True)
     model_type = model_maps.get(config.model_type, config.model_type)
 
+    cmd = []
     if use_quantized_model:
         path = Path(Path(__file__).parent.absolute(), "convert_quantized_{}.py".format(model_type))
     else:
         path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type))
-    cmd = []
+
+    if config.vocab_size == llama3_vocab_size:
+        path = Path(Path(__file__).parent.absolute(), "convert_llama3.py".format(model_type))
+        cmd.extend(["python", path])
+        cmd.extend(["--outfile", outfile])
+        cmd.extend(["--outtype", outtype])
+        cmd.extend([model])
+        cmd.extend(["--vocab-type", "bpe"])
+        print("cmd:", cmd)
+        subprocess.run(cmd)
+        return
+
     cmd.extend(["python", path])
     cmd.extend(["--outfile", outfile])
     cmd.extend(["--outtype", outtype])
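The practical effect of the convert/__init__.py change: when the loaded config reports the Llama-3 vocabulary size (128256), conversion is routed to convert_llama3.py with a BPE vocab instead of the per-architecture convert_<model_type>.py script. A small usage sketch against the entry point shown in the diff; the output filename is illustrative:

from neural_speed.convert import convert_model

# For Meta-Llama-3-8B-Instruct, config.vocab_size == 128256, so convert_model
# builds and runs roughly:
#   python .../convert_llama3.py --outfile llama3-8b-instruct-f32.bin \
#       --outtype f32 meta-llama/Meta-Llama-3-8B-Instruct --vocab-type bpe
# and then returns; other models still take the existing convert path.
convert_model("meta-llama/Meta-Llama-3-8B-Instruct", "llama3-8b-instruct-f32.bin", outtype="f32")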