This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[Model Enabling] llama3-8b-instruct-chat Enabling #225

Merged 10 commits on Apr 19, 2024
Changes from 7 commits
13 changes: 13 additions & 0 deletions docs/supported_models.md
@@ -24,6 +24,19 @@ Neural Speed supports the following models:
 </tr>
 </thead>
 <tbody>
+<tr>
+  <td><a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct" target="_blank" rel="noopener noreferrer">Meta-Llama-3-8B-Instruct</a></td>
+  <td>✅</td>
+  <td></td>
+  <td></td>
+  <td></td>
+  <td>✅</td>
+  <td></td>
+  <td></td>
+  <td></td>
+  <td>Latest</td>
+  <td>8192</td>
+</tr>
 <tr>
   <td><a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
   <a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-13B</a>,
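With this row in place, the Llama-3 instruct checkpoint can be used like any other supported model. A minimal inference sketch, assuming the usual Neural Speed Python entry point from the project README; the model id comes from the table above, while the int4/int8 dtype arguments and generation settings are illustrative and not part of this PR:

from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

# The table row above only asserts that Meta-Llama-3-8B-Instruct is supported
# (max tokens 8192); the settings below are assumed defaults for a quick test.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = Model()
model.init(model_name, weight_dtype="int4", compute_dtype="int8")  # assumed kwargs
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)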
2 changes: 1 addition & 1 deletion neural_speed/application/quant_model.cpp
@@ -53,7 +53,7 @@ int main(int argc, char** argv) {
   const std::string fname_inp = q_params.model_file;
   const std::string fname_out = q_params.out_file;
   ne_ftype ftype = quant_params_to_ftype(q_params);
-  printf("ne_ftype: %d\n", ftype);
+  printf("%s: quant_params_to_ftype: %d\n", __func__, ftype);
   const int nthread = q_params.nthread;
 
   const int64_t t_main_start_us = ne_time_us();
16 changes: 15 additions & 1 deletion neural_speed/convert/__init__.py
@@ -28,6 +28,8 @@
"phi-msft": "phi"
}

llama3_vocab_size = 128256


def convert_model(model, outfile, outtype="f32", format="NE", model_hub="huggingface", use_quantized_model=False):
if model_hub == "modelscope":
@@ -37,11 +39,23 @@ def convert_model(model, outfile, outtype="f32", format="NE", model_hub="hugging
     config = AutoConfig.from_pretrained(model, trust_remote_code=True)
     model_type = model_maps.get(config.model_type, config.model_type)
 
+    cmd = []
     if use_quantized_model:
         path = Path(Path(__file__).parent.absolute(), "convert_quantized_{}.py".format(model_type))
     else:
         path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type))
-    cmd = []
+
+    if config.vocab_size == llama3_vocab_size:
+        path = Path(Path(__file__).parent.absolute(), "convert_llama3.py".format(model_type))
+        cmd.extend(["python", path])
+        cmd.extend(["--outfile", outfile])
+        cmd.extend(["--outtype", outtype])
+        cmd.extend([model])
+        cmd.extend(["--vocab-type", "bpe"])
+        print("cmd:", cmd)
+        subprocess.run(cmd)
+        return
+
     cmd.extend(["python", path])
     cmd.extend(["--outfile", outfile])
     cmd.extend(["--outtype", outtype])
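The practical effect of the convert/__init__.py change: when the loaded config reports the Llama-3 vocabulary size (128256), conversion is routed to convert_llama3.py with a BPE vocab instead of the per-architecture convert_<model_type>.py script. A small usage sketch against the entry point shown in the diff; the output filename is illustrative:

from neural_speed.convert import convert_model

# For Meta-Llama-3-8B-Instruct, config.vocab_size == 128256, so convert_model
# builds and runs roughly:
#   python .../convert_llama3.py --outfile llama3-8b-instruct-f32.bin \
#       --outtype f32 meta-llama/Meta-Llama-3-8B-Instruct --vocab-type bpe
# and then returns; other models still take the existing convert path.
convert_model("meta-llama/Meta-Llama-3-8B-Instruct", "llama3-8b-instruct-f32.bin", outtype="f32")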