This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[Model Enabling] llama3-8b-instruct-chat Enabling #225

Merged · 10 commits · Apr 19, 2024
2 changes: 1 addition & 1 deletion neural_speed/application/quant_model.cpp
@@ -53,7 +53,7 @@ int main(int argc, char** argv) {
   const std::string fname_inp = q_params.model_file;
   const std::string fname_out = q_params.out_file;
   ne_ftype ftype = quant_params_to_ftype(q_params);
-  printf("ne_ftype: %d\n", ftype);
+  printf("%s: quant_params_to_ftype: %d\n", __func__, ftype);
   const int nthread = q_params.nthread;
 
   const int64_t t_main_start_us = ne_time_us();
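As an aside, and not part of this PR: the change above simply tags the quantization log line with the enclosing function's name via C's `__func__`. A minimal Python sketch of the same diagnostic pattern, using the stdlib logging module's funcName field; the function name `quantize_model` here is hypothetical:

```python
# Illustration only: mirrors the C++ change, which prefixes the ftype log
# line with the enclosing function's name via __func__.
import logging

logging.basicConfig(format="%(funcName)s: %(message)s", level=logging.INFO)

def quantize_model(ftype: int) -> None:
    # Comparable to: printf("%s: quant_params_to_ftype: %d\n", __func__, ftype);
    logging.info("quant_params_to_ftype: %d", ftype)

quantize_model(2)  # logs "quantize_model: quant_params_to_ftype: 2"
```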
16 changes: 15 additions & 1 deletion neural_speed/convert/__init__.py
@@ -28,6 +28,8 @@
     "phi-msft": "phi"
 }
 
+llama3_vocab_size = 128256
+
 
 def convert_model(model, outfile, outtype="f32", format="NE", model_hub="huggingface", use_quantized_model=False):
     if model_hub == "modelscope":
@@ -37,11 +39,23 @@ def convert_model(model, outfile, outtype="f32", format="NE", model_hub="hugging
     config = AutoConfig.from_pretrained(model, trust_remote_code=True)
     model_type = model_maps.get(config.model_type, config.model_type)
 
+    cmd = []
     if use_quantized_model:
         path = Path(Path(__file__).parent.absolute(), "convert_quantized_{}.py".format(model_type))
     else:
         path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type))
-    cmd = []
+
+    if config.vocab_size == llama3_vocab_size:
+        path = Path(Path(__file__).parent.absolute(), "convert_llama3.py")
+        cmd.extend(["python", path])
+        cmd.extend(["--outfile", outfile])
+        cmd.extend(["--outtype", outtype])
+        cmd.extend([model])
+        cmd.extend(["--vocab-type", "bpe"])
+        print("cmd:", cmd)
+        subprocess.run(cmd)
+        return
+
     cmd.extend(["python", path])
     cmd.extend(["--outfile", outfile])
     cmd.extend(["--outtype", outtype])
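Taken together, the Python change routes any checkpoint whose tokenizer vocabulary has 128256 entries (the Llama 3 size, read from the HuggingFace AutoConfig) to a dedicated BPE-based converter before the generic per-model dispatch runs. A self-contained sketch of that routing; the helper name `build_convert_cmd` and the model/output names are hypothetical, and the real entry point is `convert_model()`, which launches the resulting command with `subprocess.run(cmd)`:

```python
# Standalone sketch of the dispatch added above. Assumes the converter
# scripts live next to this file, as in neural_speed/convert/__init__.py.
from pathlib import Path

llama3_vocab_size = 128256  # vocabulary size that identifies Llama 3 checkpoints

def build_convert_cmd(vocab_size: int, model_type: str, model: str,
                      outfile: str, outtype: str = "f32") -> list:
    script_dir = Path(__file__).parent.absolute()
    if vocab_size == llama3_vocab_size:
        # Llama 3 ships a BPE tokenizer, so hand off to the dedicated script.
        return ["python", str(script_dir / "convert_llama3.py"),
                "--outfile", outfile, "--outtype", outtype, model,
                "--vocab-type", "bpe"]
    # Otherwise fall through to the generic per-model converter.
    return ["python", str(script_dir / "convert_{}.py".format(model_type)),
            "--outfile", outfile, "--outtype", outtype, model]

# Hypothetical usage:
print(build_convert_cmd(128256, "llama",
                        "meta-llama/Meta-Llama-3-8B-Instruct",
                        "ne-llama3-f32.bin"))
```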