This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[Model Enabling] llama3-8b-instruct-chat Enabling #225

Merged · 10 commits · Apr 19, 2024
13 changes: 13 additions & 0 deletions docs/supported_models.md
@@ -24,6 +24,19 @@ Neural Speed supports the following models:
</tr>
</thead>
<tbody>
<tr>
<td><a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct" target="_blank" rel="noopener noreferrer">Meta-Llama-3-8B-Instruct</a></td>
<td>✅</td>
<td></td>
<td></td>
<td></td>
<td>✅</td>
<td></td>
<td></td>
<td></td>
<td>Latest</td>
<td>8192</td>
</tr>
<tr>
<td><a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
<a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-13B</a>,
169 changes: 101 additions & 68 deletions neural_speed/__init__.py
@@ -20,54 +20,57 @@
from neural_speed.convert import convert_model

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}
vocab_size_map = {"llama3": 128256}
max_request_num_default = 1


def _import_package(model_type):
if model_type == "gptj":
import neural_speed.gptj_cpp as cpp_model
elif model_type == "falcon":
import neural_speed.falcon_cpp as cpp_model
elif model_type == "gptneox":
import neural_speed.gptneox_cpp as cpp_model
elif model_type == "dolly":
import neural_speed.dolly_cpp as cpp_model
elif model_type == "llama" or model_type == "llama2":
import neural_speed.llama_cpp as cpp_model
elif model_type == "mpt":
import neural_speed.mpt_cpp as cpp_model
elif model_type == "gpt_bigcode" or model_type == "starcoder":
import neural_speed.starcoder_cpp as cpp_model
elif model_type == "opt":
import neural_speed.opt_cpp as cpp_model
elif model_type == "bloom":
import neural_speed.bloom_cpp as cpp_model
elif model_type == "chatglm":
import neural_speed.chatglm_cpp as cpp_model
elif model_type == "chatglm2" or model_type == "chatglm3":
import neural_speed.chatglm2_cpp as cpp_model
elif model_type == "baichuan":
import neural_speed.baichuan_cpp as cpp_model
elif model_type == "polyglot":
import neural_speed.polyglot_cpp as cpp_model
elif model_type == "qwen":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "mistral":
import neural_speed.mistral_cpp as cpp_model
elif model_type == "qwen2":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "phi":
import neural_speed.phi_cpp as cpp_model
elif model_type == "gemma":
import neural_speed.gemma_cpp as cpp_model
elif model_type == "stablelm":
import neural_speed.stablelm_cpp as cpp_model
elif model_type == "whisper":
import neural_speed.whisper_cpp as cpp_model
elif model_type == "mixtral":
import neural_speed.mixtral_cpp as cpp_model
else:
raise TypeError("Unsupported model type {}!".format(model_type))
return cpp_model
if model_type == "gptj":
import neural_speed.gptj_cpp as cpp_model
elif model_type == "falcon":
import neural_speed.falcon_cpp as cpp_model
elif model_type == "gptneox":
import neural_speed.gptneox_cpp as cpp_model
elif model_type == "dolly":
import neural_speed.dolly_cpp as cpp_model
elif model_type == "llama" or model_type == "llama2":
import neural_speed.llama_cpp as cpp_model
elif model_type == "mpt":
import neural_speed.mpt_cpp as cpp_model
elif model_type == "gpt_bigcode" or model_type == "starcoder":
import neural_speed.starcoder_cpp as cpp_model
elif model_type == "opt":
import neural_speed.opt_cpp as cpp_model
elif model_type == "bloom":
import neural_speed.bloom_cpp as cpp_model
elif model_type == "chatglm":
import neural_speed.chatglm_cpp as cpp_model
elif model_type == "chatglm2" or model_type == "chatglm3":
import neural_speed.chatglm2_cpp as cpp_model
elif model_type == "baichuan":
import neural_speed.baichuan_cpp as cpp_model
elif model_type == "polyglot":
import neural_speed.polyglot_cpp as cpp_model
elif model_type == "qwen":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "mistral":
import neural_speed.mistral_cpp as cpp_model
elif model_type == "qwen2":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "phi":
import neural_speed.phi_cpp as cpp_model
elif model_type == "gemma":
import neural_speed.gemma_cpp as cpp_model
elif model_type == "stablelm":
import neural_speed.stablelm_cpp as cpp_model
elif model_type == "whisper":
import neural_speed.whisper_cpp as cpp_model
elif model_type == "mixtral":
import neural_speed.mixtral_cpp as cpp_model
else:
raise TypeError("Unsupported model type {}!".format(model_type))
return cpp_model


def _get_model_config(model_name, model_hub="huggingface"):
if model_hub == "modelscope":
@@ -78,6 +81,7 @@ def _get_model_config(model_name, model_hub="huggingface"):
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
return config


def _get_model_type(model_config):
model_type = model_maps.get(model_config.model_type, model_config.model_type)
if model_type == "chatglm" and "chatglm2" in model_config._name_or_path:
@@ -98,21 +102,24 @@ def _get_model_type(model_config):

return model_type


def _filter_model_args(valid_args, **input_kwargs):
invalid_args = []
for k in input_kwargs.keys():
if k not in valid_args:
invalid_args.append(k)
for k in invalid_args:
input_kwargs.pop(k)
return input_kwargs


def get_cpp_module(model_name, model_hub="huggingface"):
model_config = _get_model_config(model_name, model_hub=model_hub)
model_type = _get_model_type(model_config)
cpp_module = _import_package(model_type)
return cpp_module


class Model:

def __init__(self):
@@ -123,6 +130,7 @@ def __init__(self):
self.generate_round = 0
self.max_request_num = -1
self.reinit_from_bin = False
self.tokenizer = None

def init(self,
model_name,
@@ -140,6 +148,11 @@ def init(self,
self.config = _get_model_config(model_name, model_hub=model_hub)
model_type = _get_model_type(self.config)
self.model_type = model_type
if self.__check_llama3():
print("The model_type: Llama3.")
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name)

if self.module is None:
self.module = _import_package(model_type)

@@ -165,8 +178,8 @@

quant_desc = model_config['quantization_config'].get("quant_method", None)
if quant_desc is None:
print("Error: No quant_method info in model config...")
exit(0)
print("Error: No quant_method info in model config...")
exit(0)
quant_bin = "{}/ne_{}_q_{}.bin".format(output_path, model_type, quant_desc)

if not use_quant:
@@ -294,11 +307,12 @@ def get_scratch_size_ratio(size):
else:
generate_kwargs["scratch_size_ratio"] = 35

valid_args = {"max_new_tokens", "n_batch", "ctx_size", "seed", "threads", "repetition_penalty",
"num_beams", "do_sample", "top_k", "top_p", "temperature", "min_new_tokens",
"length_penalty", "early_stopping", "n_keep", "n_discard", "shift_roped_k",
"batch_size","pad_token", "memory_dtype", "continuous_batching", "max_request_num",
"scratch_size_ratio"}
valid_args = {
"max_new_tokens", "n_batch", "ctx_size", "seed", "threads", "repetition_penalty", "num_beams", "do_sample",
"top_k", "top_p", "temperature", "min_new_tokens", "length_penalty", "early_stopping", "n_keep",
"n_discard", "shift_roped_k", "batch_size", "pad_token", "memory_dtype", "continuous_batching",
"max_request_num", "scratch_size_ratio"
}
self.model.init_model(model_path, **_filter_model_args(valid_args, **generate_kwargs))

def quant_model(self, model_type, model_path, out_path, **quant_kwargs):
@@ -314,6 +328,10 @@ def generate(self,
stopping_criteria=None,
**generate_kwargs):
batch_size = input_ids.shape[0]
if self.__check_llama3():
if int(input_ids[0][0]) != self.tokenizer.bos_token_id:
bos_token_tensor = torch.tensor([[self.tokenizer.bos_token_id]])
input_ids = torch.cat((bos_token_tensor, input_ids), dim=1)

max_new_tokens = generate_kwargs.get("max_new_tokens", -1)
self.reinit_from_bin = False
@@ -366,7 +384,7 @@ def generate(self,
if stopping_criteria is not None:
if stopping_criteria(torch.tensor(ret), None):
break
elif ret[0][-1] == self.__get_eos_id() or ret[0][-1] == self.__get_special_eos_id() or \
(max_new_tokens != -1 and out_count >= max_new_tokens):
break
if streamer:
@@ -380,6 +398,19 @@ def generate(self,
def is_token_end(self):
return self.model.is_token_end()

def __check_llama3(self):
if self.model_type == "llama" and self.config.vocab_size == vocab_size_map["llama3"]:
return True
return False

def __get_special_eos_id(self):
if self.__check_llama3():
eot_id = self.tokenizer("<|eot_id|>")["input_ids"][0]

return eot_id

return self.model.get_eos_id()

def __get_eos_id(self):
return self.model.get_eos_id()

@@ -438,7 +469,7 @@ def _cont_batching_input(self, input_ids, pad_token_id=None, padding_side="left"
if padding_side == "left":
del input_list[il][0:count]
elif padding_side == "right":
del input_list[il][len(input_list[il]) - count:]
else:
raise ValueError("padding_side must be 'left' or 'right'.")
assert input_list[il] != [], "there are all pad tokens in batch {}.".format(il)
@@ -465,17 +496,18 @@ def _get_model_input_list(self, input_ids, **kwargs):


class ModelServer:

def __init__(self, model_name, reponse_function, model_path, **server_kwargs):
if not os.path.exists(model_path):
raise ValueError("model file {} does not exist.".format(model_path))
self.module = get_cpp_module(model_name)
valid_args = {"max_new_tokens", "n_batch", "ctx_size", "seed", "threads", "repetition_penalty",
"num_beams", "do_sample", "top_k", "top_p", "temperature", "min_new_tokens",
"length_penalty", "early_stopping", "n_keep", "n_discard", "shift_roped_k",
"batch_size","pad_token", "memory_dtype", "continuous_batching", "max_request_num",
"scratch_size_ratio", "return_prompt", "print_log", "init_cb"}
self.cpp_server = self.module.ModelServer(reponse_function,
model_path,
valid_args = {
"max_new_tokens", "n_batch", "ctx_size", "seed", "threads", "repetition_penalty", "num_beams", "do_sample",
"top_k", "top_p", "temperature", "min_new_tokens", "length_penalty", "early_stopping", "n_keep",
"n_discard", "shift_roped_k", "batch_size", "pad_token", "memory_dtype", "continuous_batching",
"max_request_num", "scratch_size_ratio", "return_prompt", "print_log", "init_cb"
}
self.cpp_server = self.module.ModelServer(reponse_function, model_path,
**_filter_model_args(valid_args, **server_kwargs))

def issueQuery(self, index, token_ids):
@@ -484,4 +516,5 @@ def issueQuery(self, index, token_ids):
def Empty(self):
return self.cpp_server.Empty()


__all__ = ["get_cpp_module", "Model", "ModelServer"]
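For context, a minimal usage sketch of the Llama-3 path enabled above. The init() keyword arguments (weight_dtype, compute_dtype) follow the project README and are assumptions, not part of this diff; the BOS prepending and the <|eot_id|> stop condition are the behaviors added in generate():

from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Build a chat-formatted prompt so the Llama-3 special tokens are present.
messages = [{"role": "user", "content": "What is Neural Speed?"}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

model = Model()
# weight_dtype/compute_dtype are README-style kwargs, assumed here.
model.init(model_name, weight_dtype="int4", compute_dtype="int8")

# generate() prepends the BOS token if it is missing and, for Llama-3,
# also stops on the <|eot_id|> token via __get_special_eos_id().
streamer = TextStreamer(tokenizer, skip_prompt=True)
outputs = model.generate(input_ids, streamer=streamer, max_new_tokens=128)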
2 changes: 1 addition & 1 deletion neural_speed/application/quant_model.cpp
@@ -53,7 +53,7 @@ int main(int argc, char** argv) {
const std::string fname_inp = q_params.model_file;
const std::string fname_out = q_params.out_file;
ne_ftype ftype = quant_params_to_ftype(q_params);
printf("ne_ftype: %d\n", ftype);
printf("%s: quant_params_to_ftype: %d\n", __func__, ftype);
const int nthread = q_params.nthread;

const int64_t t_main_start_us = ne_time_us();
16 changes: 15 additions & 1 deletion neural_speed/convert/__init__.py
@@ -28,6 +28,8 @@
"phi-msft": "phi"
}

llama3_vocab_size = 128256


def convert_model(model, outfile, outtype="f32", format="NE", model_hub="huggingface", use_quantized_model=False):
if model_hub == "modelscope":
@@ -37,11 +39,23 @@ def convert_model(model, outfile, outtype="f32", format="NE", model_hub="hugging
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
model_type = model_maps.get(config.model_type, config.model_type)

cmd = []
if use_quantized_model:
path = Path(Path(__file__).parent.absolute(), "convert_quantized_{}.py".format(model_type))
else:
path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type))

if config.vocab_size == llama3_vocab_size:
path = Path(Path(__file__).parent.absolute(), "convert_llama3.py")
cmd.extend(["python", path])
cmd.extend(["--outfile", outfile])
cmd.extend(["--outtype", outtype])
cmd.extend([model])
cmd.extend(["--vocab-type", "bpe"])
print("cmd:", cmd)
subprocess.run(cmd)
return

cmd.extend(["python", path])
cmd.extend(["--outfile", outfile])
cmd.extend(["--outtype", outtype])
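To make the new llama3 routing concrete, this is roughly the command list that convert_model() assembles when config.vocab_size matches the Llama-3 vocabulary. The script name convert_llama3.py and the --vocab-type bpe flag come from the diff; the model id and output file below are illustrative:

import subprocess
from pathlib import Path

model = "meta-llama/Meta-Llama-3-8B-Instruct"  # illustrative; any checkpoint with vocab_size 128256
outfile = "ne_llama_f32.bin"                   # illustrative output name
outtype = "f32"

# Mirrors the branch above: convert_llama3.py sits next to the other convert scripts in the package.
path = Path("neural_speed/convert/convert_llama3.py")
cmd = ["python", str(path), "--outfile", outfile, "--outtype", outtype, model, "--vocab-type", "bpe"]
subprocess.run(cmd)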