diff --git a/.gitignore b/.gitignore
index 0b52b1293c..60a40d6743 100644
--- a/.gitignore
+++ b/.gitignore
@@ -196,3 +196,6 @@ db_dir_UserData

 # Embeded browser cache and other
 apps/stable_diffusion/web/EBWebView/
+
+# Llama2 tokenizer configs
+llama2_tokenizer_configs/
diff --git a/apps/language_models/scripts/vicuna.py b/apps/language_models/scripts/vicuna.py
index 0c5c2c335d..8961c21571 100644
--- a/apps/language_models/scripts/vicuna.py
+++ b/apps/language_models/scripts/vicuna.py
@@ -110,12 +110,6 @@
     choices=["vicuna", "llama2_7b", "llama2_13b", "llama2_70b"],
     help="Specify which model to run.",
 )
-parser.add_argument(
-    "--hf_auth_token",
-    type=str,
-    default=None,
-    help="Specify your own huggingface authentication tokens for models like Llama2.",
-)
 parser.add_argument(
     "--cache_vicunas",
     default=False,
@@ -1217,7 +1211,6 @@ def __init__(
         self,
         model_name,
         hf_model_path="TheBloke/vicuna-7B-1.1-HF",
-        hf_auth_token: str = None,
         max_num_tokens=512,
         device="cpu",
         precision="int8",
@@ -1237,11 +1230,6 @@
             max_num_tokens,
             extra_args_cmd=extra_args_cmd,
         )
-        if "llama2" in self.model_name and hf_auth_token == None:
-            raise ValueError(
-                "HF auth token required. Pass it using --hf_auth_token flag."
-            )
-        self.hf_auth_token = hf_auth_token
         if self.model_name == "llama2_7b":
             self.hf_model_path = "meta-llama/Llama-2-7b-chat-hf"
         elif self.model_name == "llama2_13b":
@@ -1276,18 +1264,26 @@ def get_model_path(self, suffix="mlir"):
         )

     def get_tokenizer(self):
-        kwargs = {"use_auth_token": self.hf_auth_token}
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.hf_model_path,
-            use_fast=False,
-            **kwargs,
-        )
+        local_tokenizer_path = Path(Path.cwd(), "llama2_tokenizer_configs")
+        local_tokenizer_path.mkdir(parents=True, exist_ok=True)
+        tokenizer_files_to_download = [
+            "config.json",
+            "special_tokens_map.json",
+            "tokenizer.model",
+            "tokenizer_config.json",
+        ]
+        for tokenizer_file in tokenizer_files_to_download:
+            download_public_file(
+                f"gs://shark_tank/llama2_tokenizer/{tokenizer_file}",
+                Path(local_tokenizer_path, tokenizer_file),
+                single_file=True,
+            )
+        tokenizer = AutoTokenizer.from_pretrained(str(local_tokenizer_path))
         return tokenizer

     def get_src_model(self):
         kwargs = {
             "torch_dtype": torch.float,
-            "use_auth_token": self.hf_auth_token,
         }
         vicuna_model = AutoModelForCausalLM.from_pretrained(
             self.hf_model_path,
@@ -1460,8 +1456,6 @@ def compile(self):
                     self.hf_model_path,
                     self.precision,
                     self.weight_group_size,
-                    self.model_name,
-                    self.hf_auth_token,
                 )
                 print(f"[DEBUG] generating torchscript graph")
                 is_f16 = self.precision in ["fp16", "int4"]
@@ -1553,24 +1547,18 @@ def compile(self):
                         self.hf_model_path,
                         self.precision,
                         self.weight_group_size,
-                        self.model_name,
-                        self.hf_auth_token,
                     )
                 elif self.model_name == "llama2_70b":
                     model = SecondVicuna70B(
                         self.hf_model_path,
                         self.precision,
                         self.weight_group_size,
-                        self.model_name,
-                        self.hf_auth_token,
                     )
                 else:
                     model = SecondVicuna7B(
                         self.hf_model_path,
                         self.precision,
                         self.weight_group_size,
-                        self.model_name,
-                        self.hf_auth_token,
                     )
                 print(f"[DEBUG] generating torchscript graph")
                 is_f16 = self.precision in ["fp16", "int4"]
@@ -1809,7 +1797,6 @@ def create_prompt(model_name, history):
     )
     vic = UnshardedVicuna(
         model_name=args.model_name,
-        hf_auth_token=args.hf_auth_token,
         device=args.device,
         precision=args.precision,
         vicuna_mlir_path=vic_mlir_path,
diff --git a/apps/language_models/src/model_wrappers/vicuna_model.py b/apps/language_models/src/model_wrappers/vicuna_model.py
index e857ef9697..f4b98ec5f7 100644
--- a/apps/language_models/src/model_wrappers/vicuna_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_model.py
@@ -8,13 +8,9 @@ def __init__(
         model_path,
         precision="fp32",
         weight_group_size=128,
-        model_name="vicuna",
-        hf_auth_token: str = None,
     ):
         super().__init__()
         kwargs = {"torch_dtype": torch.float32}
-        if "llama2" in model_name:
-            kwargs["use_auth_token"] = hf_auth_token
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path, low_cpu_mem_usage=True, **kwargs
         )
@@ -57,13 +53,9 @@ def __init__(
         model_path,
         precision="fp32",
         weight_group_size=128,
-        model_name="vicuna",
-        hf_auth_token: str = None,
     ):
         super().__init__()
         kwargs = {"torch_dtype": torch.float32}
-        if "llama2" in model_name:
-            kwargs["use_auth_token"] = hf_auth_token
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path, low_cpu_mem_usage=True, **kwargs
         )
@@ -303,13 +295,9 @@ def __init__(
         model_path,
         precision="int8",
         weight_group_size=128,
-        model_name="vicuna",
-        hf_auth_token: str = None,
     ):
         super().__init__()
         kwargs = {"torch_dtype": torch.float32}
-        if "llama2" in model_name:
-            kwargs["use_auth_token"] = hf_auth_token
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path, low_cpu_mem_usage=True, **kwargs
         )
@@ -596,13 +584,9 @@ def __init__(
         model_path,
         precision="fp32",
         weight_group_size=128,
-        model_name="vicuna",
-        hf_auth_token: str = None,
     ):
         super().__init__()
         kwargs = {"torch_dtype": torch.float32}
-        if "llama2" in model_name:
-            kwargs["use_auth_token"] = hf_auth_token
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path, low_cpu_mem_usage=True, **kwargs
         )
diff --git a/apps/stable_diffusion/web/ui/stablelm_ui.py b/apps/stable_diffusion/web/ui/stablelm_ui.py
index 1ad9dd34f3..c8f3482887 100644
--- a/apps/stable_diffusion/web/ui/stablelm_ui.py
+++ b/apps/stable_diffusion/web/ui/stablelm_ui.py
@@ -186,7 +186,6 @@ def chat(
         vicuna_model = UnshardedVicuna(
             model_name,
             hf_model_path=model_path,
-            hf_auth_token=args.hf_auth_token,
             device=device,
             precision=precision,
             max_num_tokens=max_toks,
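
The core change above replaces Hugging Face token authentication with a public tokenizer download in `get_tokenizer()`. Below is a minimal standalone sketch of that new flow for reference; it assumes `download_public_file` is importable as `from shark.shark_downloader import download_public_file` (the helper already used elsewhere in vicuna.py), and it reuses the bucket path and file list from the diff. This is an illustration, not part of the patch.

```python
# Minimal sketch of the new Llama2 tokenizer bootstrap (mirrors get_tokenizer()).
# Assumption: download_public_file lives in shark.shark_downloader and accepts
# (gs_url, destination_path, single_file=True), as used in the diff above.
from pathlib import Path

from transformers import AutoTokenizer
from shark.shark_downloader import download_public_file  # assumed import path

TOKENIZER_FILES = [
    "config.json",
    "special_tokens_map.json",
    "tokenizer.model",
    "tokenizer_config.json",
]


def fetch_llama2_tokenizer(dest_dir: Path = Path.cwd() / "llama2_tokenizer_configs"):
    """Download the public Llama2 tokenizer configs and load them from disk."""
    dest_dir.mkdir(parents=True, exist_ok=True)
    for fname in TOKENIZER_FILES:
        download_public_file(
            f"gs://shark_tank/llama2_tokenizer/{fname}",
            dest_dir / fname,
            single_file=True,
        )
    # No use_auth_token needed: the tokenizer is loaded from the local directory.
    return AutoTokenizer.from_pretrained(str(dest_dir))
```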