diff --git a/modules/loaders.py b/modules/loaders.py
index ab10e0a4de..e9e52f6cc4 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -27,6 +27,10 @@
         'disable_exllama',
         'transformers_info'
     ],
+    'petals': [
+        'trust_remote_code',
+        'gpu_split',
+    ],
     'ExLlama_HF': [
         'gpu_split',
         'max_seq_len',
@@ -176,6 +180,39 @@
         'skip_special_tokens',
         'auto_max_new_tokens',
     },
+    'petals': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'typical_p',
+        'epsilon_cutoff',
+        'eta_cutoff',
+        'tfs',
+        'top_a',
+        'repetition_penalty',
+        'repetition_penalty_range',
+        'encoder_repetition_penalty',
+        'no_repeat_ngram_size',
+        'min_length',
+        'seed',
+        'do_sample',
+        'penalty_alpha',
+        'num_beams',
+        'length_penalty',
+        'early_stopping',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_file_row',
+        'grammar_string',
+        'guidance_scale',
+        'negative_prompt',
+        'ban_eos_token',
+        'custom_token_bans',
+        'add_bos_token',
+        'skip_special_tokens',
+        'auto_max_new_tokens',
+    },
     'ExLlama_HF': {
         'temperature',
         'top_p',
diff --git a/modules/models.py b/modules/models.py
index 5bd9db74db..7999516f5e 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -2,6 +2,7 @@
 import os
 import re
 import time
+import json
 import traceback
 from pathlib import Path
 
@@ -64,6 +65,7 @@
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'ctransformers': ctransformers_loader,
         'AutoAWQ': AutoAWQ_loader,
+        'petals': huggingface_loader,
     }
 
     if loader is None:
@@ -99,12 +101,14 @@ def load_tokenizer(model_name, model):
     path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
     if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
-    elif path_to_model.exists():
+    else:
+        model_id = path_to_model if path_to_model.exists() else model_name
+
         if shared.args.use_fast:
             logger.info('Loading the tokenizer with use_fast=True.')
 
         tokenizer = AutoTokenizer.from_pretrained(
-            path_to_model,
+            model_id,
             trust_remote_code=shared.args.trust_remote_code,
             use_fast=shared.args.use_fast
         )
@@ -113,17 +117,32 @@ def huggingface_loader(model_name):
+    if shared.args.loader == "petals":
+        path_to_model = model_name
+        import logging
+        httpx_logger = logging.getLogger('httpx')
+        httpx_logger.setLevel(logging.WARNING)
+    else:
+        path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
-    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
     params = {
         'low_cpu_mem_usage': True,
         'trust_remote_code': shared.args.trust_remote_code,
         'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16
     }
+
+    if shared.args.loader == "petals" and shared.args.gpu_split:
+        model_config = json.loads(shared.args.gpu_split)
+        for key in model_config.keys():
+            params[key] = model_config[key]
+
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code'])
 
     if 'chatglm' in model_name.lower():
         LoaderClass = AutoModel
+    elif shared.args.loader == "petals":
+        from petals import AutoDistributedModelForCausalLM
+        LoaderClass = AutoDistributedModelForCausalLM
     else:
         if config.to_dict().get('is_encoder_decoder', False):
             LoaderClass = AutoModelForSeq2SeqLM
@@ -131,6 +150,10 @@
         else:
             LoaderClass = AutoModelForCausalLM
 
+    if not any((shared.args.cpu, shared.args.deepspeed, torch.cuda.is_available(), torch.backends.mps.is_available())):
+        logger.warning("torch.cuda.is_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.")
+        shared.args.cpu = True
+
     # Load the model in simple 16-bit mode by default
     if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama]):
         model = LoaderClass.from_pretrained(path_to_model, **params)
@@ -149,10 +172,6 @@
 
     # Load with quantization and/or offloading
     else:
-        if not any((shared.args.cpu, torch.cuda.is_available(), torch.backends.mps.is_available())):
-            logger.warning('torch.cuda.is_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
-            shared.args.cpu = True
-
         if shared.args.cpu:
             params['torch_dtype'] = torch.float32
         else:
diff --git a/modules/shared.py b/modules/shared.py
index 427d923065..c091a04e17 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -234,6 +234,8 @@ def fix_loader_name(name):
         return 'ctransformers'
     elif name in ['autoawq', 'awq', 'auto-awq']:
         return 'AutoAWQ'
+    else:
+        return name
 
 
 def add_extension(name):
diff --git a/server.py b/server.py
index 4218967fc7..8eecdaa095 100644
--- a/server.py
+++ b/server.py
@@ -152,7 +152,8 @@ def create_interface():
         auth=auth or None,
         ssl_verify=False if (shared.args.ssl_keyfile or shared.args.ssl_certfile) else True,
         ssl_keyfile=shared.args.ssl_keyfile,
-        ssl_certfile=shared.args.ssl_certfile
+        ssl_certfile=shared.args.ssl_certfile,
+        debug=True if gr.utils.colab_check() else False
     )
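
For reference, the petals code path introduced above reduces to the following standalone sketch. It is an illustration only, not part of the patch: the model id and the example value stood in for shared.args.gpu_split are hypothetical, and it assumes the petals package is installed and a public swarm serving the model is reachable.

import json
import logging

import torch
from petals import AutoDistributedModelForCausalLM
from transformers import AutoTokenizer

# Silence petals' per-request httpx logging, as the patched loader does.
logging.getLogger('httpx').setLevel(logging.WARNING)

# With the petals loader, the model name is used directly as a Hugging Face
# repo id instead of a local path under shared.args.model_dir.
model_name = 'petals-team/StableBeluga2'  # example id, not taken from the patch

# The patch repurposes shared.args.gpu_split as a JSON object of extra
# from_pretrained() keyword arguments; this value is purely illustrative.
gpu_split = '{"low_cpu_mem_usage": true}'

params = {
    'trust_remote_code': False,
    'torch_dtype': torch.float16,
}
if gpu_split:
    params.update(json.loads(gpu_split))

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoDistributedModelForCausalLM.from_pretrained(model_name, **params)

# Generation runs against the distributed swarm with the same call
# signature as a local transformers model.
inputs = tokenizer('A distributed swarm is', return_tensors='pt')['input_ids']
print(tokenizer.decode(model.generate(inputs, max_new_tokens=5)[0]))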