From f63a4f7edc05c82a6c35c515713fa9bb0032eaaf Mon Sep 17 00:00:00 2001
From: Mathnerd314
Date: Fri, 1 Sep 2023 18:33:01 -0600
Subject: [PATCH 1/3] colab notebook: better error messages

---
 server.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/server.py b/server.py
index 4218967fc7..8eecdaa095 100644
--- a/server.py
+++ b/server.py
@@ -152,7 +152,8 @@ def create_interface():
         auth=auth or None,
         ssl_verify=False if (shared.args.ssl_keyfile or shared.args.ssl_certfile) else True,
         ssl_keyfile=shared.args.ssl_keyfile,
-        ssl_certfile=shared.args.ssl_certfile
+        ssl_certfile=shared.args.ssl_certfile,
+        debug=True if gr.utils.colab_check() else False
     )
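Gradio's debug=True makes launch() block the calling thread and echo server tracebacks into the notebook cell output, which is what surfaces errors when the UI is started from a Colab notebook; gr.utils.colab_check() is the helper Gradio itself uses to detect that environment. As a cosmetic aside (not part of the patch), the added keyword can also be written without the redundant ternary:

    debug=gr.utils.colab_check()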
From fe1acbcf99ac148cb61a1f6d48acbd7b16524070 Mon Sep 17 00:00:00 2001
From: Mathnerd314
Date: Fri, 1 Sep 2023 18:33:28 -0600
Subject: [PATCH 2/3] petals support

---
 modules/loaders.py | 36 ++++++++++++++++++++++++++++++++++++
 modules/models.py  | 22 +++++++++++++++++++---
 modules/shared.py  |  2 ++
 3 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index ab10e0a4de..f87da84c34 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -27,6 +27,9 @@
         'disable_exllama',
         'transformers_info'
     ],
+    'petals': [
+        'trust_remote_code',
+    ],
     'ExLlama_HF': [
         'gpu_split',
         'max_seq_len',
@@ -176,6 +179,39 @@
         'skip_special_tokens',
         'auto_max_new_tokens',
     },
+    'petals': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'typical_p',
+        'epsilon_cutoff',
+        'eta_cutoff',
+        'tfs',
+        'top_a',
+        'repetition_penalty',
+        'repetition_penalty_range',
+        'encoder_repetition_penalty',
+        'no_repeat_ngram_size',
+        'min_length',
+        'seed',
+        'do_sample',
+        'penalty_alpha',
+        'num_beams',
+        'length_penalty',
+        'early_stopping',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_file_row',
+        'grammar_string',
+        'guidance_scale',
+        'negative_prompt',
+        'ban_eos_token',
+        'custom_token_bans',
+        'add_bos_token',
+        'skip_special_tokens',
+        'auto_max_new_tokens',
+    },
     'ExLlama_HF': {
         'temperature',
         'top_p',
diff --git a/modules/models.py b/modules/models.py
index 5bd9db74db..9a89554226 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -64,6 +64,7 @@ def load_model(model_name, loader=None):
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'ctransformers': ctransformers_loader,
         'AutoAWQ': AutoAWQ_loader,
+        'petals': huggingface_loader,
     }

     if loader is None:
@@ -99,12 +100,14 @@ def load_tokenizer(model_name, model):
     path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
     if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
-    elif path_to_model.exists():
+    else:
+        model_id = path_to_model if path_to_model.exists() else model_name
+
         if shared.args.use_fast:
             logger.info('Loading the tokenizer with use_fast=True.')

         tokenizer = AutoTokenizer.from_pretrained(
-            path_to_model,
+            model_id,
             trust_remote_code=shared.args.trust_remote_code,
             use_fast=shared.args.use_fast
         )
@@ -113,8 +116,14 @@ def huggingface_loader(model_name):
+    if shared.args.loader == "petals":
+        path_to_model = model_name
+        import logging
+        httpx_logger = logging.getLogger('httpx')
+        httpx_logger.setLevel(logging.WARNING)
+    else:
+        path_to_model = Path(f'{shared.args.model_dir}/{model_name}')

-    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
     params = {
         'low_cpu_mem_usage': True,
         'trust_remote_code': shared.args.trust_remote_code,
         'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16
     }
@@ -124,6 +133,9 @@ def huggingface_loader(model_name):
     if 'chatglm' in model_name.lower():
         LoaderClass = AutoModel
+    elif shared.args.loader == "petals":
+        from petals import AutoDistributedModelForCausalLM
+        LoaderClass = AutoDistributedModelForCausalLM
     else:
         if config.to_dict().get('is_encoder_decoder', False):
             LoaderClass = AutoModelForSeq2SeqLM
@@ -131,6 +143,10 @@
         else:
             LoaderClass = AutoModelForCausalLM

+    if not any((shared.args.cpu, torch.cuda.is_available(), torch.backends.mps.is_available())):
+        logger.warning("torch.cuda.is_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.")
+        shared.args.cpu = True
+
     # Load the model in simple 16-bit mode by default
     if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama]):
         model = LoaderClass.from_pretrained(path_to_model, **params)
diff --git a/modules/shared.py b/modules/shared.py
index 427d923065..c091a04e17 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -234,6 +234,8 @@ def fix_loader_name(name):
        return 'ctransformers'
    elif name in ['autoawq', 'awq', 'auto-awq']:
        return 'AutoAWQ'
+    else:
+        return name


 def add_extension(name):
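For context, the petals code path added above amounts to loading a model through Petals' drop-in replacement for the Transformers auto class and then generating as usual. Below is a minimal standalone sketch of that flow, assuming the petals package is installed and using petals-team/StableBeluga2 purely as an example model ID (the patch does not pin any model):

    import logging

    import torch
    from transformers import AutoTokenizer
    from petals import AutoDistributedModelForCausalLM  # same import the patch adds

    # Quiet the per-request httpx logs, as the patch does inside huggingface_loader().
    logging.getLogger('httpx').setLevel(logging.WARNING)

    model_name = 'petals-team/StableBeluga2'  # example only
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoDistributedModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

    # Generation goes through the familiar Transformers API; the transformer blocks
    # are executed by remote peers in the Petals swarm.
    input_ids = tokenizer('Distributed inference is', return_tensors='pt').input_ids
    output_ids = model.generate(input_ids, max_new_tokens=20)
    print(tokenizer.decode(output_ids[0]))

Because AutoDistributedModelForCausalLM mirrors the AutoModelForCausalLM interface, the patch can route the 'petals' loader through the existing huggingface_loader() and reuse the generic sampling settings registered in loaders.py instead of adding a dedicated loader.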
From 50500f866cb3bc67c0fbd2bc686e08e3d925eeb2 Mon Sep 17 00:00:00 2001
From: Mathnerd314
Date: Sat, 14 Oct 2023 20:32:14 -0600
Subject: [PATCH 3/3] use gpu_split parameter for petals config

---
 modules/loaders.py |  1 +
 modules/models.py  | 13 ++++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index f87da84c34..e9e52f6cc4 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -29,6 +29,7 @@
     ],
     'petals': [
         'trust_remote_code',
+        'gpu_split',
     ],
     'ExLlama_HF': [
         'gpu_split',
diff --git a/modules/models.py b/modules/models.py
index 9a89554226..7999516f5e 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -2,6 +2,7 @@
 import os
 import re
 import time
+import json
 import traceback

 from pathlib import Path
@@ -129,6 +130,12 @@ def huggingface_loader(model_name):
         'trust_remote_code': shared.args.trust_remote_code,
         'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16
     }
+
+    if shared.args.loader == "petals" and shared.args.gpu_split:
+        model_config = json.loads(shared.args.gpu_split)
+        for key in model_config.keys():
+            params[key] = model_config[key]
+
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code'])

     if 'chatglm' in model_name.lower():
@@ -143,7 +150,7 @@ def huggingface_loader(model_name):
         else:
             LoaderClass = AutoModelForCausalLM

-    if not any((shared.args.cpu, torch.cuda.is_available(), torch.backends.mps.is_available())):
+    if not any((shared.args.cpu, shared.args.deepspeed, torch.cuda.is_available(), torch.backends.mps.is_available())):
         logger.warning("torch.cuda.is_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.")
         shared.args.cpu = True

@@ -165,10 +172,6 @@ def huggingface_loader(model_name):

     # Load with quantization and/or offloading
     else:
-        if not any((shared.args.cpu, torch.cuda.is_available(), torch.backends.mps.is_available())):
-            logger.warning('torch.cuda.is_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
-            shared.args.cpu = True
-
         if shared.args.cpu:
             params['torch_dtype'] = torch.float32
         else:
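The gpu_split reuse above means that, for the petals loader, the text typed into the existing gpu_split field is interpreted as a JSON object whose keys are forwarded verbatim as extra keyword arguments to LoaderClass.from_pretrained(). A small sketch of that merge follows; the example key and address are illustrative only (which keyword arguments Petals actually accepts depends on the installed version):

    import json

    # What huggingface_loader() has built before the merge.
    params = {
        'low_cpu_mem_usage': True,
        'trust_remote_code': False,
    }

    # Hypothetical value pasted into the gpu_split field while the petals loader
    # is selected; 'initial_peers' is a Petals client option for joining a
    # private swarm, and the address is a placeholder.
    gpu_split = '{"initial_peers": ["/ip4/127.0.0.1/tcp/31337"]}'

    if gpu_split:
        # Equivalent to the patch's explicit loop over model_config.keys().
        params.update(json.loads(gpu_split))

    print(params)
    # {'low_cpu_mem_usage': True, 'trust_remote_code': False,
    #  'initial_peers': ['/ip4/127.0.0.1/tcp/31337']}

Since the merge happens before AutoConfig.from_pretrained() is called, the extra keys only reach the model constructor via params; overloading gpu_split avoids adding a new UI field for Petals-specific options.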